From 2158f4a9c891f8a52c13a08b28ee55341b6ac8c4 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Mon, 10 Sep 2018 11:29:40 -0700 Subject: [PATCH 001/237] add export import test to TestJitGenerated (#10982) Summary: Checking assertExportImport for all of the generated test jit tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10982 Differential Revision: D9636935 Pulled By: eellison fbshipit-source-id: f3f1ce77d454848098f2ac7e0fa18bf8564890be --- test/test_jit.py | 28 ++++++++++++++++++---------- torch/csrc/jit/export.cpp | 5 +++-- torch/csrc/jit/import.cpp | 2 ++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 5eb7d4649bd763..342087e277292c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -351,8 +351,6 @@ def allSum(vs): return ge - -class TestJit(JitTestCase): def assertExportImport(self, trace, inputs): graph = trace if isinstance(trace, torch._C.Graph) else trace.graph() m = torch.jit.ScriptModule() @@ -361,6 +359,9 @@ def assertExportImport(self, trace, inputs): self.assertEqual(m.forward(*inputs), m_import.forward(*inputs)) + +class TestJit(JitTestCase): + def test_simple(self): x = torch.tensor([0.4], requires_grad=True) y = torch.tensor([0.7], requires_grad=True) @@ -7164,10 +7165,12 @@ def new_fn(*tensors_): return new_fn, [arg for arg in args if isinstance(arg, torch.Tensor)] -def create_traced_fn(fn): +# create a trace function from input fn +def create_traced_fn(self, fn): def traced_fn(*inputs, **kwargs): fn_tensors, inputs_tensors = partial_apply_nontensors(fn, inputs, **kwargs) traced = torch.jit.trace(fn_tensors, inputs_tensors) + self.assertExportImport(traced.graph, inputs_tensors) return traced(*inputs_tensors) return traced_fn @@ -7177,7 +7180,10 @@ def the_method({}): ''' -def create_script_fn(method_name, func_type, output_process_fn): +# create a script function from (name, func_type, output_process_fn), +# returns a function takes in (args, kwargs) and runs the compiled function and +# then applies the post process fn to the outputs +def create_script_fn(self, method_name, func_type, output_process_fn): def script_fn(*args, **kwargs): formals = [] tensors = [] @@ -7204,6 +7210,8 @@ def script_fn(*args, **kwargs): script = script_template.format(', '.join(formals), call) CU = torch.jit.CompilationUnit(script) + self.assertExportImport(CU.the_method.graph, tensors) + return output_process_fn(CU.the_method(*tensors)) return script_fn @@ -7272,7 +7280,7 @@ def clone_inputs(requires_grad): self.assertTrue(torch.allclose(g2, g2_test, atol=5e-4, rtol=1e-4)) -class TestJitGenerated(TestCase): +class TestJitGenerated(JitTestCase): pass @@ -7560,12 +7568,12 @@ def fn(*inputs, **kwargs): if not is_inplace and name not in EXCLUDE_GRADCHECK and not exclude_tensor_method(name, test_name): if test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(fn), + check_against_reference(self, create_traced_fn(self, fn), fn, (self_variable,) + args_variable, kwargs_variable) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(name, 'method', output_process_fn), + create_script_fn(self, name, 'method', output_process_fn), fn, (self_variable,) + args_variable, kwargs_variable) # functional interface tests @@ -7578,11 +7586,11 @@ def fn(*inputs, **kwargs): f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(fn), fn, f_args_variable, 
kwargs_variable) + check_against_reference(self, create_traced_fn(self, fn), fn, f_args_variable, kwargs_variable) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(name, 'functional', output_process_fn), + create_script_fn(self, name, 'functional', output_process_fn), fn, f_args_variable, kwargs_variable) check(name) @@ -7620,7 +7628,7 @@ def fn(*inputs, **kwargs): if test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(name, 'nn_functional', output_process_fn), + create_script_fn(self, name, 'nn_functional', output_process_fn), fn, f_args_variable, kwargs_variable) post_add_test(test_name, skipTestIf, do_test) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index e5e6cf1960df60..c4ef14f191f0ee 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -557,8 +557,9 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("IntType"); } else if (kind == TypeKind::NoneType) { type_proto->set_denotation("NoneType"); - } - else { + } else if (kind == TypeKind::GeneratorType) { + type_proto->set_denotation("GeneratorType"); + } else { throw std::runtime_error("unexpected type kind"); } } diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index ba6fe9fe0a27a4..f868f1ff555ad5 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -258,6 +258,8 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return IntType::get(); } else if (kind == "NoneType") { return NoneType::get(); + } else if (kind == "GeneratorType") { + return GeneratorType::get(); } else { throw std::runtime_error("unexpected string for type kind"); } From 040d75d455cb2963de9a147001f11f63459c0620 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 10 Sep 2018 12:30:42 -0700 Subject: [PATCH 002/237] Add option to use CUDA memory leak testing as a context manager (#11380) Summary: cc SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/11380 Reviewed By: ezyang Differential Revision: D9705877 Pulled By: zou3519 fbshipit-source-id: 02470c25236f57fa02f4ac9d7ed63d38a6355db2 --- test/common.py | 56 ++++++++++++++++++++++++++++++++--------------- test/test_cuda.py | 7 ++++++ 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/test/common.py b/test/common.py index e65795967deb7a..04986fcf5af57b 100644 --- a/test/common.py +++ b/test/common.py @@ -128,14 +128,6 @@ def dec(fn): return dec -def get_cuda_memory_usage(): - # we don't need CUDA synchronize because the statistics are not tracked at - # actual freeing, but at when marking the block as free. - num_devices = torch.cuda.device_count() - gc.collect() - return tuple(torch.cuda.memory_allocated(i) for i in range(num_devices)) - - def suppress_warnings(fn): @wraps(fn) def wrapper(*args, **kwargs): @@ -215,6 +207,38 @@ def is_iterable(obj): return False +class CudaMemoryLeakCheck(): + def __init__(self, testcase, name=None): + self.name = testcase.id() if name is None else name + self.testcase = testcase + + # initialize context & RNG to prevent false positive detections + # when the test is the first to initialize those + from common_cuda import initialize_cuda_context_rng + initialize_cuda_context_rng() + + @staticmethod + def get_cuda_memory_usage(): + # we don't need CUDA synchronize because the statistics are not tracked at + # actual freeing, but at when marking the block as free. 
+ num_devices = torch.cuda.device_count() + gc.collect() + return tuple(torch.cuda.memory_allocated(i) for i in range(num_devices)) + + def __enter__(self): + self.befores = self.get_cuda_memory_usage() + + def __exit__(self, exec_type, exec_value, traceback): + # Don't check for leaks if an exception was thrown + if exec_type is not None: + return + afters = self.get_cuda_memory_usage() + for i, (before, after) in enumerate(zip(self.befores, afters)): + self.testcase.assertEqual( + before, after, '{} leaked {} bytes CUDA memory on device {}'.format( + self.name, after - before, i)) + + class TestCase(unittest.TestCase): precision = 1e-5 maxDiff = None @@ -232,12 +256,12 @@ def __init__(self, method_name='runTest'): from common_cuda import TEST_CUDA fullname = self.id().lower() # class_name.method_name if TEST_CUDA and ('gpu' in fullname or 'cuda' in fullname): - # initialize context & RNG to prevent false positive detections - # when the test is the first to initialize those - from common_cuda import initialize_cuda_context_rng - initialize_cuda_context_rng() setattr(self, method_name, self.wrap_with_cuda_memory_check(test_method)) + def assertLeaksNoCudaTensors(self, name=None): + name = self.id() if name is None else name + return CudaMemoryLeakCheck(self, name) + def wrap_with_cuda_memory_check(self, method): # Assumes that `method` is the tested function in `self`. # NOTE: Python Exceptions (e.g., unittest.Skip) keeps objects in scope @@ -247,12 +271,8 @@ def wrap_with_cuda_memory_check(self, method): # call in try-finally and always do the check. @wraps(method) def wrapper(self, *args, **kwargs): - befores = get_cuda_memory_usage() - method(*args, **kwargs) - afters = get_cuda_memory_usage() - for i, (before, after) in enumerate(zip(befores, afters)): - self.assertEqual(before, after, '{} leaked {} bytes CUDA memory on device {}'.format( - self.id(), after - before, i)) + with self.assertLeaksNoCudaTensors(): + method(*args, **kwargs) return types.MethodType(wrapper, self) def setUp(self): diff --git a/test/test_cuda.py b/test/test_cuda.py index 1ca7155dd098c3..71e4a8220c6e68 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1955,6 +1955,13 @@ def leak_gpu1(): with self.assertRaisesRegex(AssertionError, r"leaked \d+ bytes CUDA memory on device 1"): leak_gpu1() + def test_cuda_memory_leak_detection_propagates_errors(self): + with self.assertRaisesRegex(RuntimeError, r"The size of tensor a \(3\) must match"): + with self.assertLeaksNoCudaTensors(): + x = torch.randn(3, 1, device='cuda') + y = torch.randn(2, 1, device='cuda') + z = x + y + def load_ignore_file(): from os.path import join, dirname From ce6906b05116a0ef9dfd3e52284c0fcf77186129 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:22 -0700 Subject: [PATCH 003/237] Narrowing Blob (#11167) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11167 Narrow the Blob API as preparation for merging Blob/IValue - get rid of templated IsType and Operator::InputIsType / OutputIsType - Use 'using' instead of 'typedef' for DestroyCall (just for readability) Reviewed By: ezyang Differential Revision: D9623916 fbshipit-source-id: 952f0b0cf5a525094b02e8d2798dd57a56a9e1d8 --- .../contrib/tensorrt/tensorrt_tranformer.cc | 4 ++-- caffe2/core/blob.h | 24 +++++++------------ caffe2/core/blob_gpu_test.cc | 6 ++--- caffe2/core/blob_test.cc | 16 ++++++------- caffe2/core/operator.h | 22 +++++------------ caffe2/ideep/operators/concat_split_op.cc | 2 +- 
.../ideep/operators/operator_fallback_ideep.h | 2 +- caffe2/ideep/operators/utility_ops.cc | 2 +- caffe2/mkl/operators/operator_fallback_mkl.h | 2 +- caffe2/mkl/operators/packed_fc_op.cc | 2 +- caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 2 +- caffe2/mpi/mpi_ops.h | 2 +- caffe2/observers/profile_observer_gpu.cc | 8 +++---- caffe2/operators/ensure_cpu_output_op.h | 4 ++-- caffe2/operators/if_op.h | 2 +- caffe2/operators/onnxifi_op.cc | 2 +- caffe2/operators/operator_fallback_gpu.h | 4 ++-- caffe2/operators/string_ops_test.cc | 2 +- caffe2/operators/stylizer_ops.cc | 2 +- caffe2/operators/utility_ops.h | 8 +++---- caffe2/operators/while_op.h | 2 +- caffe2/predictor/predictor.cc | 2 +- caffe2/sgd/adam_op.h | 2 +- caffe2/sgd/fp16_momentum_sgd_op.h | 4 ++-- caffe2/sgd/fp32_momentum_sgd_op.h | 4 ++-- caffe2/sgd/iter_op.h | 2 +- caffe2/sgd/momentum_sgd_op.h | 8 +++---- caffe2/sgd/yellowfin_op.h | 2 +- 28 files changed, 64 insertions(+), 80 deletions(-) diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 716ff7a70814e0..3612d8b46f1f8d 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->template IsType(CPU)) { + if (blob->IsTensorType(CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->template IsType(CUDA)) { + } else if (blob->IsTensorType(CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index f085ee23995bd7..1df7bc9beafcb8 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -23,9 +23,9 @@ namespace caffe2 { * properly when the blob is deallocated or re-allocated with a new type. A blob * could contain anything, although the most common case is to contain a Tensor. */ -class CAFFE2_API Blob { +class CAFFE2_API Blob final { public: - typedef void (*DestroyCall)(void*); + using DestroyCall = void(void*); /** * Initializes an empty Blob. @@ -63,14 +63,8 @@ class CAFFE2_API Blob { return meta_.Match(); } - // TODO(jerryzh): Remove template - template - bool IsType(DeviceType device_type) const { - static_assert( - std::is_same::value, - "IsType(DeviceType) only available on " - "Tensor types."); - bool is_match = meta_.Match(); + bool IsTensorType(DeviceType device_type) const { + bool is_match = meta_.Match(); auto* tensor = static_cast(pointer_); if (is_match && tensor && tensor->GetDeviceType() == device_type) { return true; @@ -149,7 +143,7 @@ class CAFFE2_API Blob { } inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsType(device_type)) { + if (IsTensorType(device_type)) { return static_cast(pointer_); } else { VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() @@ -178,7 +172,7 @@ class CAFFE2_API Blob { } inline void* - Reset(void* allocated, const TypeMeta& meta, const DestroyCall& destroy) { + Reset(void* allocated, const TypeMeta& meta, DestroyCall* destroy) { if (pointer_ && destroy_) { destroy_(pointer_); } @@ -192,8 +186,8 @@ class CAFFE2_API Blob { * Releases the ownership, if any, this Blob has on the underlying pointer. 
* The user is then responsible for freeing the data if needed */ - inline DestroyCall Release() { - DestroyCall d = destroy_; + inline DestroyCall* Release() { + DestroyCall* d = destroy_; destroy_ = nullptr; return d; } @@ -289,7 +283,7 @@ class CAFFE2_API Blob { } TypeMeta meta_; void* pointer_ = nullptr; - DestroyCall destroy_ = nullptr; + DestroyCall* destroy_ = nullptr; AT_DISABLE_COPY_AND_ASSIGN(Blob); }; diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 0f0fcb54906dcd..29bf0c3bc52ae6 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -148,7 +148,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CUDA)); \ + EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(blob.Deserialize(serialized)); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString())); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 97d17c6e5924f3..8103071c81ee26 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsType(CPU)); + EXPECT_TRUE(blob.IsTensorType(CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); 
EXPECT_EQ(new_tensor.dim(0), 2); @@ -724,7 +724,7 @@ TEST(TensorTest, float16) { } Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsType(CPU)); + EXPECT_TRUE(new_blob->IsTensorType(CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 2def93f0b51d08..e75681ff3a9df6 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -143,36 +143,26 @@ class CAFFE2_API OperatorBase : public Observable { inline bool InputIsType(int idx) { static_assert( !std::is_same::value, - "You should use InputIsType(int, DeviceType) for " + "You should use InputIsTensorType(int, DeviceType) for " "Tensor."); return inputs_.at(idx)->template IsType(); } - template - inline bool InputIsType(int idx, DeviceType device_type) { - static_assert( - std::is_same::value, - "InputIsType(idx, DeviceType) only available on " - "Tensor types."); - return inputs_.at(idx)->template IsType(device_type); + inline bool InputIsTensorType(int idx, DeviceType device_type) { + return inputs_.at(idx)->IsTensorType(device_type); } template inline bool OutputIsType(int idx) { static_assert( !std::is_same::value, - "You should use OutputIsType(int, DeviceType) for " + "You should use OutputIsTensorType(int, DeviceType) for " "Tensor."); return outputs_.at(idx)->template IsType(); } - template - inline bool OutputIsType(int idx, DeviceType type) { - static_assert( - std::is_same::value, - "OutputIsType(idx, DeviceType) only available on " - "Tensor types."); - return outputs_.at(idx)->template IsType(type); + inline bool OutputIsTensorType(int idx, DeviceType type) { + return outputs_.at(idx)->IsTensorType(type); } inline int InputSize() const { diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 25d4e16d2f9e7a..311c6446184a87 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,7 +33,7 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 9ae2323442120a..08e6de2ae3f0dc 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 63bd0da7cb5cb6..626568a989b939 100644 --- 
a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.template IsType(CPU)) { + if (input_blob.IsTensorType(CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h index ce4c85d2c231e0..6d9713b74612d8 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc index 1f3231dc521f5d..0ed93cf061070c 100644 --- a/caffe2/mkl/operators/packed_fc_op.cc +++ b/caffe2/mkl/operators/packed_fc_op.cc @@ -49,7 +49,7 @@ class PackedFCOp final : public Operator { // Check out what is the passed in format. const MKLPackedMatrix* packed_matrix = nullptr; - if (OperatorBase::InputIsType(1, CPU)) { + if (OperatorBase::InputIsTensorType(1, CPU)) { const auto& W = Input(1); CAFFE_ENFORCE_EQ(W.ndim(), 2); CAFFE_ENFORCE_EQ(W.dim32(0), N); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 755e1b5a57b8a9..2238d7af08dda6 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType(CPU) || + if (!noiseBlob->IsTensorType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h index b84de851a9e948..8657c107ed0f33 100644 --- a/caffe2/mpi/mpi_ops.h +++ b/caffe2/mpi/mpi_ops.h @@ -36,7 +36,7 @@ class MPIBroadcastOp final : public Operator { bool RunOnDevice() override { MPI_Comm comm = OperatorBase::Input(0).comm(); CAFFE_ENFORCE( - OperatorBase::OutputIsType(0, Context::GetDeviceType()), + OperatorBase::OutputIsTensorType(0, Context::GetDeviceType()), "Output is of wrong type."); auto* output = Output(0); // Make sure that output is already allocated. 
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index 5d40916be09346..bf4e20b7904711 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -26,12 +26,12 @@ void ProfileOperatorObserver::Dump() const { LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type() << " op#" << getId() << " ---------"; for (int i = 0; i < subject_->InputSize(); ++i) { - if (subject_->InputIsType(i, CPU)) { + if (subject_->InputIsTensorType(i, CPU)) { const auto& tensor = subject_->Input(i, CPU); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor); - } else if (subject_->InputIsType(i, CUDA)) { + } else if (subject_->InputIsTensorType(i, CUDA)) { const auto& tensor = subject_->Input(i, CUDA); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); @@ -46,12 +46,12 @@ void ProfileOperatorObserver::Dump() const { } for (int o = 0; o < subject_->OutputSize(); ++o) { - if (subject_->OutputIsType(o, CPU)) { + if (subject_->OutputIsTensorType(o, CPU)) { auto* tensor = subject_->Output(o, CPU); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor); - } else if (subject_->OutputIsType(o, CUDA)) { + } else if (subject_->OutputIsTensorType(o, CUDA)) { auto* tensor = subject_->Output(o, CUDA); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h index 08207644f7f094..3b8cb439f12066 100644 --- a/caffe2/operators/ensure_cpu_output_op.h +++ b/caffe2/operators/ensure_cpu_output_op.h @@ -15,9 +15,9 @@ class EnsureCPUOutputOp : public Operator { : Operator(operator_def, ws) {} bool RunOnDevice() override { - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return CopyWithContext(); - } else if (this->template InputIsType(0, Context::GetDeviceType())) { + } else if (this->InputIsTensorType(0, Context::GetDeviceType())) { // CUDA Context will go this branch return CopyWithContext(); } else { diff --git a/caffe2/operators/if_op.h b/caffe2/operators/if_op.h index cff2a620ef4694..e76fea0ee8f21a 100644 --- a/caffe2/operators/if_op.h +++ b/caffe2/operators/if_op.h @@ -32,7 +32,7 @@ class IfOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in If operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 4e9f6f2ac280f1..aee7fff4bc3391 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->template IsType(CPU), + blob->IsTensorType(CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index b0ee9611a69042..8ef39e7c0e78d1 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -64,7 +64,7 @@ class GPUFallbackOpEx final : public Operator { bool RunOnDevice() override { bool need_sync = false; for (int i = 0; i < 
InputSize(); ++i) { - if (this->template InputIsType(i, CUDA)) { + if (this->InputIsTensorType(i, CUDA)) { local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( Input(i), &context_); need_sync = true; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index ece70ffd2425e1..c9ba13efb50258 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsType(CPU)); + EXPECT_TRUE(output->IsTensorType(CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index 8f1e0895a28596..a6d395fe9ba647 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,7 +82,7 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsType(CPU)) { + if (!noiseBlob->IsTensorType(CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. auto* t = noiseBlob->GetMutableTensor(CPU); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index c9564dfa74a86a..634281428bb898 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -91,8 +91,8 @@ class PrintOp final : public Operator { return true; } - if (!this->template InputIsType(0, Context::GetDeviceType()) && - !this->template InputIsType(0, CPU)) { + if (!this->InputIsTensorType(0, Context::GetDeviceType()) && + !this->InputIsTensorType(0, CPU)) { LOG(INFO) << "Blob of type: " << OperatorBase::Inputs().at(0)->meta().name(); return true; @@ -113,7 +113,7 @@ class PrintOp final : public Operator { unsigned char, std::string>; - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return DispatchHelper::call( this, this->template Input(0, CPU)); } else { @@ -129,7 +129,7 @@ class PrintOp final : public Operator { // will handle memory deallocation itself so no smart pointer is needed. 
const TensorCPU* tensor; Tensor tensor_copy_if_needed(CPU); - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { tensor = &this->template Input(0, CPU); } else { tensor_copy_if_needed.CopyFrom(Input(0), &context_); diff --git a/caffe2/operators/while_op.h b/caffe2/operators/while_op.h index 258862b690e4a6..3e90341bcdd7ef 100644 --- a/caffe2/operators/while_op.h +++ b/caffe2/operators/while_op.h @@ -35,7 +35,7 @@ class WhileOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in While operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index cb1c9028d5c12b..84dac93753d37a 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,7 +10,7 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->template IsType(CPU), "Blob is not a CPU Tensor: ", name); + blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); } TensorCPU* getTensor(Workspace* ws, const std::string& name) { diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index dadf7f4ee22015..3cd301a8223009 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -88,7 +88,7 @@ class AdamOp final : public Operator { epsilon_(this->template GetSingleArgument("epsilon", 1e-5f)) {} bool RunOnDevice() override { // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(PARAM).size()); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENT_1).size()); diff --git a/caffe2/sgd/fp16_momentum_sgd_op.h b/caffe2/sgd/fp16_momentum_sgd_op.h index 7b1c68634de228..a8f3ce75c7fc2c 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.h +++ b/caffe2/sgd/fp16_momentum_sgd_op.h @@ -37,8 +37,8 @@ class FP16MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/fp32_momentum_sgd_op.h b/caffe2/sgd/fp32_momentum_sgd_op.h index 75907a63501da9..57ea18a097b099 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.h +++ b/caffe2/sgd/fp32_momentum_sgd_op.h @@ -33,8 +33,8 @@ class FP32MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/iter_op.h 
b/caffe2/sgd/iter_op.h index 91709f47f3453a..22ec8d252c455f 100644 --- a/caffe2/sgd/iter_op.h +++ b/caffe2/sgd/iter_op.h @@ -39,7 +39,7 @@ class IterOp final : public Operator { bool RunOnDevice() override { if (InputSize() == 0) { LOG(INFO) << "[Input size is zero]"; - if (!OperatorBase::OutputIsType(0, CPU)) { + if (!OperatorBase::OutputIsTensorType(0, CPU)) { // This is the first run; set the iter to start with 0. LOG(ERROR) << "You are using an old definition of IterOp that will " "be deprecated soon. More specifically, IterOp now " diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h index 6e79d5dbedc7da..c3f25c84c9b8aa 100644 --- a/caffe2/sgd/momentum_sgd_op.h +++ b/caffe2/sgd/momentum_sgd_op.h @@ -47,8 +47,8 @@ class MomentumSGDOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); @@ -87,8 +87,8 @@ class MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE_EQ(Input(LR).size(), 1); CAFFE_ENFORCE_EQ(Input(GRAD).size(), Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h index 249f638bfac03d..94150413df1750 100644 --- a/caffe2/sgd/yellowfin_op.h +++ b/caffe2/sgd/yellowfin_op.h @@ -126,7 +126,7 @@ CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory) CAFFE2_YF_READ_INPUT(GRAD, grad) #undef CAFFE2_YF_READ_OUTPUT -CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); +CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim()); From 09292f2c03fa37eaa67651114d6b1b5943ec53d5 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:24 -0700 Subject: [PATCH 004/237] Some improvements to IValue (#11238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11238 - when moving an IValue, free the old value instead of keeping it allocated - making classes final - moving std::string - making ConstantList const Reviewed By: ezyang Differential Revision: D9644700 fbshipit-source-id: ab7228368e4f00f664ba54e1242b0307d91c5e7e --- torch/csrc/jit/ivalue.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index c8475b7ff86183..0cb821e99a505b 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -13,14 +13,14 @@ template using Shared = c10::intrusive_ptr; // string -struct TORCH_API ConstantString : c10::intrusive_ptr_target { +struct TORCH_API ConstantString final : c10::intrusive_ptr_target { private: const std::string str_; public: ConstantString(std::string 
str) : str_(std::move(str)) {} - static c10::intrusive_ptr create(const std::string str_) { - return c10::make_intrusive(str_); + static c10::intrusive_ptr create(std::string str_) { + return c10::make_intrusive(std::move(str_)); } const std::string & string() const { return str_; @@ -34,9 +34,9 @@ struct TORCH_API ConstantString : c10::intrusive_ptr_target { // non-mutable list template -struct TORCH_API ConstantList : c10::intrusive_ptr_target { +struct TORCH_API ConstantList final : c10::intrusive_ptr_target { private: - std::vector elements_; + const std::vector elements_; public: ConstantList(std::vector elements_) : elements_(std::move(elements_)) {} @@ -67,7 +67,7 @@ using DoubleList = ConstantList; #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) -struct TORCH_API IValue { +struct TORCH_API IValue final { IValue() : payload(0) , tag(Tag::None) @@ -89,12 +89,12 @@ struct TORCH_API IValue { } } IValue & operator=(IValue && rhs) & noexcept { - rhs.swap(*this); + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None return *this; } IValue & operator=(IValue const & rhs) & { - IValue(rhs).swap(*this); - return *this; + IValue(rhs).swap(*this); + return *this; } void swap(IValue & rhs) noexcept { std::swap(payload, rhs.payload); @@ -173,7 +173,7 @@ struct TORCH_API IValue { IValue(c10::intrusive_ptr v); IValue(std::vector v); IValue(at::ArrayRef v) - : IValue(std::vector(v.begin(), v.end())) {} + : IValue(v.vec()) {} bool isIntList() const { return Tag::IntList == tag; } c10::intrusive_ptr toIntList() && { JIT_ASSERT(isIntList()); @@ -190,7 +190,7 @@ struct TORCH_API IValue { // ConstantString IValue(c10::intrusive_ptr v); - IValue(const std::string& v); + IValue(std::string v); bool isString() const { return Tag::String == tag; } c10::intrusive_ptr toString() && { JIT_ASSERT(isString()); @@ -369,8 +369,8 @@ inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { as_intrusive_ptr = v.release(); } -inline IValue::IValue(const std::string& v) -: IValue(ConstantString::create(v)) {} +inline IValue::IValue(std::string v) +: IValue(ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::DoubleList), is_intrusive_ptr(true) { From 252f93df091ad4c282f38312a42d0865a47e2de6 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:27 -0700 Subject: [PATCH 005/237] Improve Tensor() constructor (#11258) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11258 The two intrusive_ptr constructors in Tensor can be combined into one implementation that does both, moving and copying. 
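For reference, a minimal standalone sketch of the by-value-plus-move idiom this relies on (using std::shared_ptr and a toy Handle type of my own so it compiles on its own; the diff below applies the same idea to the c10::intrusive_ptr member of Tensor):

```cpp
#include <memory>
#include <utility>

// One constructor taking the smart pointer by value covers both call patterns:
// lvalue arguments copy-construct the parameter, rvalue arguments move-construct
// it, and the body then moves the parameter into the member either way.
struct Handle {
  Handle(std::shared_ptr<int> impl) : impl_(std::move(impl)) {}
  std::shared_ptr<int> impl_;
};

int main() {
  auto p = std::make_shared<int>(42);
  Handle copied(p);                        // lvalue caller: one copy into the parameter, then a move
  Handle moved(std::make_shared<int>(7));  // rvalue caller: moved all the way through
  return copied.impl_ && moved.impl_ ? 0 : 1;
}
```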
Reviewed By: ezyang Differential Revision: D9652088 fbshipit-source-id: 5efca02654ba305c99c20bbeb83551469d17a51d --- aten/src/ATen/templates/Tensor.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 77c18466404c83..b96b2cf13d86be 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -51,9 +51,7 @@ struct AT_API Tensor { c10::raw::intrusive_ptr::incref(tensor_impl); } } - Tensor(const c10::intrusive_ptr& ptr) - : tensor_impl_(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) + Tensor(c10::intrusive_ptr ptr) : tensor_impl_(std::move(ptr)) {} Tensor(const Tensor&) = default; From b0c139727112423fdf1f989b85da0e3e5af28650 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:29 -0700 Subject: [PATCH 006/237] Fix intrusive_ptr move/copy for different NullType's (#11260) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11260 This is needed to make something like this work: intrusive_ptr a = make_intrusive(...); Reviewed By: ezyang Differential Revision: D9652089 fbshipit-source-id: 19c65e98460ccb27bc69e36d7e558cb9d6e67615 --- aten/src/ATen/core/intrusive_ptr.h | 73 +++--------- aten/src/ATen/core/intrusive_ptr_test.cpp | 115 ++++++++++++++++++- aten/src/ATen/native/sparse/SparseTensor.cpp | 2 +- torch/csrc/autograd/variable.h | 10 +- 4 files changed, 133 insertions(+), 67 deletions(-) diff --git a/aten/src/ATen/core/intrusive_ptr.h b/aten/src/ATen/core/intrusive_ptr.h index 65c6b5e702f2a0..961915555a3756 100644 --- a/aten/src/ATen/core/intrusive_ptr.h +++ b/aten/src/ATen/core/intrusive_ptr.h @@ -119,6 +119,15 @@ struct AT_CORE_EXPORT intrusive_target_default_null_type final { return nullptr; } }; + +template +TTarget* assign_ptr_(TTarget* rhs) { + if (FromNullType::singleton() == rhs) { + return ToNullType::singleton(); + } else { + return rhs; + } +} } // namespace detail template @@ -191,17 +200,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr(intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -212,17 +214,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr( const intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -240,13 +235,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. 
intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr move assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -261,13 +249,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = rhs; swap(tmp); return *this; @@ -464,17 +445,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( weak_intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -486,17 +460,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( const weak_intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -515,13 +482,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -537,13 +497,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. 
weak_intrusive_ptr copy assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = rhs; swap(tmp); return *this; diff --git a/aten/src/ATen/core/intrusive_ptr_test.cpp b/aten/src/ATen/core/intrusive_ptr_test.cpp index df98563f1ff1fc..4d0701ebe164d7 100644 --- a/aten/src/ATen/core/intrusive_ptr_test.cpp +++ b/aten/src/ATen/core/intrusive_ptr_test.cpp @@ -59,6 +59,23 @@ class ChildDestructableMock final : public DestructableMock { ChildDestructableMock(bool* resourcesReleased, bool* wasDestructed) : DestructableMock(resourcesReleased, wasDestructed) {} }; +class NullType1 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType1::singleton_; +class NullType2 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType2::singleton_; +static_assert(NullType1::singleton() != NullType2::singleton(), ""); } // namespace static_assert( @@ -262,6 +279,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -359,6 +389,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -420,6 +463,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -482,6 +537,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, SwapFunction) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -1520,9 +1587,9 @@ weak_intrusive_ptr make_weak_only(Args&&... 
args) { auto intrusive = make_intrusive(std::forward(args)...); return weak_intrusive_ptr(intrusive); } -template -weak_intrusive_ptr make_invalid_weak() { - return weak_intrusive_ptr(intrusive_ptr()); +template > +weak_intrusive_ptr make_invalid_weak() { + return weak_intrusive_ptr(intrusive_ptr()); } } // namespace @@ -1752,6 +1819,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { @@ -1930,6 +2008,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { @@ -2014,6 +2103,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { @@ -2097,6 +2196,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST(WeakIntrusivePtrTest, SwapFunction) { IntrusiveAndWeak obj1 = make_weak_intrusive(); IntrusiveAndWeak obj2 = make_weak_intrusive(); diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index c96e577ba9d47a..b09feb0dc1e7dd 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -68,7 +68,7 @@ SparseTensor new_sparse(const SparseType& dtype) { } else { type_id = SparseCPUTensorId(); } - return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType()).release(), /* retain */ false); + return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType())); } /*** Helper methods ***/ diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 5a59afda5a8640..c5057711edfcd7 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -266,6 +266,7 @@ struct TORCH_API Variable : public at::Tensor { //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Variable(Variable::Impl* self, bool retain); + Variable(c10::intrusive_ptr self); Impl* get() const; }; @@ -417,7 +418,7 @@ inline Variable make_variable_view( Edge gradient_edge = Edge()) { if (data.defined()) { return Variable(c10::make_intrusive( - std::move(base), std::move(data), std::move(gradient_edge)).release(), false); + 
std::move(base), std::move(data), std::move(gradient_edge))); } return Variable(); } @@ -427,7 +428,7 @@ inline Variable make_variable(at::Tensor data, bool requires_grad = false) { !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - return Variable(c10::make_intrusive(data, requires_grad).release(), false); + return Variable(c10::make_intrusive(data, requires_grad)); } return Variable(); } @@ -437,7 +438,7 @@ inline Variable make_variable(at::Tensor data, Edge gradient_edge) { !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - return Variable(c10::make_intrusive(data, false, std::move(gradient_edge)).release(), false); + return Variable(c10::make_intrusive(data, false, std::move(gradient_edge))); } return Variable(); } @@ -597,6 +598,9 @@ inline PyObject* Variable::pyobj() const noexcept { inline Variable::Variable(Variable::Impl* self, bool retain) : at::Tensor(self, retain) {} +inline Variable::Variable(c10::intrusive_ptr self) + : at::Tensor(std::move(self)) {} + inline Variable::Impl* Variable::get() const { AT_CHECK(defined(), "Called Variable::get() on an undefined Variable"); return static_cast(tensor_impl_.get()); From 198ade74f929bb4f0d8b743a517aaeeaa219cadf Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Mon, 10 Sep 2018 12:31:34 -0700 Subject: [PATCH 007/237] Remove manual refcounting from Tensor class (#11294) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11294 The Tensor(ptr, retain) constructor is error prone and circumvents the intrusive_ptr safety. This diff removes that and pushes the responsibility to callers. Step by step, manual refcounting can be pushed back and possibly eliminated in the end. 
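As a sketch of what call sites look like after this change (a hypothetical wrap_retained helper of my own, mirroring the retainTensorImpl / unsafeTensorFromTH changes in the diff below; the exact template arguments are my reconstruction and assume the usual ATen headers):

```cpp
#include <ATen/ATen.h>

// Instead of the removed Tensor(impl, /*retain=*/true), the caller now states its
// intent explicitly: bump the refcount to keep its own reference alive, then let
// intrusive_ptr::reclaim take ownership of exactly one reference.
static at::Tensor wrap_retained(at::TensorImpl* impl) {
  c10::raw::intrusive_ptr::incref(impl);  // caller keeps its reference
  return at::Tensor(
      c10::intrusive_ptr<at::TensorImpl, at::UndefinedTensor>::reclaim(impl));
}
```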
Reviewed By: ezyang Differential Revision: D9663476 fbshipit-source-id: 7f010e5e47b137a9575960201c5bf5d552c5c2f5 --- aten/src/ATen/core/C++17.h | 2 ++ aten/src/ATen/function_wrapper.py | 6 +++-- aten/src/ATen/templates/Tensor.h | 12 +++------- aten/src/ATen/templates/TypeDefault.cpp | 6 ++++- .../src/THC/generic/THCTensorMathPointwise.cu | 23 ++++++++++++------- torch/csrc/autograd/variable.cpp | 4 +++- torch/csrc/autograd/variable.h | 4 ---- torch/csrc/jit/ivalue.h | 9 ++++++-- 8 files changed, 39 insertions(+), 27 deletions(-) diff --git a/aten/src/ATen/core/C++17.h b/aten/src/ATen/core/C++17.h index d8440ceea0c21a..5964a9dea36b2f 100644 --- a/aten/src/ATen/core/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -82,6 +82,7 @@ template using remove_cv_t = std::remove_cv_t; template using result_of_t = std::result_of_t; template using decay_t = std::decay_t; template using remove_const_t = std::remove_const_t; +template using remove_pointer_t = std::remove_pointer_t; #else template using conditional_t = typename std::conditional::type; template using enable_if_t = typename std::enable_if::type; @@ -91,6 +92,7 @@ template using remove_cv_t = typename std::remove_cv::type; template using result_of_t = typename std::result_of::type; template using decay_t = typename std::decay::type; template using remove_const_t = typename std::remove_const::type; +template using remove_pointer_t = typename std::remove_pointer::type; #endif diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index e87a7bb88f8eb4..85b99469cdee15 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -1299,7 +1299,7 @@ def allocate_arg(env, arg, output_count): .format(name, name)) return [ 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor({}, false);'.format(name, tensor_arg), + 'auto {} = Tensor(c10::intrusive_ptr::reclaim({}));'.format(name, tensor_arg), ] def resize_arg(arg): @@ -1507,7 +1507,9 @@ def emit_body(env, option): else "" wrapped_tensor = CodeTemplate(ALLOC_WRAP[ret['type']]).substitute( env, arguments=[call]) - return_tensor = "return Tensor((${wrapped_tensor})${maybe_scalar},false);" + return_tensor = ( + "return Tensor(" + + "c10::intrusive_ptr::reclaim((${wrapped_tensor})${maybe_scalar}));") body.append(CodeTemplate(return_tensor).substitute( env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) # return the same underlying Tensor type for both real and accreal; this ensures diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index b96b2cf13d86be..e54c6e4b62fc40 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -41,18 +41,12 @@ namespace at { // special care must be taken to handle this. 
struct AT_API Tensor { Tensor(){}; - Tensor(TensorImpl* tensor_impl, bool retain) - : tensor_impl_(c10::intrusive_ptr::reclaim( - tensor_impl)) { - if (tensor_impl == nullptr) { + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } } - Tensor(c10::intrusive_ptr ptr) - : tensor_impl_(std::move(ptr)) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 5e614edc57f216..54c6c282dcb95d 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -107,7 +107,11 @@ Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) co return Storage(scalarType(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { - return Tensor(static_cast(th_pointer), retain); + auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); + if (retain && tensor_impl.get() != UndefinedTensor::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl.get()); + } + return Tensor(std::move(tensor_impl)); } Storage TypeDefault::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain && th_pointer) { diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 9192d6c9f9a1d5..2d69e60a40fed3 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -228,35 +228,42 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, #endif +namespace { +c10::intrusive_ptr retainTensorImpl(THCTensor* self) { + c10::raw::intrusive_ptr::incref(self); + return c10::intrusive_ptr::reclaim(self); +} +} + THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::add_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::add_out(out, retainTensorImpl(src1), retainTensorImpl(src2), alpha); } THC_API void THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::sub_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::sub_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2)), alpha); } THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::mul_out(out, at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } THC_API void @@ -370,8 +377,8 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, scalar_t value, THCTens THC_API void THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::div_out(out, 
at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::div_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } THC_API void diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index a3bfd2f7749a60..3808aef75c4d57 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -108,7 +108,9 @@ std::shared_ptr Variable::Impl::get_grad_accumulator() { if (result) return result; - result = std::make_shared(Variable(this, true)); + c10::raw::intrusive_ptr::incref(this); + auto intrusive_from_this = c10::intrusive_ptr::reclaim(this); + result = std::make_shared(Variable(std::move(intrusive_from_this))); grad_accumulator_ = result; return result; } diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index c5057711edfcd7..bd2e475645975a 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -265,7 +265,6 @@ struct TORCH_API Variable : public at::Tensor { // Private Methods //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Variable(Variable::Impl* self, bool retain); Variable(c10::intrusive_ptr self); Impl* get() const; }; @@ -595,9 +594,6 @@ inline PyObject* Variable::pyobj() const noexcept { // Private Methods //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -inline Variable::Variable(Variable::Impl* self, bool retain) - : at::Tensor(self, retain) {} - inline Variable::Variable(c10::intrusive_ptr self) : at::Tensor(std::move(self)) {} diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 0cb821e99a505b..7d9bea6e678eec 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -118,13 +118,18 @@ struct TORCH_API IValue final { bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(as_tensor_impl, /*retain=*/false); + at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); clearToNone(); return t; } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - return at::Tensor(as_tensor_impl, /*retain=*/true); + JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != at::UndefinedTensor::singleton())); + auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(tensor_impl.get()); + } + return at::Tensor(std::move(tensor_impl)); } // Tuple From ea0ee77c61beacfe67c0209de4e142e22ce136f1 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Mon, 10 Sep 2018 12:36:38 -0700 Subject: [PATCH 008/237] Fix katex math rendering (#11472) Summary: I'm 80% sure that this fixes the math bug. But I can't repro locally so I don't know. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11472 Differential Revision: D9755328 Pulled By: SsnL fbshipit-source-id: 130be664d3c6ceee3c0c166c1a86fc9ec3b79d74 --- docs/source/conf.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 1b4d8d6ff83195..cbcdf707f2fdb4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,22 +55,6 @@ 'sphinxcontrib.katex', ] -# katex (mathjax replacement) macros -# -# - -katex_macros = r''' -"\\op": "\\operatorname{{#1}}", -"\\i": "\\mathrm{i}", -"\\e": "\\mathrm{e}^{#1}", -"\\w": "\\omega", -"\\vec": "\\mathbf{#1}", -"\\x": "\\vec{x}", -"\\d": "\\operatorname{d}\\!{}", -"\\dirac": "\\operatorname{\\delta}\\left(#1\\right)", -"\\scalarprod": "\\left\\langle#1,#2\\right\\rangle", -''' - # katex options # # @@ -78,10 +62,9 @@ katex_options = r''' delimiters : [ {left: "$$", right: "$$", display: true}, - {left: "\\(", right: "\\)", display: true}, + {left: "\\(", right: "\\)", display: false}, {left: "\\[", right: "\\]", display: true} -], -strict : false +] ''' napoleon_use_ivar = True From 18e5fd36c2d8b55520aee0998258a61dc696602a Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Mon, 10 Sep 2018 13:42:51 -0700 Subject: [PATCH 009/237] Normalize gradients before reduction in DistributedDataParallelC10d (#11109) Summary: Normalizing by the world size before the reduction is less likely to cause overflow in FP16 training. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11109 Differential Revision: D9594708 Pulled By: myleott fbshipit-source-id: 93ab53cb782ee1cbe1264e529b333490a0940338 --- test/test_c10d.py | 31 +++++++++++++++++++++++++++ torch/nn/parallel/distributed_c10d.py | 4 +++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/test/test_c10d.py b/test/test_c10d.py index 755982246ac4aa..4e804ee1635ac4 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -722,6 +722,37 @@ def test_sync_params_with_buffers(self): for i, buffer in enumerate(device_data): self.assertEqual(buffer, target[i]) + @skip_if_not_multigpu + @skip_if_not_nccl + def test_fp16(self): + store = c10d.TCPStore('localhost', self.port, self.rank == 0) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + gpus = gpus_for_rank(self.world_size)[self.rank] + model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half() + nn.init.constant_(model.weight, 1) + ddp_model = distributed_c10d._DistributedDataParallelC10d( + model, + device_ids=[gpus[0]], + process_group=process_group, + bucket_cap_mb=1, + ) + + # Input 2**15, so that the gradients will overflow with a + # world_size of 2, unless we normalize the gradient by the + # world_size before the reduction + input = torch.Tensor([[2**15]]).cuda(gpus[0]).half() + + # Step model + ddp_model.train() + output = ddp_model(input) + loss = output.sum() + loss.backward() + + self.assertFalse( + any(torch.isinf(p.grad).any() for p in ddp_model.parameters()) + ) + if __name__ == '__main__': assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process" diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 78a3bbfc2c79fa..61acf2c8784f04 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -332,6 +332,9 @@ def _queue_reduction(self, bucket_idx): if len(self.device_ids) > 1: nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) + # divide by the 
number of processes here to reduce chances of overflow + grads_batch_coalesced[0] /= self.process_group.size() + # now work on the first gpu reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], self.allreduce_opts) @@ -345,7 +348,6 @@ def _sync_reduction_works(self): # wait will let current stream wait on the c10d reduction stream self.reduction_works[bucket_idx].wait() - self.buckets_coalesced[bucket_idx] /= self.process_group.size() grads_batch_reduced = _unflatten_dense_tensors( self.buckets_coalesced[bucket_idx], grads_batch[0]) From 35008e0a1a95420033ddad2ce60c5e3450bda71c Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 14:04:14 -0700 Subject: [PATCH 010/237] Add flags to fix half comparison and test (#11395) Summary: The controller you requested could not be found. found there are some issues when using comparison operators for half types when certain THC header are included. I was able to reproduce and added a test. I also fix the issue by adding the proper definitions to avoid this issue. Reported in https://github.com/pytorch/pytorch/pull/10301#issuecomment-416773333 Related: https://github.com/pytorch/tutorials/pull/292 soumith fmassa Pull Request resolved: https://github.com/pytorch/pytorch/pull/11395 Differential Revision: D9725102 Pulled By: goldsborough fbshipit-source-id: 630425829046bbebea3409bb792a9d62c91f41ad --- .gitignore | 1 + test/cpp_extensions/half_support.cpp | 0 test/cpp_extensions/half_support.cu | 19 +++++++++++++ test/test_cpp_extensions.py | 41 ++++++++++++++++++++++++++++ torch/utils/cpp_extension.py | 10 +++++-- 5 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 test/cpp_extensions/half_support.cpp create mode 100644 test/cpp_extensions/half_support.cu diff --git a/.gitignore b/.gitignore index da783554fe1120..110046e09abf94 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ test/data/legacy_modules.t7 test/data/legacy_serialized.pt test/data/linear.pt test/htmlcov +test/cpp_extensions/install/ third_party/build/ tools/shared/_utils_internal.py torch.egg-info/ diff --git a/test/cpp_extensions/half_support.cpp b/test/cpp_extensions/half_support.cpp new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/test/cpp_extensions/half_support.cu b/test/cpp_extensions/half_support.cu new file mode 100644 index 00000000000000..a3621bfe7c55fb --- /dev/null +++ b/test/cpp_extensions/half_support.cu @@ -0,0 +1,19 @@ +#include + +#include + +template +__global__ void half_test_kernel(const T* input, U* output) { + if (input[0] < input[1] || input[0] >= input[1]) { + output[0] = 123; + } +} + +at::Tensor half_test(at::Tensor input) { + auto output = at::empty(1, input.options().dtype(at::kFloat)); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] { + half_test_kernel + <<<1, 1>>>(input.data(), output.data()); + }); + return output; +} diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index a5312cd0383008..f24571e6aad899 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -274,6 +274,47 @@ def test_complex_registration(self): torch.empty(2, 2, dtype=torch.complex64) + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + def test_half_support(self): + ''' + Checks for an issue with operator< ambiguity for half when certain + THC headers are included. + + See https://github.com/pytorch/pytorch/pull/10301#issuecomment-416773333 + for the corresponding issue. 
+ ''' + cuda_source = ''' + #include + + template + __global__ void half_test_kernel(const T* input, U* output) { + if (input[0] < input[1] || input[0] >= input[1]) { + output[0] = 123; + } + } + + at::Tensor half_test(at::Tensor input) { + auto output = at::empty(1, input.options().dtype(at::kFloat)); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] { + half_test_kernel<<<1, 1>>>( + input.data(), + output.data()); + }); + return output; + } + ''' + + module = torch.utils.cpp_extension.load_inline( + name='half_test_extension', + cpp_sources='at::Tensor half_test(at::Tensor input);', + cuda_sources=cuda_source, + functions=['half_test'], + verbose=True) + + x = torch.randn(3, device='cuda', dtype=torch.half) + result = module.half_test(x) + self.assertEqual(result[0], 123) + if __name__ == '__main__': common.run_tests() diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 0af802625087c5..43edbd2f8bd055 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -69,6 +69,12 @@ def _find_cuda_home(): # it the below pattern. BUILT_FROM_SOURCE_VERSION_PATTERN = re.compile(r'\d+\.\d+\.\d+\w+\+\w+') +COMMON_NVCC_FLAGS = [ + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', +] + def is_binary_build(): return not BUILT_FROM_SOURCE_VERSION_PATTERN.match(torch.version.__version__) @@ -165,7 +171,7 @@ def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): self.compiler.set_executable('compiler_so', nvcc) if isinstance(cflags, dict): cflags = cflags['nvcc'] - cflags += ['--compiler-options', "'-fPIC'"] + cflags = COMMON_NVCC_FLAGS + ['--compiler-options', "'-fPIC'"] + cflags elif isinstance(cflags, dict): cflags = cflags['cxx'] # NVCC does not allow multiple -std to be passed, so we avoid @@ -831,7 +837,7 @@ def _write_ninja_file(path, flags = ['cflags = {}'.format(' '.join(cflags))] if with_cuda: - cuda_flags = common_cflags + cuda_flags = common_cflags + COMMON_NVCC_FLAGS if sys.platform == 'win32': cuda_flags = _nt_quote_args(cuda_flags) else: From 70d93f4777be108beea1bb0bff02e6213b49ea9a Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Mon, 10 Sep 2018 14:38:32 -0700 Subject: [PATCH 011/237] Check for maximum numel in NCCL broadcasting (#11466) Summary: NCCL1 uses `int` as its numerical type for fields like `count`, which makes broadcasting tensors larger than `2 << 31 - 1` impossible, and raises opaque error `invalid arguments`. NCCL2 greatly increase the limit on many platforms by using `size_t`. This patch statically detects this type, and raises properly if the broadcast tensor exceeds the limit. No test because I don't think our test suite should broadcast big tensors. 
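For reference, a self-contained sketch of the compile-time trick the check relies on (reading off the type of a function's second parameter). `fake_bcast` below is only a stand-in with an NCCL1-style signature; the real code applies the same trait to `ncclBcast` itself:

```cpp
// Standalone sketch of how torch/csrc/cuda/nccl.cpp detects the "count" type
// of the broadcast API at compile time. fake_bcast stands in for ncclBcast.
#include <iostream>
#include <limits>
#include <type_traits>

void fake_bcast(void* buff, int count, int root);  // never called, only inspected

// Yields the (decayed) type of the second parameter of a function type.
template <typename T>
struct GetSecondArgType;

template <typename R, typename Arg0, typename Arg1, typename... Rest>
struct GetSecondArgType<R(Arg0, Arg1, Rest...)> {
  using type = typename std::decay<Arg1>::type;
};

using count_t = GetSecondArgType<decltype(fake_bcast)>::type;
constexpr auto count_max = std::numeric_limits<count_t>::max();

int main() {
  static_assert(std::is_same<count_t, int>::value,
                "with an int count we are limited to 2^31 - 1 elements");
  std::cout << "largest broadcastable element count: " << count_max << "\n";
  // The patch then checks tensor.numel() against count_max before calling
  // ncclBcast and raises a readable error instead of "invalid arguments".
  return 0;
}
```

Because the limit is derived from the API's own signature, the guard stays correct on NCCL2 builds (where the count type is wider) without any per-version #ifdefs.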
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11466 Differential Revision: D9754753 Pulled By: SsnL fbshipit-source-id: 73506450cae047e06b5b225b39efdb42d5d26685 --- torch/csrc/cuda/comm.cpp | 3 +++ torch/csrc/cuda/nccl.cpp | 27 +++++++++++++++++++++++++++ torch/csrc/cuda/nccl.h | 2 ++ 3 files changed, 32 insertions(+) diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 5ca60c7c62a2a0..716a1d30c3c9cd 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -72,6 +72,9 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu [&](const at::Tensor& t) { return t.get_device() == devices[0]; })) { throw std::runtime_error("all tensors must be on devices[0]"); } +#ifdef USE_NCCL + buffer_size = std::min(torch::cuda::nccl::get_max_count(), buffer_size); +#endif tensor_list2d outputs(devices.size()); outputs[0] = tensors.vec(); diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index f69e69e633b2e6..ab0fecfd761c24 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include #include @@ -177,6 +179,28 @@ std::uint64_t version() { #endif } +namespace { + // NCCL changed the numerical type used for count between NCCL1 and NCCL2. + // So we use the following struct, which gets the type of the second argument + // of T, if T is a function type, with ncclBcast, to get that type statically + // and programmatically. + + template + struct GetSecondArgType; + + template + struct GetSecondArgType { + typedef typename std::decay::type type; + }; + + constexpr auto count_max = std::numeric_limits::type>::max(); +} + +size_t get_max_count() { + return count_max; +} + + void broadcast(TensorList tensors, const stream_list& streams, const comm_list& user_comms) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; @@ -192,6 +216,9 @@ void broadcast(TensorList tensors, const stream_list& streams, const comm_list& device_guard.set_index(tensors[i].get_device()); // TODO: use current stream const auto stream = (streams.empty() || !streams[i]) ? nullptr : THCStream_stream(streams[i]); + AT_CHECK(static_cast(numel) <= static_cast(count_max), + "Broadcast tensor has ", numel, " elements, which exceeds the " + "maximum NCCL supports (", count_max, ")"); CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream)); } #else diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index 57bb5952454e38..919363474b9f9f 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -47,4 +47,6 @@ void broadcast(at::TensorList tensors, const stream_list& streams = {}, const comm_list& user_comms = {}); +size_t get_max_count(); + }}} From 3e665cc29bbf429fef742450528b2272a3bc17af Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Mon, 10 Sep 2018 15:01:07 -0700 Subject: [PATCH 012/237] Improve support for tracing sizes, add more tracer warnings (#11288) Summary: Many constructors like `torch.zeros` or `torch.randn` didn't support size tracing correctly which is fixed by this pass. Same issue has been fixed in legacy tensor constructors. Additionally, new tensor constructors, which do not participate in tracing (most notably `torch.tensor`, `torch.as_tensor` and `torch.from_numpy`) raise a warning when they are used. Finally, entering a traceable operation disables the tracing in its body. 
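A condensed, standalone sketch of what that last point amounts to (toy names — `TracingState`, `g_tracing_state`, `some_op` are invented here; the generated PRE/POST_RECORD_TRACE code below does the real version): stash the thread-local tracing state, null it while the op body runs, then restore it to record outputs:

```cpp
// Toy version of the "disable tracing inside a traced op" pattern.
#include <cassert>
#include <memory>

struct TracingState { int recorded_nodes = 0; };
thread_local std::shared_ptr<TracingState> g_tracing_state;

bool is_tracing() { return g_tracing_state != nullptr; }

int some_op(int x) {
  std::shared_ptr<TracingState> stashed;
  if (is_tracing()) {
    stashed = g_tracing_state;
    stashed->recorded_nodes++;   // record this call as a single node...
    g_tracing_state = nullptr;   // ...and hide the tracer from the body
  }
  int result = x * 2;            // any "ops" called here see no tracer
  assert(!is_tracing());
  if (stashed) {
    g_tracing_state = std::move(stashed);  // restore to record outputs
  }
  return result;
}

int main() {
  g_tracing_state = std::make_shared<TracingState>();
  some_op(21);
  assert(g_tracing_state && g_tracing_state->recorded_nodes == 1);
  return 0;
}
```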
This is needed because zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11288 Reviewed By: ezyang Differential Revision: D9751183 Pulled By: apaszke fbshipit-source-id: 51444a39d76a3e164adc396c432fd5ee3c8d5f7f --- ...ript.test_index_put_trace_with_view.expect | 4 +-- ...t.test_index_put_trace_without_view.expect | 4 +-- test/onnx/test_pytorch_onnx_caffe2.py | 8 ++--- test/test_jit.py | 6 ++-- tools/autograd/gen_variable_type.py | 31 ++++++------------- .../templates/python_torch_functions.cpp | 5 +++ torch/csrc/autograd/python_variable.cpp | 2 ++ torch/csrc/jit/tracer.cpp | 11 ++++--- torch/csrc/jit/tracer.h | 2 ++ torch/csrc/utils/python_arg_parser.h | 2 +- torch/csrc/utils/tensor_new.cpp | 8 ++--- torch/onnx/symbolic.py | 19 ++++++++++++ 12 files changed, 61 insertions(+), 41 deletions(-) diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 37f08643f139a4..cc03d3d5296d08 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -7,6 +7,6 @@ graph(%0 : Double(100) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) %8 : Dynamic[] = prim::ListConstruct(%7) - %20 : Double(100) = aten::index_put(%0, %8, %5) - return (%20); + %9 : Double(100) = aten::index_put(%0, %8, %5) + return (%9); } diff --git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 772308223b454b..c72506796064b2 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -4,6 +4,6 @@ graph(%0 : Double(100) %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) %5 : Dynamic[] = prim::ListConstruct(%4) - %17 : Double(100) = aten::index_put(%0, %5, %2) - return (%17); + %6 : Double(100) = aten::index_put(%0, %5, %2) + return (%6); } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index f8038d2465dbc6..1a9e9cdbff1442 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -346,11 +346,11 @@ def test_rnn_init_predict_split(self): mp = onnx.ModelProto.FromString(do_export(model, input, export_params=self.embed_params)[0]) prepared = c2.prepare(mp, device='CPU') if self.embed_params: - assert len(prepared.init_net.op) == 1038 - assert len(prepared.predict_net.op) == 101 + assert len(prepared.init_net.op) == 1019 + assert len(prepared.predict_net.op) == 142 else: - assert len(prepared.init_net.op) == 27 - assert len(prepared.predict_net.op) == 1112 + assert len(prepared.init_net.op) == 8 + assert len(prepared.predict_net.op) == 1153 def test_alexnet(self): state_dict = model_zoo.load_url(model_urls['alexnet'], progress=False) diff --git a/test/test_jit.py b/test/test_jit.py index 342087e277292c..32ccf392f60b93 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -916,6 +916,7 @@ def f(x, y): torch.jit.verify(f, (x, y), loss_fn=lambda z, w: z * w, devices=[]) + @suppress_warnings def test_constant(self): x = torch.randn(2, 2, requires_grad=True) @@ -6197,7 +6198,7 @@ def test_index_put(target, indices, rhs): target[indices] = rhs return target - self.assertExpected(str(test_index_put.graph)) + self.assertExpectedGraph(test_index_put.graph) def test_index_put_trace_without_view(self): @_trace(torch.rand(100), torch.tensor([1, 2, 3, 
4]), torch.rand(4)) @@ -6205,7 +6206,7 @@ def test_index_put(target, indices, rhs): target[indices] = rhs return target - self.assertExpected(str(test_index_put.graph)) + self.assertExpectedGraph(test_index_put.graph) def test_annotated_script_fn(self): @torch.jit.script @@ -6895,6 +6896,7 @@ def forward(self, x): net = Net(upscale_factor=4) self.checkTrace(net, (torch.rand(5, 1, 64, 64),)) + @suppress_warnings def test_time_sequence_prediction(self): class Sequence(torch.jit.ScriptModule): def __init__(self): diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index d6bcb0821e83c0..86dd5fcb98bda0 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -128,13 +128,15 @@ PRE_RECORD_TRACE = CodeTemplate("""\ torch::jit::Node* node = nullptr; +std::shared_ptr tracer_state; if (jit::tracer::isTracing()) { - auto& graph = jit::tracer::getTracingState()->graph; - node = graph->create(jit::aten::${trace_name}, /*outputs=*/0); + tracer_state = jit::tracer::getTracingState(); + node = tracer_state->graph->create(jit::aten::${trace_name}, /*outputs=*/0); jit::tracer::recordSourceLocation(node); ${add_trace_inputs} - graph->appendNode(node); + tracer_state->graph->appendNode(node); ${inplace_guard} + jit::tracer::setTracingState(nullptr); } """) @@ -145,35 +147,20 @@ ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${input}", ${input});""") POST_RECORD_TRACE = CodeTemplate("""\ -if (jit::tracer::isTracing()) { +if (tracer_state) { + jit::tracer::setTracingState(std::move(tracer_state)); ${record_trace_outputs} } """) -RECORD_ATTRIBUTE = CodeTemplate("""\ -setattr(trace_info.n, jit::attr::${attr_name}, ${name});""") - -RECORD_POSITIONAL_ATTRIBUTE = CodeTemplate("""\ -setposattr(trace_info.n, ${i}, "${name}", ${name});""") - -POSITIONAL_ATTR_NYI = """\ -throw std::runtime_error("Can't have size-dependent arguments to functions that " - "take variable number of tensor arguments"); -""" - def should_trace(declaration): - # Operations involving Generator, Storage, Type are not traceable - # at the moment - if any(arg['simple_type'] in {'Generator', 'Storage', 'ScalarType', 'Type', 'optional'} - for arg in declaration['arguments']): + # Operations involving Storage or Type are not traceable at the moment + if any(arg['simple_type'] in {'Storage', 'Type'} for arg in declaration['arguments']): return False # We can't trace functions which don't have any Tensor or TensorList returns if 'Tensor' not in declaration['return_type']: return False - tensor_args = [arg for arg in declaration['arguments'] if arg['simple_type'] in {'Tensor', 'TensorList'}] - if len(tensor_args) == 0: - return False name = declaration['name'] base_name = name[:-1] if declaration['inplace'] else name[:-4] if name.endswith('_out') else name if base_name in DONT_RECORD_TRACE: diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 144771290d1c89..a7174b52e52b88 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -19,6 +19,7 @@ #include "torch/csrc/utils/tensor_layouts.h" #include "torch/csrc/utils/tensor_new.h" #include "torch/csrc/utils/tensor_numpy.h" +#include "torch/csrc/jit/tracer.h" #include "torch/csrc/autograd/generated/variable_factories.h" #include @@ -320,6 +321,7 @@ static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* static PyObject * THPVariable_as_tensor(PyObject* 
self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.as_tensor"); return THPVariable_Wrap(torch::utils::as_tensor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -327,6 +329,7 @@ static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.from_numpy"); auto data = torch::utils::tensor_from_numpy(arg); return THPVariable_Wrap(make_variable(std::move(data), /*requires_grad=*/false)); END_HANDLE_TH_ERRORS @@ -351,6 +354,7 @@ static PyObject * THPVariable__promote_types(PyObject* self, PyObject* args, PyO static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.sparse_coo_tensor"); return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -358,6 +362,7 @@ static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.tensor"); return THPVariable_Wrap(torch::utils::tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index b1c2d85592d6dd..5666da28135f1f 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -22,6 +22,7 @@ #include "torch/csrc/utils/python_strings.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/tensor_new.h" +#include "torch/csrc/jit/tracer.h" #include @@ -125,6 +126,7 @@ static void THPVariable_dealloc(THPVariable* self) static PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS + jit::tracer::warn("torch.Tensor"); auto& default_type = torch::tensors::get_default_tensor_type(); auto tensor = torch::utils::legacy_tensor_ctor(default_type, args, kwargs); return THPVariable_NewWithVar(type, std::move(tensor)); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index d2c4ef9f0da5a7..03e31440ef5c95 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -26,8 +26,9 @@ void genericAddInput(Node *n, T value) { n->addInput(v); } -void badArgType() { - AT_ERROR("Found an unsupported argument type in the JIT tracer. File a bug report."); +template +void badArgType(const T& v) { + AT_ERROR("Found an unsupported argument type in the JIT tracer: ", at::demangle_type(), ". 
File a bug report."); } thread_local std::shared_ptr tracing_state; @@ -39,8 +40,10 @@ void addInputs(Node *n, const char * name, bool value) { detail::g void addInputs(Node *n, const char * name, double value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Scalar& value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Tensor& value) { n->addInput(getValueTrace(value)); } -void addInputs(Node *n, const char * name, const std::string& value) { detail::badArgType(); } -void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(); } +void addInputs(Node *n, const char * name, const std::string& value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::Generator * value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::TensorList value) { Graph *g = n->owningGraph(); diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 73adda456c39f1..37cd9b332923ee 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -172,6 +172,8 @@ TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& val TORCH_API void addInputs(Node *n, const char * name, const std::string& value); TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); +TORCH_API void addInputs(Node *n, const char * name, at::Generator * value); +TORCH_API void addInputs(Node *n, const char * name, at::ScalarType value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index a44ce9b3ed0221..86453a79e30dc2 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -292,7 +292,7 @@ inline std::vector PythonArgs::intlistWithDefault(int i, std::vector parsed_args; auto r = parser.parse(args, kwargs, parsed_args); @@ -525,7 +525,7 @@ Tensor new_empty(const Type& type, PyObject* args, PyObject* kwargs) { Tensor new_full(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new_full(IntList size, Scalar fill_value, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", - }); + }, /*traceable=*/true); ParsedArgs<5> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); @@ -539,7 +539,7 @@ Tensor new_full(const Type& type, PyObject* args, PyObject* kwargs) { Tensor new_ones(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new_ones(IntList size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", - }); + }, /*traceable=*/true); ParsedArgs<4> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); @@ -553,7 +553,7 @@ Tensor new_ones(const Type& type, PyObject* args, PyObject* kwargs) { Tensor new_zeros(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new_zeros(IntList size, *, ScalarType dtype=None, Device? 
device=None, bool requires_grad=False)", - }); + }, /*traceable=*/true); ParsedArgs<4> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 23e95e282c4408..03e93125f40755 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -936,6 +936,25 @@ def zeros_like(g, input): return g.op("Sub", input, input).setType(input.type().contiguous()) +scalar_type_to_onnx = [ + cast_pytorch_to_onnx["Byte"], + cast_pytorch_to_onnx["Char"], + cast_pytorch_to_onnx["Short"], + cast_pytorch_to_onnx["Int"], + cast_pytorch_to_onnx["Long"], + cast_pytorch_to_onnx["Half"], + cast_pytorch_to_onnx["Float"], + cast_pytorch_to_onnx["Double"], +] + + +@parse_args('v', 'i', 'i', 'v') +def zeros(g, shape, scalar_type, layout, device): + # NOTE: no way to set device in ONNX, so we ignore it + return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type], + input_as_shape_i=1, value_f=0) + + def full_like(g, input, fill_value): # TODO: a more efficient implementation (ConstantFill?) return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1))) From a0d4106c075ba8e00ed867364ea1f730d2735e90 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 15:24:47 -0700 Subject: [PATCH 013/237] Integrate custom op tests with CI (#10611) Summary: This PR is stacked on https://github.com/pytorch/pytorch/pull/10610, and only adds changes in one file `.jenkins/pytorch/test.sh`, where we now build the custom op tests and run them. I'd also like to take this PR to discuss whether the [`TorchConfig.cmake`](https://github.com/pytorch/pytorch/blob/master/cmake/TorchConfig.cmake.in) I made is robust enough (we will also see in the CI) orionr Yangqing dzhulgakov what do you think? Also ezyang for CI changes Pull Request resolved: https://github.com/pytorch/pytorch/pull/10611 Differential Revision: D9597627 Pulled By: goldsborough fbshipit-source-id: f5af8164c076894f448cef7e5b356a6b3159f8b3 --- .gitignore | 2 + .jenkins/pytorch/build.sh | 10 ++++ .jenkins/pytorch/macos-test.sh | 22 +++++++++ .jenkins/pytorch/test.sh | 19 +++++++- cmake/TorchConfig.cmake.in | 58 +++++++++++++++--------- setup.py | 34 +++++++++----- test/custom_operator/CMakeLists.txt | 9 ++++ test/custom_operator/model.py | 13 +++++- test/custom_operator/op.h | 2 +- test/custom_operator/test_custom_ops.cpp | 10 ++-- test/custom_operator/test_custom_ops.py | 4 +- torch/CMakeLists.txt | 11 +++-- torch/op.h | 1 + 13 files changed, 150 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 110046e09abf94..e3e7acb8e310cb 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ docs/cpp/html/ docs/cpp/api/ test/.coverage test/cpp/api/mnist +test/custom_operator/model.pt test/data/gpu_tensors.pt test/data/legacy_modules.t7 test/data/legacy_serialized.pt @@ -66,6 +67,7 @@ torch/lib/protoc torch/lib/tmp_install torch/lib/torch_shm_manager torch/lib/python* +torch/share/ torch/version.py # IPython notebook checkpoints diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3ffed384b081b7..f108c37cdd6ed1 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -138,4 +138,14 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then pushd ../cpp-build/caffe2 WERROR=1 VERBOSE=1 DEBUG=1 python $BUILD_LIBTORCH_PY popd + + # Build custom operator tests. 
+ CUSTOM_OP_BUILD="$PWD/../custom-op-build" + CUSTOM_OP_TEST="$PWD/test/custom_operator" + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + mkdir "$CUSTOM_OP_BUILD" + pushd "$CUSTOM_OP_BUILD" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake "$CUSTOM_OP_TEST" + make VERBOSE=1 + popd fi diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 87e0476e418ba5..16d34342c544c8 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -78,13 +78,35 @@ test_cpp_api() { "$CPP_BUILD"/caffe2/bin/test_api } +test_custom_script_ops() { + echo "Testing custom script operators" + pushd test/custom_operator + # Build the custom operator library. + rm -rf build && mkdir build + pushd build + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake .. + make VERBOSE=1 + popd + + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. + build/test_custom_ops ./model.pt + popd +} + + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_all test_cpp_api + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_all elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then test_cpp_api + test_custom_script_ops fi fi diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 05bd71602b9783..bc22fa7a36070a 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -97,7 +97,7 @@ test_aten() { # put the dynamic libraries somewhere were the dynamic linker can find them. # This is a bit of a hack. if [[ "$BUILD_ENVIRONMENT" == *ppc64le* ]]; then - SUDO=sudo + SUDO=sudo fi ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin @@ -140,12 +140,28 @@ test_libtorch() { fi } +test_custom_script_ops() { + if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Testing custom script operators" + CUSTOM_OP_BUILD="$PWD/../custom-op-build" + pushd test/custom_operator + cp -r "$CUSTOM_OP_BUILD" build + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. 
+ build/test_custom_ops ./model.pt + popd + fi +} + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_nn test_python_all_except_nn test_aten test_torchvision test_libtorch + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_nn @@ -154,5 +170,6 @@ else test_aten test_torchvision test_libtorch + test_custom_script_ops fi fi diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index a14b2e1b0e8b44..f177a9c2c3504d 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -17,32 +17,48 @@ # # torch_add_custom_op_library( ) -SET(TORCH_ROOT "${CMAKE_CURRENT_LIST_DIR}/../") +if ($ENV{TORCH_INSTALL_PREFIX}) + set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) +else() + # Assume we are in /share/cmake/Torch/TorchConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +endif() -set(TORCH_INCLUDE_DIRS - "${TORCH_ROOT}" - "${TORCH_ROOT}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src/TH" -) +# Include directories. +set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") -find_library(TORCH_LIBRARY torch PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) -find_library(CAFFE2_LIBRARY caffe2 PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) +# Library dependencies. +find_package(Caffe2 REQUIRED) +find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +set(TORCH_LIBRARIES ${TORCH_LIBRARY} ${Caffe2_MAIN_LIBS}) if (@USE_CUDA@) - find_package(CUDA REQUIRED) - find_library(CAFFE2_CUDA_LIBRARY caffe2_gpu PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) - set(TORCH_CUDA_LIBRARIES -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 cuda nvrtc cudart nvToolsExt) - list(APPEND TORCH_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + if(MSVC) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include") + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + else() + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so + ${CUDA_LIBRARIES}) + endif() + list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() -set(TORCH_LIBRARIES - ${TORCH_LIBRARY} - ${CAFFE2_LIBRARY} - ${CAFFE2_CUDA_LIBRARY} - ${TORCH_CUDA_LIBRARIES}) - # Creates a shared library with the correct include directories # and linker flags set to include Torch header files and link with Torch # libraries. Also sets the C++ standard version to C++11. 
All options @@ -51,5 +67,5 @@ function(torch_add_custom_op_library name source_files) add_library(${name} SHARED ${source_files}) target_include_directories(${name} PUBLIC "${TORCH_INCLUDE_DIRS}") target_link_libraries(${name} "${TORCH_LIBRARIES}") - target_compile_options(${name} PUBLIC -std=c++11) + set_property(TARGET ${name} PROPERTY CXX_STANDARD 11) endfunction(torch_add_custom_op_library) diff --git a/setup.py b/setup.py index 3cc5c0c1642a0c..34d2e7e5402840 100644 --- a/setup.py +++ b/setup.py @@ -443,11 +443,10 @@ def check_file(f): # we need to find a better way to do this. # More information can be found in conversation thread of PR #5772 - self.copy_tree('torch/csrc', 'torch/lib/include/torch/csrc/') + self.copy_tree('torch/lib/tmp_install/share', 'torch/share') self.copy_tree('third_party/pybind11/include/pybind11/', 'torch/lib/include/pybind11') self.copy_file('torch/csrc/torch.h', 'torch/lib/include/torch/torch.h') - self.copy_file('torch/op.h', 'torch/lib/include/torch/op.h') build_dep_cmds = {} @@ -1136,27 +1135,38 @@ def make_relative_rpath(path): 'lib/*.h', 'lib/include/ATen/*.h', 'lib/include/ATen/core/*.h', - 'lib/include/ATen/detail/*.h', - 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', + 'lib/include/ATen/cuda/*.h', + 'lib/include/ATen/cuda/detail/*.cuh', 'lib/include/ATen/cuda/detail/*.h', 'lib/include/ATen/cudnn/*.h', - 'lib/include/ATen/cuda/detail/*.cuh', + 'lib/include/ATen/detail/*.h', + 'lib/include/caffe2/utils/*.h', + 'lib/include/torch/*.h', + 'lib/include/torch/csrc/*.h', + 'lib/include/torch/csrc/api/include/torch/detail/ordered_dict.h', + 'lib/include/torch/csrc/autograd/*.h', + 'lib/include/torch/csrc/autograd/generated/*.h', + 'lib/include/torch/csrc/cuda/*.h', + 'lib/include/torch/csrc/jit/*.h', + 'lib/include/torch/csrc/jit/generated/*.h', + 'lib/include/torch/csrc/jit/passes/*.h', + 'lib/include/torch/csrc/jit/script/*.h', + 'lib/include/torch/csrc/utils/*.h', 'lib/include/pybind11/*.h', 'lib/include/pybind11/detail/*.h', 'lib/include/TH/*.h*', 'lib/include/TH/generic/*.h*', - 'lib/include/THC/*.h*', 'lib/include/THC/*.cuh', + 'lib/include/THC/*.h*', 'lib/include/THC/generic/*.h', 'lib/include/THCUNN/*.cuh', 'lib/include/THNN/*.h', - 'lib/include/torch/csrc/*.h', - 'lib/include/torch/csrc/autograd/*.h', - 'lib/include/torch/csrc/jit/*.h', - 'lib/include/torch/csrc/utils/*.h', - 'lib/include/torch/csrc/cuda/*.h', - 'lib/include/torch/torch.h', + 'share/cmake/ATen/*.cmake', + 'share/cmake/Caffe2/*.cmake', + 'share/cmake/Caffe2/public/*.cmake', + 'share/cmake/Gloo/*.cmake', + 'share/cmake/Torch/*.cmake', ], 'caffe2': [ rel_site_packages + '/caffe2/**/*.py' diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index 114cfda92d83d6..14f605d4f5b3f7 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -4,6 +4,15 @@ project(custom_ops) find_package(Torch REQUIRED) +# This convenience function will create a shared library target, configure +# the right include directories and link against the right libraries. 
It is +# exactly equivalent to the following lines: +# +# add_library(custom_ops SHARED op.cpp) +# target_include_directories(custom_ops PUBLIC "${TORCH_INCLUDE_DIRS}") +# target_link_libraries(custom_ops "${TORCH_LIBRARIES}") +# set_property(TARGET custom_ops PROPERTY CXX_STANDARD 11) +# torch_add_custom_op_library(custom_ops op.cpp) add_executable(test_custom_ops test_custom_ops.cpp) diff --git a/test/custom_operator/model.py b/test/custom_operator/model.py index 6f91bd81c577cd..c803c18d467e13 100644 --- a/test/custom_operator/model.py +++ b/test/custom_operator/model.py @@ -1,9 +1,20 @@ import argparse import os.path +import sys import torch +SHARED_LIBRARY_EXTENSIONS = {'linux': 'so', 'darwin': 'dylib', 'win32': 'dll'} + + +def get_custom_op_library_path(): + extension = SHARED_LIBRARY_EXTENSIONS[sys.platform] + path = os.path.abspath('build/libcustom_ops.{}'.format(extension)) + assert os.path.exists(path), path + return path + + class Model(torch.jit.ScriptModule): def __init__(self): super(Model, self).__init__() @@ -20,7 +31,7 @@ def main(): parser.add_argument("--export-script-module-to", required=True) options = parser.parse_args() - torch.ops.load_library(os.path.abspath('build/libcustom_ops.so')) + torch.ops.load_library(get_custom_op_library_path()) model = Model() model.save(options.export_script_module_to) diff --git a/test/custom_operator/op.h b/test/custom_operator/op.h index d45123d3afccb4..b1c4b5b1647f0b 100644 --- a/test/custom_operator/op.h +++ b/test/custom_operator/op.h @@ -3,7 +3,7 @@ #include #include -std::vector custom_op( +TORCH_API std::vector custom_op( at::Tensor tensor, double scalar, int64_t repeat); diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index d653d0e2ccf9e0..47e6f3618aad7c 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -22,9 +22,12 @@ void get_operator_from_registry_and_execute() { std::vector output; torch::jit::pop(stack, output); + const auto manual = custom_op(torch::ones(5), 2.0, 3); + assert(output.size() == 3); - for (const auto& tensor : output) { - assert(tensor.allclose(torch::ones(5) * 2)); + for (size_t i = 0; i < output.size(); ++i) { + assert(output[i].allclose(torch::ones(5) * 2)); + assert(output[i].allclose(manual[i])); } } @@ -71,10 +74,9 @@ void test_argument_checking_for_serialized_modules( module->forward({}); assert(false); } catch (const at::Error& error) { - std::cout << error.what_without_backtrace() << std::endl; assert( std::string(error.what_without_backtrace()) - .find("custom::op() is missing value for argument 'tensor'") == 0); + .find("forward() is missing value for argument 'input'") == 0); } } diff --git a/test/custom_operator/test_custom_ops.py b/test/custom_operator/test_custom_ops.py index f7e6b2fa7b9e5f..ab6c958acd55fc 100644 --- a/test/custom_operator/test_custom_ops.py +++ b/test/custom_operator/test_custom_ops.py @@ -5,12 +5,12 @@ import torch -from model import Model +from model import Model, get_custom_op_library_path class TestCustomOperators(unittest.TestCase): def setUp(self): - self.library_path = os.path.abspath('build/libcustom_ops.so') + self.library_path = get_custom_op_library_path() torch.ops.load_library(self.library_path) def test_custom_library_is_loaded(self): diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index d5d0ebc663915d..fb0b76e8aeb91f 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -26,7 +26,7 @@ if(NOT TORCH_INSTALL_BIN_DIR) endif() if(NOT 
TORCH_INSTALL_INCLUDE_DIR) - set(TORCH_INSTALL_INCLUDE_DIR include/libtorch) + set(TORCH_INSTALL_INCLUDE_DIR include) endif() if(NOT TORCH_INSTALL_LIB_DIR) @@ -287,8 +287,7 @@ if (MSVC) target_link_libraries(torch onnx onnx_library) endif() -target_link_libraries(torch - caffe2_library) +target_link_libraries(torch caffe2_library) find_package(OpenMP) if(OPENMP_FOUND) @@ -395,6 +394,8 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch FILES_MATCHING PATTERN "*.h") +install(FILES "${TORCH_SRC_DIR}/op.h" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) install(TARGETS torch RUNTIME DESTINATION "${TORCH_INSTALL_BIN_DIR}" @@ -473,3 +474,7 @@ configure_file( ${TORCH_ROOT}/cmake/TorchConfig.cmake.in ${PROJECT_BINARY_DIR}/TorchConfig.cmake @ONLY) +install(FILES + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + DESTINATION share/cmake/Torch) diff --git a/torch/op.h b/torch/op.h index bf6f2aa080c133..9033857d672925 100644 --- a/torch/op.h +++ b/torch/op.h @@ -3,6 +3,7 @@ #include #include #include +#include #include From 0b78ae86c59e7372f0b9d4cadcd0e50bc76f8667 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Mon, 10 Sep 2018 15:30:08 -0700 Subject: [PATCH 014/237] Cleanup byte swapping utilities to generate optimal code on the platforms we care about. (#11394) Summary: While the use of memcpy as part of the byte swapping sequence looks funky, all major compilers recognize and optimize this pattern reliably, resulting in essentially optimal code generation. For example, decodeUInt32LE goes from this on iOS arm64: > ldrb w8, [x0, #3] > ldrb w9, [x0, #2] > bfi w8, w9, #8, #8 > ldrb w9, [x0, #1] > bfi w8, w9, #16, #8 > ldrb w9, [x0] > bfi w8, w9, #24, #8 > mov x0, x8 > ret To this: > ldr w8, [x0] > rev w0, w8 > ret Pull Request resolved: https://github.com/pytorch/pytorch/pull/11394 Reviewed By: SsnL Differential Revision: D9728659 Pulled By: resistor fbshipit-source-id: 9afbd4adfad1d1fb7b01f1179e6707ee21fa726f --- torch/csrc/byte_order.cpp | 106 +++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 29 deletions(-) diff --git a/torch/csrc/byte_order.cpp b/torch/csrc/byte_order.cpp index 8bc1ff10f98c32..7818ca2191b966 100644 --- a/torch/csrc/byte_order.cpp +++ b/torch/csrc/byte_order.cpp @@ -2,34 +2,94 @@ #include +#if defined(_MSC_VER) +#include +#endif + +static inline void swapBytes16(void *ptr) +{ + uint16_t output; + memcpy(&output, ptr, sizeof(uint16_t)); +#if defined(_MSC_VER) && !defined(_DEBUG) + output = _byteswap_ushort(output); +#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC) + output = __builtin_bswap16(output); +#else + uint16_t Hi = output >> 8; + uint16_t Lo = output << 8; + output = Hi | Lo; +#endif + memcpy(ptr, &output, sizeof(uint16_t)); +} + +static inline void swapBytes32(void *ptr) +{ + uint32_t output; + memcpy(&output, ptr, sizeof(uint32_t)); +#if defined(_MSC_VER) && !defined(_DEBUG) + output = _byteswap_ulong(output); +#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC) + output = __builtin_bswap32(output); +#else + uint32_t Byte0 = output & 0x000000FF; + uint32_t Byte1 = output & 0x0000FF00; + uint32_t Byte2 = output & 0x00FF0000; + uint32_t Byte3 = output & 0xFF000000; + output = (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24); +#endif + memcpy(ptr, &output, sizeof(uint32_t)); +} + +static inline void swapBytes64(void *ptr) +{ + uint64_t output; + memcpy(&output, ptr, sizeof(uint64_t)); +#if defined(_MSC_VER) && 
!defined(_DEBUG) + output = _byteswap_uint64(output); +#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC) + output = __builtin_bswap64(output); +#else + uint64_t Hi = SwapByteOrder_32(uint32_t(value)); + uint32_t Lo = SwapByteOrder_32(uint32_t(value >> 32)); + return (Hi << 32) | Lo; +#endif + memcpy(ptr, &output, sizeof(uint64_t)); +} + static inline uint16_t decodeUInt16LE(const uint8_t *data) { - return (data[0]<<0) | (data[1]<<8); + uint16_t output; + memcpy(&output, data, sizeof(uint16_t)); + return output; } static inline uint16_t decodeUInt16BE(const uint8_t *data) { - return (data[1]<<0) | (data[0]<<8); + uint16_t output = decodeUInt16LE(data); + swapBytes16(&output); + return output; } static inline uint32_t decodeUInt32LE(const uint8_t *data) { - return (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); + uint32_t output; + memcpy(&output, data, sizeof(uint32_t)); + return output; } static inline uint32_t decodeUInt32BE(const uint8_t *data) { - return (data[3]<<0) | (data[2]<<8) | (data[1]<<16) | (data[0]<<24); + uint32_t output = decodeUInt32LE(data); + swapBytes32(&output); + return output; } static inline uint64_t decodeUInt64LE(const uint8_t *data) { - return (((uint64_t)data[0])<< 0) | (((uint64_t)data[1])<< 8) | - (((uint64_t)data[2])<<16) | (((uint64_t)data[3])<<24) | - (((uint64_t)data[4])<<32) | (((uint64_t)data[5])<<40) | - (((uint64_t)data[6])<<48) | (((uint64_t)data[7])<<56); + uint64_t output; + memcpy(&output, data, sizeof(uint64_t)); + return output; } static inline uint64_t decodeUInt64BE(const uint8_t *data) { - return (((uint64_t)data[7])<< 0) | (((uint64_t)data[6])<< 8) | - (((uint64_t)data[5])<<16) | (((uint64_t)data[4])<<24) | - (((uint64_t)data[3])<<32) | (((uint64_t)data[2])<<40) | - (((uint64_t)data[1])<<48) | (((uint64_t)data[0])<<56); + uint64_t output = decodeUInt64LE(data); + swapBytes64(&output); + return output; } THPByteOrder THP_nativeByteOrder() @@ -92,24 +152,12 @@ void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, } } -template -static void swapBytes(uint8_t *ptr) -{ - uint8_t tmp; - for (size_t i = 0; i < size / 2; i++) { - tmp = ptr[i]; - ptr[i] = ptr[size-i]; - ptr[size-i] = tmp; - } -} - - void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len) { memcpy(dst, src, sizeof(int16_t) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes16(dst); dst += sizeof(int16_t); } } @@ -120,7 +168,7 @@ void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, memcpy(dst, src, sizeof(int32_t) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes32(dst); dst += sizeof(int32_t); } } @@ -131,7 +179,7 @@ void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, memcpy(dst, src, sizeof(int64_t) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes64(dst); dst += sizeof(int64_t); } } @@ -142,7 +190,7 @@ void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, s memcpy(dst, src, sizeof(float) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) { - swapBytes(dst); + swapBytes32(dst); dst += sizeof(float); } } @@ -153,7 +201,7 @@ void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, memcpy(dst, src, sizeof(double) * len); if (order != THP_nativeByteOrder()) { for (size_t i = 0; i < len; i++) 
{ - swapBytes(dst); + swapBytes64(dst); dst += sizeof(double); } } From f2f43ad2dab024ef4010b63ec2faa64e9affcdac Mon Sep 17 00:00:00 2001 From: Mingda Li Date: Mon, 10 Sep 2018 15:39:01 -0700 Subject: [PATCH 015/237] Add new LengthsSplit operator (#10974) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/10974 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10291 This new operator will do the following: Given a LENGTHS vector and n_splits, output a "split" LENGTHS vector where: 1. Each length in input vector is split into n_splits values (thus output vector should have LENGTHS.size(0) * n_splits elements) 2. The new lengths in output should be evenly split, and if the length is not divisible by n_splits, then order new values in descending order. (e.g. n_splits = 3, length = 5 -> 2 2 1) 3. If n_splits > some element in the array, its split elements will contain 0s. (e.g. n_splits = 3, length = 2 - > 1 1 0) Reviewed By: bddppq, chocjy Differential Revision: D9013119 fbshipit-source-id: 82bf3371ec08c41fc3379177f0007afc142e0d84 --- caffe2/operators/length_split_op.cc | 37 +++++ caffe2/operators/length_split_op.h | 75 +++++++++ .../operator_test/length_split_op_test.py | 151 ++++++++++++++++++ 3 files changed, 263 insertions(+) create mode 100644 caffe2/operators/length_split_op.cc create mode 100644 caffe2/operators/length_split_op.h create mode 100644 caffe2/python/operator_test/length_split_op_test.py diff --git a/caffe2/operators/length_split_op.cc b/caffe2/operators/length_split_op.cc new file mode 100644 index 00000000000000..7c342d154491b1 --- /dev/null +++ b/caffe2/operators/length_split_op.cc @@ -0,0 +1,37 @@ +#include "caffe2/operators/length_split_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(LengthsSplit, LengthsSplitOp); + +OPERATOR_SCHEMA(LengthsSplit) + .NumInputs(1, 2) + .NumOutputs(1) + .ScalarType(TensorProto::INT32) + .SetDoc(R"DOC( +Given input vector LENGTHS, and input n_split, LengthsSplit returns +a single output vector. It "splits" each length into n_split values which add +up to the original length. It will attempt to do equal splits, and if not possible, +it orders larger values first. If the n_split is larger than the length, zero +padding will be applied. + +e.g. LENGTHS = [9 4 5] + n_split = 3 + Y = [3 3 3 2 1 1 2 2 1] + +e.g. 
LENGTHS = [2, 1, 2] + n_split = 3 + Y = [1 1 0 1 0 0 1 1 0] +)DOC") + .Arg("n_split", "Number of splits for each element in LENGTHS") + .Input(0, "LENGTHS", "Mx1 Input tensor denoting INT32 lengths") + .Input( + 1, + "n_split", + "(Optional) Number of splits for each element in LENGTHS (overrides argument)") + .Output(0, "Y", "(M*n_split)x1 Output vector denoting split lengths"); + +// TODO: Write gradient for this when needed +GRADIENT_NOT_IMPLEMENTED_YET(LengthsSplit); + +} // namespace caffe2 diff --git a/caffe2/operators/length_split_op.h b/caffe2/operators/length_split_op.h new file mode 100644 index 00000000000000..d8c98bf085c8a2 --- /dev/null +++ b/caffe2/operators/length_split_op.h @@ -0,0 +1,75 @@ +#ifndef CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ +#define CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ + +#include "caffe2/core/common_omp.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class LengthsSplitOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + LengthsSplitOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + n_split_(OperatorBase::GetSingleArgument("n_split", 0)) { + if (InputSize() == 1) { + // If not specified, then must have this argument + CAFFE_ENFORCE( + OperatorBase::HasArgument("n_split"), + "Argument `n_split` is missing and was not specified as input."); + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + } + } + ~LengthsSplitOp() {} + + bool RunOnDevice() override { + const auto& L = Input(0); + CAFFE_ENFORCE_EQ(L.ndim(), 1, "Input `LENGTHS` should be a 1D vector."); + + if (InputSize() > 1) { + // We potentially have n_split specified as inputs as well + CAFFE_ENFORCE( + Input(1).ndim() == 1 && Input(1).size() == 1, + "Input `n_split` should be a vector of size 1."); + + const auto& input1 = Input(1); + context_.template CopyItems( + input1.meta(), 1, input1.raw_data(), &n_split_); + } + + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + const auto M = L.size(); + + auto* Y = Output(0); + Y->Resize(M * n_split_); + + const int32_t* Ldata = L.template data(); + int32_t* Ydata = Y->template mutable_data(); + + for (int i = 0; i < M; i++) { + int32_t mod = Ldata[i] % n_split_; + int32_t res = + mod != 0 ? math::divUp(Ldata[i], n_split_) : Ldata[i] / n_split_ + 1; + for (int j = 0; j < n_split_; j++) { + Ydata[(i * n_split_) + j] = mod-- > 0 ? 
res : res - 1; + } + } + return true; + } + + private: + int32_t n_split_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py new file mode 100644 index 00000000000000..08ec2638886291 --- /dev/null +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -0,0 +1,151 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +class TestLengthSplitOperator(hu.HypothesisTestCase): + + def _length_split_op_ref(self, input_lengths, n_split_array): + output = [] + n_split = n_split_array[0] + for x in input_lengths: + mod = x % n_split + val = x // n_split + 1 + for _ in range(n_split): + if mod > 0: + output.append(val) + mod -= 1 + else: + output.append(val - 1) + return [np.array(output).astype(np.int32)] + + @given(**hu.gcs_cpu_only) + def test_length_split_edge(self, gc, dc): + input_lengths = np.array([3, 4, 5]).astype(np.int32) + n_split_ = np.array([5]).astype(np.int32) + # Expected output: + # [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + @given(**hu.gcs_cpu_only) + def test_length_split_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split = 3 + # Expected output: + # [3, 3, 3, 2, 1, 1, 2, 2, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths'], + ['Y'], n_split=n_split + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths], + reference=lambda x : self._length_split_op_ref(x, [n_split]), + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths], [0]) + + @given(**hu.gcs_cpu_only) + def test_length_split_override_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split_ignored = 2 + n_split_used = np.array([3]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], n_split=n_split_ignored + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_used], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_used], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_even_divide(self, m, n_split, gc, dc): + # multiples of n_split + input_lengths = np.random.randint(100, size=m).astype(np.int32) * n_split + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, 
[input_lengths, n_split_], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_random(self, m, n_split, gc, dc): + input_lengths = np.random.randint(100, size=m).astype(np.int32) + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + +if __name__ == "__main__": + import unittest + unittest.main() From 3ad67c60f078f62f3d67e7228cf7b8a1a0f6e3e8 Mon Sep 17 00:00:00 2001 From: James Reed Date: Mon, 10 Sep 2018 16:53:25 -0700 Subject: [PATCH 016/237] Traceable explicit Variable instantiation (#11463) Summary: There's a bunch of legacy code where people are explicitly instantiating Variable, and these call-sites have thus far been untraceable (appearing as prim::Constant nodes with the tensor value at the time of tracing). This makes it so that the new variable inherits the traced Value* from the tensor it's being constructed from Pull Request resolved: https://github.com/pytorch/pytorch/pull/11463 Differential Revision: D9756529 Pulled By: jamesr66a fbshipit-source-id: da99c6a7621957a305f2699ec9cb9def69b1b2d7 --- test/test_jit.py | 9 +++++++++ torch/csrc/autograd/python_legacy_variable.cpp | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/test/test_jit.py b/test/test_jit.py index 32ccf392f60b93..382daa03ec1614 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1676,6 +1676,15 @@ def forward(self, x, lengths, hiddens): torch.randn(5, 3, 10), torch.LongTensor([3, 3, 2]), torch.randn(2, 3, 20), torch.randn(2, 3, 20) self.assertEqual(traced(x, lengths, (h0, c0)), imported(x, lengths, (h0, c0))) + def test_trace_variable_instantiation(self): + def random_foo(x): + return Variable(Variable(x) + 1.0) + + random_foo_traced = torch.jit.trace(random_foo, (torch.rand(3, 4),)) + + x = torch.rand(5, 6) + self.assertEqual(random_foo(x), random_foo_traced(x)) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 339e58cde4e56c..3ba7ff94bc1fd7 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -6,6 +6,7 @@ #include "torch/csrc/autograd/python_function.h" #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/tensor/python_tensor.h" +#include "torch/csrc/jit/tracer.h" using namespace at; @@ -67,6 +68,12 @@ static PyObject *THPVariable_pynew(PyTypeObject* type, PyObject *args, PyObject var.set_name(name); } + if (jit::tracer::isTracing() && data && data != Py_None && THPVariable_Check(data)) { + if (auto *v = jit::tracer::getValueTrace(((THPVariable*)data)->cdata)) { + jit::tracer::setValueTrace(var, v); + } + } + return THPVariable_Wrap(std::move(var)); END_HANDLE_TH_ERRORS } From 3e49a694660711a20acb3dd4b5c41fc8679ee48b Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 10 Sep 2018 17:07:37 -0700 Subject: [PATCH 017/237] Resolve ambiguity when including both caffe2 and aten registries (#11411) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11411 Simple fix Reviewed By: goldsborough Differential Revision: D9730371 
fbshipit-source-id: f841327c01faa13cfb6b7fc6e279b8fc50fad1db
---
 caffe2/core/registry.h | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h
index c7f3a7af539d51..f353e8d6601fab 100644
--- a/caffe2/core/registry.h
+++ b/caffe2/core/registry.h
@@ -17,21 +17,13 @@
 #include 
 #include 
 
+#include 
+
 #include "caffe2/core/common.h"
 #include "caffe2/core/typeid.h"
 
 namespace caffe2 {
 
-template 
-inline void PrintOffendingKey(const KeyType& /*key*/) {
-  printf("[key type printing not supported]\n");
-}
-
-template <>
-inline void PrintOffendingKey(const string& key) {
-  printf("Offending key: %s.\n", key.c_str());
-}
-
 /**
  * @brief A template class that allows one to register classes by keys.
  *
@@ -59,7 +51,7 @@ class CAFFE2_API Registry {
     std::lock_guard lock(register_mutex_);
     if (registry_.count(key) != 0) {
       printf("Key already registered.\n");
-      PrintOffendingKey(key);
+      at::PrintOffendingKey(key);
       std::exit(1);
     }
     registry_[key] = creator;

From e1e69446f6c3419c9a587a692c571bf77f9cc047 Mon Sep 17 00:00:00 2001
From: Orion Reblitz-Richardson
Date: Mon, 10 Sep 2018 17:07:52 -0700
Subject: [PATCH 018/237] Lockdown NO_TEST=1 for tests even more (#11415)

Summary:
Skip the torch tests as well when the NO_TEST=1 environment variable is set.

Also remove the separate ATen code path for building without Caffe2, since ATen will always be built with Caffe2.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/11415

Reviewed By: soumith

Differential Revision: D9758179

Pulled By: orionr

fbshipit-source-id: e3e3327364fccdc57a703aeaad8c4f30452973fb
---
 aten/src/ATen/CMakeLists.txt | 34 ++++++++++++++++++----------------
 tools/build_libtorch.py      |  1 +
 torch/CMakeLists.txt         |  6 +++---
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index f6d296dfe79e45..d65211e2269d98 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -405,30 +405,32 @@ else()
 endif()
 
 if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
-  foreach(test_src ${ATen_CPU_TEST_SRCS})
-    get_filename_component(test_name ${test_src} NAME_WE)
-    add_executable(${test_name} "${test_src}")
-    target_include_directories(
-      ${test_name} PRIVATE $)
-    target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE})
-    target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE})
-    target_link_libraries(${test_name} ATen_cpu)
-    add_test(NAME ${test_name} COMMAND $)
-    install(TARGETS ${test_name} DESTINATION test)
-  endforeach()
-
-  if(USE_CUDA OR USE_ROCM)
-    foreach(test_src ${ATen_CUDA_TEST_SRCS})
+  if (BUILD_TEST)
+    foreach(test_src ${ATen_CPU_TEST_SRCS})
       get_filename_component(test_name ${test_src} NAME_WE)
-      torch_cuda_based_add_executable(${test_name} "${test_src}")
+      add_executable(${test_name} "${test_src}")
       target_include_directories(
         ${test_name} PRIVATE $)
       target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE})
       target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE})
-      target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda)
+      target_link_libraries(${test_name} ATen_cpu)
       add_test(NAME ${test_name} COMMAND $)
       install(TARGETS ${test_name} DESTINATION test)
     endforeach()
+
+    if(USE_CUDA OR USE_ROCM)
+      foreach(test_src ${ATen_CUDA_TEST_SRCS})
+        get_filename_component(test_name ${test_src} NAME_WE)
+        torch_cuda_based_add_executable(${test_name} "${test_src}")
+        
target_include_directories( + ${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) + target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION test) + endforeach() + endif() endif() # Make sure these don't get built by parent diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index 46b73687a12064..8428282ab7b9f8 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -12,6 +12,7 @@ args = parser.parse_args() os.environ['BUILD_TORCH'] = 'ON' + os.environ['BUILD_TEST'] = 'ON' os.environ['ONNX_NAMESPACE'] = 'onnx_torch' os.environ['PYTORCH_PYTHON'] = sys.executable diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index fb0b76e8aeb91f..614dbf3937742d 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -9,7 +9,7 @@ else() option(USE_CUDA "Use CUDA" ON) endif() -option(BUILD_TORCH_TEST "Build torch test binaries" ON) +option(BUILD_TEST "Build torch test binaries" ON) option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) # TODO: Unify with version from setup.py @@ -403,7 +403,7 @@ install(TARGETS torch ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") # JIT Tests. TODO: Put into test/cpp/jit folder -if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) +if (BUILD_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) @@ -416,7 +416,7 @@ if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() endif() -if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) +if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") add_executable(test_api From a17528277678b0d6dc76ffce1c4940c0b995abc2 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Mon, 10 Sep 2018 17:11:16 -0700 Subject: [PATCH 019/237] Flags for LMDB, LevelDB, and Caffe2 ops (#11462) Summary: Add flags for LMDB and LevelDB, default `OFF`. These can be enabled with ``` USE_LMDB=1 USE_LEVELDB=1 python setup.py build_deps ``` Also add a flag to build Caffe2 ops, which is default `ON`. 
Disable with ``` NO_CAFFE2_OPS=1 python setup.py build_deps ``` cc Yangqing soumith pjh5 mingzhe09088 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11462 Reviewed By: soumith Differential Revision: D9758156 Pulled By: orionr fbshipit-source-id: 95fd206d72fdf44df54fc5d0aeab598bff900c63 --- .jenkins/caffe2/build.sh | 2 +- CMakeLists.txt | 1 + caffe2/CMakeLists.txt | 6 ++++-- caffe2/contrib/aten/CMakeLists.txt | 2 +- caffe2/share/contrib/CMakeLists.txt | 2 +- cmake/Dependencies.cmake | 2 +- cmake/Summary.cmake | 1 + scripts/onnx/install-develop.sh | 2 +- scripts/onnx/install.sh | 2 +- setup.py | 18 ++++++++++++++++-- tools/build_pytorch_libs.bat | 3 +++ tools/build_pytorch_libs.sh | 3 +++ tools/setup_helpers/build.py | 3 +++ 13 files changed, 37 insertions(+), 10 deletions(-) diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index d9b2a2e096a1e5..ffcbbc136b50d5 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -226,7 +226,7 @@ else export MAX_JOBS=`expr $(nproc) - 1` fi - USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user + USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user # This is to save test binaries for testing cp -r torch/lib/tmp_install $INSTALL_PREFIX diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f56f96e87f3a..8dc6dc146ffb79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) +option(BUILD_CAFFE2_OPS "Build Caffe2 operators" ON) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." 
ON diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 32183e212c923c..8672f836d89c69 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -86,8 +86,10 @@ add_subdirectory(mobile) add_subdirectory(mpi) add_subdirectory(observers) add_subdirectory(onnx) -add_subdirectory(operators) -add_subdirectory(operators/rnn) +if (BUILD_CAFFE2_OPS) + add_subdirectory(operators) + add_subdirectory(operators/rnn) +endif() add_subdirectory(opt) add_subdirectory(perfkernels) add_subdirectory(python) diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt index 92eb671e019cb7..add3918d4c3373 100644 --- a/caffe2/contrib/aten/CMakeLists.txt +++ b/caffe2/contrib/aten/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT BUILD_ATEN_MOBILE) +if(NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) # Add source generated by Codegen.cmake and pass to parent list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc) list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc) diff --git a/caffe2/share/contrib/CMakeLists.txt b/caffe2/share/contrib/CMakeLists.txt index 01af2c0616dfce..0fc3a4186f0189 100644 --- a/caffe2/share/contrib/CMakeLists.txt +++ b/caffe2/share/contrib/CMakeLists.txt @@ -1,4 +1,4 @@ -if (USE_NNPACK) +if (USE_NNPACK AND BUILD_CAFFE2_OPS) add_subdirectory(nnpack) endif() if (USE_ZSTD) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 67b65c0dcba74a..fed64398276078 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -770,7 +770,7 @@ if (USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if (NOT BUILD_ATEN_MOBILE) +if (NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) list(APPEND Caffe2_DEPENDENCY_LIBS aten_op_header_gen) if (USE_CUDA) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 846f4d6154b8ba..16d18ac7634d0d 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -38,6 +38,7 @@ function (caffe2_print_configuration_summary) message(STATUS " Python includes : ${PYTHON_INCLUDE_DIRS}") message(STATUS " Python site-packages: ${PYTHON_SITE_PACKAGES}") endif() + message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") diff --git a/scripts/onnx/install-develop.sh b/scripts/onnx/install-develop.sh index 02888b11901aa6..24bb4bd5808a1c 100755 --- a/scripts/onnx/install-develop.sh +++ b/scripts/onnx/install-develop.sh @@ -15,4 +15,4 @@ pip install -e "$tp2_dir/onnx" # Install caffe2 and pytorch pip install -r "$top_dir/caffe2/requirements.txt" pip install -r "$top_dir/requirements.txt" -USE_OPENCV=1 BUILD_BINARY=1 python setup.py build_deps develop +USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py build_deps develop diff --git a/scripts/onnx/install.sh b/scripts/onnx/install.sh index 7d4a3139d2e002..29da7e17baa00a 100755 --- a/scripts/onnx/install.sh +++ b/scripts/onnx/install.sh @@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx" # Install caffe2 and pytorch pip install -r "$top_dir/caffe2/requirements.txt" pip install -r "$top_dir/requirements.txt" -USE_OPENCV=1 BUILD_BINARY=1 python setup.py install +USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install diff --git a/setup.py b/setup.py index 34d2e7e5402840..7edd89cbd7fcb6 100644 --- a/setup.py +++ b/setup.py @@ -46,12 +46,21 @@ # disables use of system-wide nccl (we will use our submoduled # copy in 
third_party/nccl) # +# NO_CAFFE2_OPS +# disable Caffe2 operators build +# # USE_GLOO_IBVERBS # toggle features related to distributed support # # USE_OPENCV # enables use of OpenCV for additional operators # +# USE_LEVELDB +# enables use of LevelDB for storage +# +# USE_LMBD +# enables use of LMDB for storage +# # BUILD_BINARY # enables the additional binaries/ build # @@ -144,11 +153,13 @@ def hotpatch_var(var, prefix='USE_'): list(map(hotpatch_var, use_env_vars)) # Also hotpatch a few with BUILD_* equivalent -build_env_vars = ['BINARY', 'TEST'] +build_env_vars = ['BINARY', 'TEST', 'CAFFE2_OPS'] [hotpatch_var(v, 'BUILD_') for v in build_env_vars] from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION -from tools.setup_helpers.build import BUILD_BINARY, BUILD_TEST, USE_OPENCV +from tools.setup_helpers.build import (BUILD_BINARY, BUILD_TEST, + BUILD_CAFFE2_OPS, USE_LEVELDB, + USE_LMDB, USE_OPENCV) from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) @@ -376,7 +387,10 @@ def build_libs(libs): my_env["BUILD_PYTHON"] = "ON" my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF" my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF" + my_env["BUILD_CAFFE2_OPS"] = "ON" if BUILD_CAFFE2_OPS else "OFF" my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF" + my_env["USE_LEVELDB"] = "ON" if USE_LEVELDB else "OFF" + my_env["USE_LMDB"] = "ON" if USE_LMDB else "OFF" my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF" try: diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 80d79dc4364260..34d7ea1ac0d206 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -180,10 +180,13 @@ goto:eof -DBUILD_BINARY=%BUILD_BINARY% ^ -DBUILD_TEST=OFF ^ -DINSTALL_TEST=%INSTALL_TEST% ^ + -DBUILD_CAFFE2_OPS=%BUILD_CAFFE2_OPS% ^ -DONNX_NAMESPACE=%ONNX_NAMESPACE% ^ -DUSE_CUDA=%USE_CUDA% ^ -DUSE_CUDNN=OFF ^ -DUSE_NNPACK=%USE_NNPACK% ^ + -DUSE_LEVELDB=%USE_LEVELDB% ^ + -DUSE_LMDB=%USE_LMDB% ^ -DUSE_OPENCV=%USE_OPENCV% ^ -DUSE_GLOG=OFF ^ -DUSE_GFLAGS=OFF ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index ba0bde0d577347..d09dc756d82d8e 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -263,11 +263,14 @@ function build_caffe2() { -DBUILD_BINARY=$BUILD_BINARY \ -DBUILD_TEST=$BUILD_TEST \ -DINSTALL_TEST=$INSTALL_TEST \ + -DBUILD_CAFFE2_OPS=$BUILD_CAFFE2_OPS \ -DONNX_NAMESPACE=$ONNX_NAMESPACE \ -DUSE_CUDA=$USE_CUDA \ -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ -DUSE_ROCM=$USE_ROCM \ -DUSE_NNPACK=$USE_NNPACK \ + -DUSE_LEVELDB=$USE_LEVELDB \ + -DUSE_LMDB=$USE_LMDB \ -DUSE_OPENCV=$USE_OPENCV \ -DUSE_GLOG=OFF \ -DUSE_GFLAGS=OFF \ diff --git a/tools/setup_helpers/build.py b/tools/setup_helpers/build.py index 82364bd7394c51..09e26cb5aa1b20 100644 --- a/tools/setup_helpers/build.py +++ b/tools/setup_helpers/build.py @@ -2,4 +2,7 @@ BUILD_BINARY = check_env_flag('BUILD_BINARY') BUILD_TEST = not check_negative_env_flag('BUILD_TEST') +BUILD_CAFFE2_OPS = not check_negative_env_flag('BUILD_CAFFE2_OPS') +USE_LEVELDB = check_env_flag('USE_LEVELDB') +USE_LMDB = check_env_flag('USE_LMDB') USE_OPENCV = check_env_flag('USE_OPENCV') From 9cfdf0d677b2d7f7db83b56fce42fdfd6e5f26c3 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 18:19:18 -0700 Subject: [PATCH 020/237] Document the Embedding module (#11469) Summary: ebetica soumith ezyang Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/11469 Differential Revision: D9757547 Pulled By: goldsborough fbshipit-source-id: a95673abe949bb81d716dbc03c5c3e2a11cc15d3 --- .../api/include/torch/nn/modules/embedding.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/torch/csrc/api/include/torch/nn/modules/embedding.h b/torch/csrc/api/include/torch/nn/modules/embedding.h index bc33f8df74f75c..c2a701eb3ede24 100644 --- a/torch/csrc/api/include/torch/nn/modules/embedding.h +++ b/torch/csrc/api/include/torch/nn/modules/embedding.h @@ -10,12 +10,16 @@ namespace torch { namespace nn { +/// Options for the `Embedding` module. struct EmbeddingOptions { EmbeddingOptions(int64_t count, int64_t dimension); + /// The number of embeddings (number of rows in the table). TORCH_ARG(int64_t, count); + /// The size of each embedding vector (number of columns in the table). TORCH_ARG(int64_t, dimension); }; +/// Performs a lookup in a fixed size embedding table. class EmbeddingImpl : public torch::nn::Cloneable { public: EmbeddingImpl(int64_t count, int64_t dimension) @@ -23,12 +27,23 @@ class EmbeddingImpl : public torch::nn::Cloneable { explicit EmbeddingImpl(EmbeddingOptions options); void reset() override; - Tensor forward(Tensor); + /// Performs a lookup on the embedding table stored in `weight` using the + /// `indices` supplied and returns the result. + Tensor forward(Tensor indices); + + /// The `Options` used to configure this `Embedding` module. + /// Changes to `EmbeddingOptions` *after construction* have no effect. EmbeddingOptions options; + + /// The embedding table. Tensor weight; }; +/// A `ModuleHolder` subclass for `EmbeddingImpl`. +/// See the documentation for `EmbeddingImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Embedding); } // namespace nn From dd8defeb3f02f6307021cf690438a912265a5437 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 19:41:46 -0700 Subject: [PATCH 021/237] Document the Functional module (#11460) Summary: Document the `Functional` module in the C++ API. ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11460 Differential Revision: D9757555 Pulled By: goldsborough fbshipit-source-id: 15f8bf6d60bd26f3f4e69fb8e414e186e3c220ee --- test/cpp/api/modules.cpp | 1 + .../api/include/torch/nn/modules/functional.h | 45 ++++++++++++++++++- torch/csrc/api/src/nn/modules/functional.cpp | 4 +- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 95f38d15a4c822..2da1e1c45bf6f6 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -226,6 +226,7 @@ TEST_CASE("modules") { REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); was_called = false; + // Use the call operator overload here. output = functional(torch::ones(5, torch::requires_grad())); REQUIRE(was_called); REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); diff --git a/torch/csrc/api/include/torch/nn/modules/functional.h b/torch/csrc/api/include/torch/nn/modules/functional.h index 4e234a8ad3fc80..44d11d1ffa6ee0 100644 --- a/torch/csrc/api/include/torch/nn/modules/functional.h +++ b/torch/csrc/api/include/torch/nn/modules/functional.h @@ -6,16 +6,51 @@ #include #include +#include namespace torch { namespace nn { -// Lets you create a container from a function, designed for use in -// Sequential. 
+/// Wraps a function in a `Module`. +/// +/// The `Functional` module allows wrapping an arbitrary function or function +/// object in an `nn::Module`. This is primarily handy for usage in +/// `Sequential`. +/// +/// \rst +/// .. code-block:: +/// Sequential sequential( +/// Linear(3, 4), +/// Functional(torch::relu), +/// BatchNorm(3), +/// Functional(torch::elu, /*alpha=*/1)); +/// \endrst +/// +/// While a `Functional` module only accepts a single `Tensor` as input, it is +/// possible for the the wrapped function to accept further arguments. However, +/// these have to be bound *at construction time*. For example, if +/// you want to wrap `torch::leaky_relu`, which accepts a `slope` scalar as its +/// second argument, with a particular value for its `slope` in a `Functional` +/// module, you could write `Functional(torch::leaky_relu, /*slope=*/0.5)`. The +/// value of `0.5` is then stored within the `Functional` object and supplied to +/// the function call at invocation time. Note that such bound values are +/// evaluated eagerly and stored a single time. See the documentation of +/// [`std::bind`](https://en.cppreference.com/w/cpp/utility/functional/bind) for +/// more information on the semantics of argument binding. +/// +/// \rst +/// .. attention:: +/// After passing any bound arguments, the function must accept a single +/// tensor and return a single tensor. +/// \endrst +/// +/// Note that `Functional` overloads the call operator (`operator()`) such that +/// you can invoke it with `my_func(...)`. class FunctionalImpl : public torch::nn::Cloneable { public: using Function = std::function; + /// Constructs a `Functional` from a function object. explicit FunctionalImpl(Function function); template < @@ -34,6 +69,8 @@ class FunctionalImpl : public torch::nn::Cloneable { } void reset() override; + + /// Forwards the `input` tensor to the underlying (bound) function object. Tensor forward(Tensor input); /// Calls forward(input). @@ -43,6 +80,10 @@ class FunctionalImpl : public torch::nn::Cloneable { Function function_; }; +/// A `ModuleHolder` subclass for `FunctionalImpl`. +/// See the documentation for `FunctionalImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Functional); } // namespace nn diff --git a/torch/csrc/api/src/nn/modules/functional.cpp b/torch/csrc/api/src/nn/modules/functional.cpp index 591634db23640e..1820e8f4df2384 100644 --- a/torch/csrc/api/src/nn/modules/functional.cpp +++ b/torch/csrc/api/src/nn/modules/functional.cpp @@ -13,11 +13,11 @@ FunctionalImpl::FunctionalImpl(Function function) void FunctionalImpl::reset() {} Tensor FunctionalImpl::forward(Tensor input) { - return function_(input); + return function_(std::move(input)); } Tensor FunctionalImpl::operator()(Tensor input) { - return forward(input); + return forward(std::move(input)); } } // namespace nn } // namespace torch From f9d12eeb270d74e76ac2599495b95e8c4f5cb69b Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 10 Sep 2018 20:36:49 -0700 Subject: [PATCH 022/237] Give copy an optional device argument. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11497 Differential Revision: D9762014 Pulled By: gchanan fbshipit-source-id: 996419cc5e86d000af953d030ff361adafb921ad --- aten/src/ATen/templates/TensorMethods.h | 5 ++--- aten/src/ATen/templates/Type.h | 2 +- aten/src/ATen/templates/TypeDefault.cpp | 7 +++++-- aten/src/ATen/templates/TypeDefault.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 07e5f2b634372c..5573f5802809e9 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -7,7 +7,6 @@ #include "ATen/core/SparseTensorRef.h" #include "ATen/Type.h" #include "ATen/core/TensorOptions.h" -#include "ATen/DeviceGuard.h" namespace at { @@ -56,8 +55,8 @@ inline Tensor to( AT_CHECK(tensor.is_variable() == options.is_variable(), "cannot change is_variable, from: ", tensor.is_variable(), " to: ", options.is_variable()); - DeviceGuard guard(options.device()); - return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()).copy(tensor, non_blocking); + return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()) + .copy(tensor, non_blocking, options.device()); } } // namespace detail diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 3a7080ea201e35..183b2d4d498026 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -99,7 +99,7 @@ struct AT_API Type { return backendToDeviceType(backend()); } - virtual Tensor copy(const Tensor & src, bool non_blocking=false) const = 0; + virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const = 0; virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 54c6c282dcb95d..4e9c454977d588 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -22,8 +22,11 @@ Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking return s_copy_(self, b_src, non_blocking); } -Tensor TypeDefault::copy(const Tensor & src, bool non_blocking) const { - // TODO(psag): have a DeviceGuard here +Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional to_device) const { + DeviceGuard device_guard; + if (to_device.has_value()) { + device_guard.set_index(to_device.value().index()); + } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); if (is_sparse()) { auto indices = src._indices(); diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index 64ec158f82349e..0f315e590d700c 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -25,7 +25,7 @@ struct AT_API TypeDefault : public Type { Type & toBackend(Backend b) const override; Type & toScalarType(ScalarType s) const override; - Tensor copy(const Tensor & src, bool non_blocking=false) const override; + Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const override; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const override; void backward(Tensor & self, at::optional gradient, bool 
keep_graph, bool create_graph) const override; From b14a80553d550fa5e2fe588ca50d885d643bce5e Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 10 Sep 2018 20:45:03 -0700 Subject: [PATCH 023/237] Ignore functional doc error Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11508 Differential Revision: D9764380 Pulled By: goldsborough fbshipit-source-id: 3abb9c04f46137be833ea26d67734741e14f8010 --- docs/cpp/check-doxygen.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cpp/check-doxygen.sh b/docs/cpp/check-doxygen.sh index 58e65b403e39bf..375347974ac749 100755 --- a/docs/cpp/check-doxygen.sh +++ b/docs/cpp/check-doxygen.sh @@ -20,6 +20,7 @@ ignore_warning "warning: no uniquely matching class member found for" ignore_warning "warning: source ../../build/aten/src/ is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Tensor.h is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Functions.h is not a readable file" +ignore_warning "warning: documented symbol \`torch::nn::FunctionalImpl::FunctionalImpl' was not declared or defined" # Count the number of remaining warnings. warnings=$(grep 'warning:' doxygen-log.txt | wc -l) From 0988bbad2de5e0ce403c5e6f781437b24a484fc2 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Mon, 10 Sep 2018 23:21:36 -0700 Subject: [PATCH 024/237] C10d release to torch.distributed for PT1 (#11405) Summary: The old `torch.distributed` will go to `torch.distributed.deprecated` The old DDP will go to `torch.nn.parallel.deprecated` Now `torch.nn.parallel.DDP` will use c10d DDP Now `torch.distributed` will use C10d frontend API Pull Request resolved: https://github.com/pytorch/pytorch/pull/11405 Reviewed By: pietern Differential Revision: D9733733 Pulled By: teng-li fbshipit-source-id: d6a3f3e73f8d3a7fcb1f4baef53c78063b8cbb08 --- test/run_test.py | 8 +- test/test_c10d.py | 8 +- test/test_distributed.py | 6 +- test/test_thd_distributed.py | 6 +- torch/csrc/distributed/Module.cpp | 4 +- torch/csrc/distributed/c10d/init.cpp | 2 +- torch/distributed/__init__.py | 563 +---------------- torch/distributed/c10d/__init__.py | 13 - torch/distributed/deprecated/__init__.py | 566 ++++++++++++++++++ .../{ => deprecated}/remote_types.py | 2 +- .../{c10d => }/distributed_c10d.py | 0 torch/distributed/{c10d => }/rendezvous.py | 0 torch/nn/parallel/__init__.py | 3 +- torch/nn/parallel/deprecated/__init__.py | 4 + torch/nn/parallel/deprecated/distributed.py | 482 +++++++++++++++ .../distributed_cpu.py} | 17 +- torch/nn/parallel/distributed.py | 443 ++++++-------- torch/nn/parallel/distributed_c10d.py | 366 ----------- torch/nn/parallel/distributed_cpu.py | 5 +- torch/utils/data/distributed.py | 10 +- 20 files changed, 1265 insertions(+), 1243 deletions(-) delete mode 100644 torch/distributed/c10d/__init__.py create mode 100644 torch/distributed/deprecated/__init__.py rename torch/distributed/{ => deprecated}/remote_types.py (96%) rename torch/distributed/{c10d => }/distributed_c10d.py (100%) rename torch/distributed/{c10d => }/rendezvous.py (100%) create mode 100644 torch/nn/parallel/deprecated/__init__.py create mode 100644 torch/nn/parallel/deprecated/distributed.py rename torch/nn/parallel/{distributed_c10d_cpu.py => deprecated/distributed_cpu.py} (88%) delete mode 100644 torch/nn/parallel/distributed_c10d.py diff --git a/test/run_test.py b/test/run_test.py index 40095c9736095c..d7af8e47ab8767 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,7 +14,7 @@ import torch from torch.utils 
import cpp_extension from common import TEST_WITH_ROCM -import torch.distributed.c10d as c10d +import torch.distributed as dist TESTS = [ 'autograd', @@ -60,12 +60,12 @@ } -if c10d.is_available(): - if c10d.is_mpi_available(): +if dist.is_available(): + if dist.is_mpi_available(): DISTRIBUTED_TESTS_CONFIG['mpi'] = { 'WORLD_SIZE': '3' } - if c10d.is_nccl_available(): + if dist.is_nccl_available(): DISTRIBUTED_TESTS_CONFIG['nccl'] = { 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' } diff --git a/test/test_c10d.py b/test/test_c10d.py index 4e804ee1635ac4..64bedb3183a212 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -14,8 +14,8 @@ import common from torch import nn import torch.nn.functional as F -from torch.distributed import c10d -from torch.nn.parallel import distributed_c10d +import torch.distributed as c10d +from torch.nn.parallel import DistributedDataParallel from common import TestCase @@ -570,7 +570,7 @@ def world_size(self): def _test_ddp_with_process_group(self, process_group): gpus = gpus_for_rank(self.world_size)[self.rank] model = Net() - ddp_model = distributed_c10d._DistributedDataParallelC10d( + ddp_model = DistributedDataParallel( copy.deepcopy(model).cuda(gpus[0]), device_ids=gpus, process_group=process_group) @@ -731,7 +731,7 @@ def test_fp16(self): gpus = gpus_for_rank(self.world_size)[self.rank] model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half() nn.init.constant_(model.weight, 1) - ddp_model = distributed_c10d._DistributedDataParallelC10d( + ddp_model = DistributedDataParallel( model, device_ids=[gpus[0]], process_group=process_group, diff --git a/test/test_distributed.py b/test/test_distributed.py index c37eac2689e4b9..41fa5cdac0c827 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -12,7 +12,7 @@ import torch import torch.cuda -import torch.distributed.c10d as dist +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -1149,7 +1149,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel._DistributedDataParallelC10d( + model_DDP = nn.parallel.DistributedDataParallel( model_DDP, device_ids=gpu_subset ) @@ -1183,7 +1183,7 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel._DistributedDataParallelC10dCPU(model_DDP) + model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) # dummy data initialization local_bs = 2 diff --git a/test/test_thd_distributed.py b/test/test_thd_distributed.py index 47dbe9d056f154..a104baeb3fb451 100644 --- a/test/test_thd_distributed.py +++ b/test/test_thd_distributed.py @@ -11,7 +11,7 @@ import torch import torch.cuda -import torch.distributed as dist +import torch.distributed.deprecated as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -985,7 +985,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel.DistributedDataParallel( + model_DDP = nn.parallel.deprecated.DistributedDataParallel( model_DDP, device_ids=gpu_subset ) @@ -1019,7 +1019,7 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + model_DDP = nn.parallel.deprecated.DistributedDataParallelCPU(model_DDP) # dummy data 
initialization local_bs = 2 diff --git a/torch/csrc/distributed/Module.cpp b/torch/csrc/distributed/Module.cpp index 9bf833e0b7a570..84d655288fe373 100644 --- a/torch/csrc/distributed/Module.cpp +++ b/torch/csrc/distributed/Module.cpp @@ -121,7 +121,7 @@ static THDReduceOp _getReduceOp(PyObject *obj) auto it = obj2reduceop.find(obj); if (it == obj2reduceop.end()) { throw std::runtime_error("op should be a constant from " - "torch.distributed.reduce_op"); + "torch.distributed.deprecated.reduce_op"); } return it->second; } @@ -132,7 +132,7 @@ static THDGroup _getGroup(PyObject *obj) if (it == obj2group.end()) { if (!THPUtils_checkLong(obj)) throw std::runtime_error("group should be an int or one of the values " - "from torch.distributed.group"); + "from torch.distributed.deprecated.group"); return THPUtils_unpackLong(obj); } return it->second; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 021e5e01247d15..bfe381baf62f1f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -34,7 +34,7 @@ using shared_ptr_class_ = py::class_>; PyObject* c10d_init(PyObject* _unused) { auto c10d_module = - THPObjectPtr(PyImport_ImportModule("torch.distributed.c10d")); + THPObjectPtr(PyImport_ImportModule("torch.distributed")); if (!c10d_module) { throw python_error(); } diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index e717ae724c66a3..5356097743aa3c 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -1,566 +1,13 @@ -""" -torch.distributed provides an MPI-like interface for exchanging tensor -data across multi-machine networks. It supports a few different backends -and initialization methods. -""" import torch -import atexit -import warnings -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - - -class dist_backend: - UNDEFINED = -1 - TCP = 0 - MPI = 1 - GLOO = 2 - NCCL = 3 - - -_INITIALIZED_PG = 1 -_INITIALIZED_MW = 2 -_initialized = 0 -_backend = dist_backend.UNDEFINED -_scope = locals() - - -def _extend_scope(module): - _scope.update({k: getattr(module, k) for k in dir(module) if not k.startswith('_')}) def is_available(): - return torch._C._has_distributed() - - -def destroy_process_group(): - r"""Destroy the initialized distributed package - """ - global _backend - global _initialized - torch._C._dist_destroy_process_group() - _backend = dist_backend.UNDEFINED - _initialized = 0 - - -def is_initialized(): - r"""Checking if the process group has been initialized - """ - return _initialized == _INITIALIZED_PG - - -def init_process_group(backend, init_method='env://', **kwargs): - r"""Initializes the distributed package. - - Arguments: - backend (str): Name of the backend to use. Depending on build-time configuration - valid values include: ``tcp``, ``mpi``, ``gloo`` and ``nccl``. - init_method (str, optional): URL specifying how to initialize the package. - world_size (int, optional): Number of processes participating in the job. - rank (int, optional): Rank of the current process. - group_name (str, optional): Group name. See description of init methods. - - To enable ``backend == mpi``, PyTorch needs to built from source on a system that - supports MPI. If you want to use Open MPI with CUDA-aware support, please use - Open MPI major version 2 and above. - - .. note:: - This method initializes CUDA context. 
Therefore, if multiple processes - run on a single machine but use different GPUs, make sure to use - :func:`torch.cuda.set_device` before this method to avoid unnecessarily - creating context on the first visible device. - - """ - world_size = kwargs.pop('world_size', -1) - group_name = kwargs.pop('group_name', '') - rank = kwargs.pop('rank', -1) - assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) - - if not is_available(): - raise RuntimeError("PyTorch built without distributed support") - - global _initialized - if _initialized: - raise RuntimeError("trying to initialize torch.distributed twice!") - - # Checking and assigning the distributed backend - global _backend - - backend = backend.lower() - if backend == "tcp": - _backend = dist_backend.TCP - elif backend == "mpi": - _backend = dist_backend.MPI - elif backend == "gloo": - _backend = dist_backend.GLOO - elif backend == "nccl": - _backend = dist_backend.NCCL - else: - raise RuntimeError("Invalid distributed backend name: " + backend) - - torch._C._dist_init_process_group(backend, init_method, world_size, - group_name, rank) - _initialized = _INITIALIZED_PG - - if _backend == dist_backend.NCCL: - atexit.register(destroy_process_group) - - if not torch._C._dist_init_extension(False, reduce_op, group): - raise RuntimeError("distributed module initialization failed") - - -def init_master_worker(backend, init_method='env://', **kwargs): - warnings.warn(""" - ================================================================================ - WARNING - ================================================================================ - Master-worker mode is still experimental. The API will change without - notice and we do not guarantee full correctness and expected performance yet. - We'll announce it once it's ready. - """) - world_size = kwargs.pop('world_size', -1) - group_name = kwargs.pop('group_name', '') - rank = kwargs.pop('rank', -1) - assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) - - if not is_available(): - raise RuntimeError("PyTorch built without distributed support") - - global _initialized - if _initialized: - raise RuntimeError("trying to initialize torch.distributed twice!") - torch._C._dist_init_master_worker(backend, init_method, world_size, - group_name, rank) - _initialized = _INITIALIZED_MW - import torch.distributed.collectives as collectives - import torch.distributed.remote_types as remote_types - _extend_scope(collectives) - _extend_scope(remote_types) - if not torch._C._dist_init_extension(True, reduce_op, group): - raise RuntimeError("distributed module initialization failed") - - -class reduce_op(object): - SUM = object() - PRODUCT = object() - MAX = object() - MIN = object() - - -class group(object): - WORLD = object() - - -class _DistributedRequest(object): - def __init__(self, request): - self.request = request - - def is_completed(self): - return torch._C._dist_request_is_completed(self.request) - - def wait(self): - torch._C._dist_request_wait(self.request) - - -def get_rank(): - r"""Returns the rank of current process. - - Rank is a unique identifier assigned to each process within a distributed - group. They are always consecutive integers ranging from ``0`` to - ``world_size - 1`` (inclusive). 
- """ - assert torch.distributed._initialized - return torch._C._dist_get_rank() - - -def get_world_size(): - r"""Returns the number of processes in the distributed group.""" - assert torch.distributed._initialized - return torch._C._dist_get_num_processes() - - -def isend(tensor, dst): - r"""Sends a tensor asynchronously. - - Arguments: - tensor (Tensor): Tensor to send. - dst (int): Destination rank. - - Returns: - A distributed request object. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return _DistributedRequest(torch._C._dist_isend(tensor, dst)) - - -def irecv(tensor, src): - r"""Receives a tensor asynchronously. - - Arguments: - tensor (Tensor): Tensor to fill with received data. - src (int): Source rank. - - Returns: - A distributed request object. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return _DistributedRequest(torch._C._dist_irecv(tensor, src)) - - -def send(tensor, dst): - r"""Sends a tensor synchronously. - - Arguments: - tensor (Tensor): Tensor to send. - dst (int): Destination rank. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_send(tensor, dst) - - -def recv(tensor, src=None): - r"""Receives a tensor synchronously. - - Arguments: - tensor (Tensor): Tensor to fill with received data. - src (int, optional): Source rank. Will receive from any - process if unspecified. - - Returns: - Sender rank. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - if src is None: - return torch._C._dist_recv_any_source(tensor) - return torch._C._dist_recv(tensor, src) - - -def broadcast_multigpu(tensor_list, src, group=group.WORLD): - r"""Broadcasts the tensor to the whole group with multiple GPU tensors - per node. - - :attr:`tensor` must have the same number of elements in all the GPUs from - all processes participating in the collective. each tensor in the list must - be on a different GPU. - - .. note:: - Only NCCL backend is currently supported. :attr:`tensor_list` should only - contain GPU tensors. - - Arguments: - tensor_list (List[Tensor]): Tensors that participate in the collective - operation. if ``src`` is the rank, then the first element of - ``tensor_list`` (``tensor_list[0]``) will be broadcasted to all - other tensors (on different GPUs) in the src process and all tensors - in ``tensor_list`` of other non-src processes. You also need to make - sure that ``len(tensor_list)`` is the same for all the distributed - processes calling this function. - - src (int): Source rank. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - return torch._C._dist_broadcast_multigpu(tensor_list, src, group) - - -def broadcast(tensor, src, group=group.WORLD): - r"""Broadcasts the tensor to the whole group. - - :attr:`tensor` must have the same number of elements in all processes - participating in the collective. - - Arguments: - tensor (Tensor): Data to be sent if :attr:`src` is the rank of - current process, and tensor to be used to save received data - otherwise. - src (int): Source rank. - group (optional): Group of the collective. 
- """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_broadcast(tensor, src, group) - - -def all_reduce_multigpu(tensor_list, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data across all machines in such a way that all get - the final result. This function reduces a number of tensors on every node, - while each tensor resides on a different GPU. - Therefore, the input tensor in the tensor list needs to be GPU tensors. - Also, each tensor in the tensor list needs to reside on a different GPU. - - After the call, all tensors in :attr:`tensor_list` will be bitwise identical - in all processes. - - .. note:: - Only NCCL backend is currently supported. :attr:`tensor_list` should only - contain GPU tensors. - - Arguments: - tensor_list (List[Tensor]): List of input and output tensors of - the collective. The function operates in-place and requires that - each tensor to be a GPU tensor on different GPUs. - You also need to make sure that ``len(tensor_list)`` is the same for - all the distributed processes calling this function. - - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - return torch._C._dist_all_reduce_multigpu(tensor_list, op, group) - - -def all_reduce(tensor, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data across all machines in such a way that all get - the final result. - - After the call :attr:`tensor` will be bitwise identical in all processes. - - Arguments: - tensor (Tensor): Input and output of the collective. The function - operates in-place. - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_all_reduce(tensor, op, group) - - -def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data on multiple GPUs across all machines. Each tensor - in :attr`tensor_list` should reside on a separate GPU. - - Only the GPU of ``tensor_list[0]`` on the process with rank :attr:`dst` is - going to receive the final result. - - .. note:: - Only NCCL backend is currently supported. :attr:`tensor_list` should only - contain GPU tensors. - - Arguments: - tensor_list (List[Tensor]): Input and output GPU tensors of the - collective. The function operates in-place. - You also need to make sure that ``len(tensor_list)`` is the same for - all the distributed processes calling this function. - - dst (int): Destination rank - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - return torch._C._dist_reduce_multigpu(tensor_list, dst, op, group) - - -def reduce(tensor, dst, op=reduce_op.SUM, group=group.WORLD): - r"""Reduces the tensor data across all machines. - - Only the process with rank :attr:`dst` is going to receive the final result. 
- - Arguments: - tensor (Tensor): Input and output of the collective. The function - operates in-place. - dst (int): Destination rank - op (optional): One of the values from ``torch.distributed.reduce_op`` - enum. Specifies an operation used for element-wise reductions. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_reduce(tensor, dst, op, group) - - -def all_gather_multigpu(output_tensor_lists, - input_tensor_list, - group=group.WORLD): - r"""Gathers tensors from the whole group in a list. - Each tensor in :attr:`input_tensor_list` should reside on a separate GPU. - - .. note:: - Only NCCL backend is currently supported. :attr:`output_tensor_lists` and - :attr:`input_tensor_list` should only contain GPU tensors. - - Arguments: - output_tensor_lists (List[List[Tensor]]): Output lists. It should - contain correctly-sized tensors on each GPU to be used for output of - the collective. - e.g. ``output_tensor_lists[i]`` contains the all_gather - result that resides on the GPU of ``input_tensor_list[i]``. - Note that each element of ``output_tensor_lists[i]`` has the size of - ``world_size * len(input_tensor_list)``, since the function all - gathers the result from every single GPU in the group. To interpret - each element of ``output_tensor_list[i]``, note that - ``input_tensor_list[j]`` of rank k will be appear in - ``output_tensor_list[i][rank * world_size + j]`` - Also note that ``len(output_tensor_lists)``, and the size of each - element in ``output_tensor_lists`` (each element is a list, - therefore ``len(output_tensor_lists[i])``) need to be the same - for all the distributed processes calling this function. - - input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to - be broadcast from current process. - Note that ``len(input_tensor_list)`` needs to be the same for - all the distributed processes calling this function. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - - flatten_tensor_list = [] - for output_tensor_list in output_tensor_lists: - flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list)) - - ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list, - input_tensor_list, - group) - - for output_tensor_list, flatten_tensor in zip(output_tensor_lists, - flatten_tensor_list): - for tensor, value in zip(output_tensor_list, - _unflatten_dense_tensors(flatten_tensor, - output_tensor_list)): - tensor.copy_(value) - - return ret - - -def all_gather(tensor_list, tensor, group=group.WORLD): - r"""Gathers tensors from the whole group in a list. - - Arguments: - tensor_list (list[Tensor]): Output list. It should contain - correctly-sized tensors to be used for output of the collective. - tensor (Tensor): Tensor to be broadcast from current process. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - if _backend != dist_backend.NCCL: - return torch._C._dist_all_gather(tensor_list, tensor, group) - else: - return all_gather_multigpu([tensor_list], [tensor], group) - - -def gather(tensor, **kwargs): - r"""Gathers a list of tensors in a single process. - - Arguments: - tensor (Tensor): Input tensor. - dst (int): Destination rank. 
Required in all processes except the one that - is receiveing the data. - gather_list (list[Tensor]): List of appropriately-sized tensors to - use for received data. Required only in the receiving process. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - my_rank = get_rank() - dst = kwargs.pop('dst', my_rank) - gather_list = kwargs.pop('gather_list', None) - _group = kwargs.pop('group', group.WORLD) - if kwargs: - raise RuntimeError("got unexpected kwargs") - if dst == my_rank: - if gather_list is None: - raise RuntimeError("gather_list is a required argument in gather destination") - return torch._C._dist_gather_recv(gather_list, tensor, _group) - else: - if gather_list: - raise RuntimeError("non-empty gather_list can be given only to gather destination") - return torch._C._dist_gather_send(tensor, dst, _group) - - -def scatter(tensor, **kwargs): - r"""Scatters a list of tensors to all processes in a group. - - Each process will receive exactly one tensor and store its data in the - :attr:`tensor` argument. - - Arguments: - tensor (Tensor): Output tensor. - src (int): Source rank. Required in all processes except the one that - is sending the data. - scatter_list (list[Tensor]): List of tensors to scatter. Required only - in the process that is sending the data. - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - my_rank = get_rank() - src = kwargs.pop('src', my_rank) - scatter_list = kwargs.pop('scatter_list', None) - _group = kwargs.pop('group', group.WORLD) - if kwargs: - raise RuntimeError("got unexpected kwargs: {}".format(", ".join(kwargs.keys()))) - if src == my_rank: - if scatter_list is None: - raise RuntimeError("scatter_list is a required argument in scatter source") - return torch._C._dist_scatter_send(scatter_list, tensor, _group) - else: - if scatter_list: - raise RuntimeError("non-empty can be given only to scatter source") - return torch._C._dist_scatter_recv(tensor, src, _group) - - -def barrier(group=group.WORLD): - r"""Synchronizes all processes. - - This collective blocks processes until the whole group enters this function. - - Arguments: - group (optional): Group of the collective. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - return torch._C._dist_barrier(group) - - -def new_group(ranks=None): - r"""Creates a new distributed group. - - This function requires that all processes in the main group (i.e., all - processes that are part of the distributed job) enter this function, even - if they are not going to be members of the group. Additionally, groups - should be created in the same order in all processes. - - Arguments: - ranks (list[int]): List of ranks of group members. - - Returns: - A handle of distributed group that can be given to collective calls. - """ - assert torch.distributed._initialized == _INITIALIZED_PG, \ - "collective only supported in process-group mode" - if ranks is None: - ranks = list(range(get_world_size())) - return torch._C._dist_new_group(ranks) - - -def _clear_group_cache(group=group.WORLD): - r"""Clear the created distributed group's cached resource. - - Only NCCL backend is currently supported. + return hasattr(torch._C, "_c10d_init") - Cached resource includes NCCL communicators and CUDA events. 
- Arguments: - group (optional): Group of the collective. - """ - return torch._C._dist_clear_group_cache(group) +if is_available() and not torch._C._c10d_init(): + raise RuntimeError("Failed to initialize PyTorch distributed support") -def _register_stream(stream): - if not _initialized: - raise RuntimeError("torch.distributed needs to be initialized first") - return torch._C._dist_register_stream(stream) +if is_available(): + from .distributed_c10d import * diff --git a/torch/distributed/c10d/__init__.py b/torch/distributed/c10d/__init__.py deleted file mode 100644 index 5356097743aa3c..00000000000000 --- a/torch/distributed/c10d/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -import torch - - -def is_available(): - return hasattr(torch._C, "_c10d_init") - - -if is_available() and not torch._C._c10d_init(): - raise RuntimeError("Failed to initialize PyTorch distributed support") - - -if is_available(): - from .distributed_c10d import * diff --git a/torch/distributed/deprecated/__init__.py b/torch/distributed/deprecated/__init__.py new file mode 100644 index 00000000000000..151439aa332bc0 --- /dev/null +++ b/torch/distributed/deprecated/__init__.py @@ -0,0 +1,566 @@ +""" +torch.distributed.deprecated provides an MPI-like interface for exchanging tensor +data across multi-machine networks. It supports a few different backends +and initialization methods. +""" +import torch +import atexit +import warnings +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +class dist_backend: + UNDEFINED = -1 + TCP = 0 + MPI = 1 + GLOO = 2 + NCCL = 3 + + +_INITIALIZED_PG = 1 +_INITIALIZED_MW = 2 +_initialized = 0 +_backend = dist_backend.UNDEFINED +_scope = locals() + + +def _extend_scope(module): + _scope.update({k: getattr(module, k) for k in dir(module) if not k.startswith('_')}) + + +def is_available(): + return torch._C._has_distributed() + + +def destroy_process_group(): + r"""Destroy the initialized distributed package + """ + global _backend + global _initialized + torch._C._dist_destroy_process_group() + _backend = dist_backend.UNDEFINED + _initialized = 0 + + +def is_initialized(): + r"""Checking if the process group has been initialized + """ + return _initialized == _INITIALIZED_PG + + +def init_process_group(backend, init_method='env://', **kwargs): + r"""Initializes the distributed package. + + Arguments: + backend (str): Name of the backend to use. Depending on build-time configuration + valid values include: ``tcp``, ``mpi``, ``gloo`` and ``nccl``. + init_method (str, optional): URL specifying how to initialize the package. + world_size (int, optional): Number of processes participating in the job. + rank (int, optional): Rank of the current process. + group_name (str, optional): Group name. See description of init methods. + + To enable ``backend == mpi``, PyTorch needs to built from source on a system that + supports MPI. If you want to use Open MPI with CUDA-aware support, please use + Open MPI major version 2 and above. + + .. note:: + This method initializes CUDA context. Therefore, if multiple processes + run on a single machine but use different GPUs, make sure to use + :func:`torch.cuda.set_device` before this method to avoid unnecessarily + creating context on the first visible device. 
+ + """ + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if not is_available(): + raise RuntimeError("PyTorch built without distributed support") + + global _initialized + if _initialized: + raise RuntimeError("trying to initialize torch.distributed.deprecated twice!") + + # Checking and assigning the distributed backend + global _backend + + backend = backend.lower() + if backend == "tcp": + _backend = dist_backend.TCP + elif backend == "mpi": + _backend = dist_backend.MPI + elif backend == "gloo": + _backend = dist_backend.GLOO + elif backend == "nccl": + _backend = dist_backend.NCCL + else: + raise RuntimeError("Invalid distributed backend name: " + backend) + + torch._C._dist_init_process_group(backend, init_method, world_size, + group_name, rank) + _initialized = _INITIALIZED_PG + + if _backend == dist_backend.NCCL: + atexit.register(destroy_process_group) + + if not torch._C._dist_init_extension(False, reduce_op, group): + raise RuntimeError("distributed module initialization failed") + + +def init_master_worker(backend, init_method='env://', **kwargs): + warnings.warn(""" + ================================================================================ + WARNING + ================================================================================ + Master-worker mode is still experimental. The API will change without + notice and we do not guarantee full correctness and expected performance yet. + We'll announce it once it's ready. + """) + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if not is_available(): + raise RuntimeError("PyTorch built without distributed support") + + global _initialized + if _initialized: + raise RuntimeError("trying to initialize torch.distributed.deprecated twice!") + torch._C._dist_init_master_worker(backend, init_method, world_size, + group_name, rank) + _initialized = _INITIALIZED_MW + import torch.distributed.deprecated.collectives as collectives + import torch.distributed.deprecated.remote_types as remote_types + _extend_scope(collectives) + _extend_scope(remote_types) + if not torch._C._dist_init_extension(True, reduce_op, group): + raise RuntimeError("distributed module initialization failed") + + +class reduce_op(object): + SUM = object() + PRODUCT = object() + MAX = object() + MIN = object() + + +class group(object): + WORLD = object() + + +class _DistributedRequest(object): + def __init__(self, request): + self.request = request + + def is_completed(self): + return torch._C._dist_request_is_completed(self.request) + + def wait(self): + torch._C._dist_request_wait(self.request) + + +def get_rank(): + r"""Returns the rank of current process. + + Rank is a unique identifier assigned to each process within a distributed + group. They are always consecutive integers ranging from ``0`` to + ``world_size - 1`` (inclusive). + """ + assert torch.distributed.deprecated._initialized + return torch._C._dist_get_rank() + + +def get_world_size(): + r"""Returns the number of processes in the distributed group.""" + assert torch.distributed.deprecated._initialized + return torch._C._dist_get_num_processes() + + +def isend(tensor, dst): + r"""Sends a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to send. 
+ dst (int): Destination rank. + + Returns: + A distributed request object. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return _DistributedRequest(torch._C._dist_isend(tensor, dst)) + + +def irecv(tensor, src): + r"""Receives a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int): Source rank. + + Returns: + A distributed request object. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return _DistributedRequest(torch._C._dist_irecv(tensor, src)) + + +def send(tensor, dst): + r"""Sends a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_send(tensor, dst) + + +def recv(tensor, src=None): + r"""Receives a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int, optional): Source rank. Will receive from any + process if unspecified. + + Returns: + Sender rank. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + if src is None: + return torch._C._dist_recv_any_source(tensor) + return torch._C._dist_recv(tensor, src) + + +def broadcast_multigpu(tensor_list, src, group=group.WORLD): + r"""Broadcasts the tensor to the whole group with multiple GPU tensors + per node. + + :attr:`tensor` must have the same number of elements in all the GPUs from + all processes participating in the collective. each tensor in the list must + be on a different GPU. + + .. note:: + Only NCCL backend is currently supported. :attr:`tensor_list` should only + contain GPU tensors. + + Arguments: + tensor_list (List[Tensor]): Tensors that participate in the collective + operation. if ``src`` is the rank, then the first element of + ``tensor_list`` (``tensor_list[0]``) will be broadcasted to all + other tensors (on different GPUs) in the src process and all tensors + in ``tensor_list`` of other non-src processes. You also need to make + sure that ``len(tensor_list)`` is the same for all the distributed + processes calling this function. + + src (int): Source rank. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + + return torch._C._dist_broadcast_multigpu(tensor_list, src, group) + + +def broadcast(tensor, src, group=group.WORLD): + r"""Broadcasts the tensor to the whole group. + + :attr:`tensor` must have the same number of elements in all processes + participating in the collective. + + Arguments: + tensor (Tensor): Data to be sent if :attr:`src` is the rank of + current process, and tensor to be used to save received data + otherwise. + src (int): Source rank. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_broadcast(tensor, src, group) + + +def all_reduce_multigpu(tensor_list, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data across all machines in such a way that all get + the final result. 
This function reduces a number of tensors on every node, + while each tensor resides on a different GPU. + Therefore, the input tensor in the tensor list needs to be GPU tensors. + Also, each tensor in the tensor list needs to reside on a different GPU. + + After the call, all tensors in :attr:`tensor_list` will be bitwise identical + in all processes. + + .. note:: + Only NCCL backend is currently supported. :attr:`tensor_list` should only + contain GPU tensors. + + Arguments: + tensor_list (List[Tensor]): List of input and output tensors of + the collective. The function operates in-place and requires that + each tensor to be a GPU tensor on different GPUs. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + + return torch._C._dist_all_reduce_multigpu(tensor_list, op, group) + + +def all_reduce(tensor, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data across all machines in such a way that all get + the final result. + + After the call :attr:`tensor` will be bitwise identical in all processes. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_all_reduce(tensor, op, group) + + +def reduce_multigpu(tensor_list, dst, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data on multiple GPUs across all machines. Each tensor + in :attr`tensor_list` should reside on a separate GPU. + + Only the GPU of ``tensor_list[0]`` on the process with rank :attr:`dst` is + going to receive the final result. + + .. note:: + Only NCCL backend is currently supported. :attr:`tensor_list` should only + contain GPU tensors. + + Arguments: + tensor_list (List[Tensor]): Input and output GPU tensors of the + collective. The function operates in-place. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + + dst (int): Destination rank + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + + return torch._C._dist_reduce_multigpu(tensor_list, dst, op, group) + + +def reduce(tensor, dst, op=reduce_op.SUM, group=group.WORLD): + r"""Reduces the tensor data across all machines. + + Only the process with rank :attr:`dst` is going to receive the final result. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + dst (int): Destination rank + op (optional): One of the values from ``torch.distributed.deprecated.reduce_op`` + enum. Specifies an operation used for element-wise reductions. 
+        group (optional): Group of the collective.
+    """
+    assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \
+        "collective only supported in process-group mode"
+    return torch._C._dist_reduce(tensor, dst, op, group)
+
+
+def all_gather_multigpu(output_tensor_lists,
+                        input_tensor_list,
+                        group=group.WORLD):
+    r"""Gathers tensors from the whole group in a list.
+    Each tensor in :attr:`input_tensor_list` should reside on a separate GPU.
+
+    .. note::
+        Only NCCL backend is currently supported. :attr:`output_tensor_lists` and
+        :attr:`input_tensor_list` should only contain GPU tensors.
+
+    Arguments:
+        output_tensor_lists (List[List[Tensor]]): Output lists. It should
+            contain correctly-sized tensors on each GPU to be used for output of
+            the collective.
+            e.g. ``output_tensor_lists[i]`` contains the all_gather
+            result that resides on the GPU of ``input_tensor_list[i]``.
+            Note that each element of ``output_tensor_lists[i]`` has the size of
+            ``world_size * len(input_tensor_list)``, since the function all
+            gathers the result from every single GPU in the group. To interpret
+            each element of ``output_tensor_lists[i]``, note that
+            ``input_tensor_list[j]`` of rank k will appear in
+            ``output_tensor_lists[i][rank * world_size + j]``.
+            Also note that ``len(output_tensor_lists)``, and the size of each
+            element in ``output_tensor_lists`` (each element is a list,
+            therefore ``len(output_tensor_lists[i])``) need to be the same
+            for all the distributed processes calling this function.
+
+        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
+            be broadcast from current process.
+            Note that ``len(input_tensor_list)`` needs to be the same for
+            all the distributed processes calling this function.
+        group (optional): Group of the collective.
+    """
+    assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \
+        "collective only supported in process-group mode"
+
+    flatten_tensor_list = []
+    for output_tensor_list in output_tensor_lists:
+        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))
+
+    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
+                                             input_tensor_list,
+                                             group)
+
+    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
+                                                  flatten_tensor_list):
+        for tensor, value in zip(output_tensor_list,
+                                 _unflatten_dense_tensors(flatten_tensor,
+                                                          output_tensor_list)):
+            tensor.copy_(value)
+
+    return ret
+
+
+def all_gather(tensor_list, tensor, group=group.WORLD):
+    r"""Gathers tensors from the whole group in a list.
+
+    Arguments:
+        tensor_list (list[Tensor]): Output list. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        tensor (Tensor): Tensor to be broadcast from current process.
+        group (optional): Group of the collective.
+    """
+    assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \
+        "collective only supported in process-group mode"
+    if _backend != dist_backend.NCCL:
+        return torch._C._dist_all_gather(tensor_list, tensor, group)
+    else:
+        return all_gather_multigpu([tensor_list], [tensor], group)
+
+
+def gather(tensor, **kwargs):
+    r"""Gathers a list of tensors in a single process.
+
+    Arguments:
+        tensor (Tensor): Input tensor.
+        dst (int): Destination rank. Required in all processes except the one that
+            is receiving the data.
+        gather_list (list[Tensor]): List of appropriately-sized tensors to
+            use for received data. Required only in the receiving process.
+        group (optional): Group of the collective.
+ """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + my_rank = get_rank() + dst = kwargs.pop('dst', my_rank) + gather_list = kwargs.pop('gather_list', None) + _group = kwargs.pop('group', group.WORLD) + if kwargs: + raise RuntimeError("got unexpected kwargs") + if dst == my_rank: + if gather_list is None: + raise RuntimeError("gather_list is a required argument in gather destination") + return torch._C._dist_gather_recv(gather_list, tensor, _group) + else: + if gather_list: + raise RuntimeError("non-empty gather_list can be given only to gather destination") + return torch._C._dist_gather_send(tensor, dst, _group) + + +def scatter(tensor, **kwargs): + r"""Scatters a list of tensors to all processes in a group. + + Each process will receive exactly one tensor and store its data in the + :attr:`tensor` argument. + + Arguments: + tensor (Tensor): Output tensor. + src (int): Source rank. Required in all processes except the one that + is sending the data. + scatter_list (list[Tensor]): List of tensors to scatter. Required only + in the process that is sending the data. + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + my_rank = get_rank() + src = kwargs.pop('src', my_rank) + scatter_list = kwargs.pop('scatter_list', None) + _group = kwargs.pop('group', group.WORLD) + if kwargs: + raise RuntimeError("got unexpected kwargs: {}".format(", ".join(kwargs.keys()))) + if src == my_rank: + if scatter_list is None: + raise RuntimeError("scatter_list is a required argument in scatter source") + return torch._C._dist_scatter_send(scatter_list, tensor, _group) + else: + if scatter_list: + raise RuntimeError("non-empty can be given only to scatter source") + return torch._C._dist_scatter_recv(tensor, src, _group) + + +def barrier(group=group.WORLD): + r"""Synchronizes all processes. + + This collective blocks processes until the whole group enters this function. + + Arguments: + group (optional): Group of the collective. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + return torch._C._dist_barrier(group) + + +def new_group(ranks=None): + r"""Creates a new distributed group. + + This function requires that all processes in the main group (i.e., all + processes that are part of the distributed job) enter this function, even + if they are not going to be members of the group. Additionally, groups + should be created in the same order in all processes. + + Arguments: + ranks (list[int]): List of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. + """ + assert torch.distributed.deprecated._initialized == _INITIALIZED_PG, \ + "collective only supported in process-group mode" + if ranks is None: + ranks = list(range(get_world_size())) + return torch._C._dist_new_group(ranks) + + +def _clear_group_cache(group=group.WORLD): + r"""Clear the created distributed group's cached resource. + + Only NCCL backend is currently supported. + + Cached resource includes NCCL communicators and CUDA events. + + Arguments: + group (optional): Group of the collective. 
+ """ + return torch._C._dist_clear_group_cache(group) + + +def _register_stream(stream): + if not _initialized: + raise RuntimeError("torch.distributed.deprecated needs to be initialized first") + return torch._C._dist_register_stream(stream) diff --git a/torch/distributed/remote_types.py b/torch/distributed/deprecated/remote_types.py similarity index 96% rename from torch/distributed/remote_types.py rename to torch/distributed/deprecated/remote_types.py index a8d10cd93b341f..549a131dabd8c5 100644 --- a/torch/distributed/remote_types.py +++ b/torch/distributed/deprecated/remote_types.py @@ -55,6 +55,6 @@ class HalfStorage(_DistributedBase, torch._C.DistributedHalfStorageBase, _Storag _tensors = [_locals[t + 'Tensor'] for t in _type_names] _storages = [_locals[t + 'Storage'] for t in _type_names] for cls in _tensors + _storages: - cls.__module__ = 'torch.distributed' + cls.__module__ = 'torch.distributed.deprecated' torch._C._init_names(_tensors + _storages) del _locals, _type_names, _tensors, _storages diff --git a/torch/distributed/c10d/distributed_c10d.py b/torch/distributed/distributed_c10d.py similarity index 100% rename from torch/distributed/c10d/distributed_c10d.py rename to torch/distributed/distributed_c10d.py diff --git a/torch/distributed/c10d/rendezvous.py b/torch/distributed/rendezvous.py similarity index 100% rename from torch/distributed/c10d/rendezvous.py rename to torch/distributed/rendezvous.py diff --git a/torch/nn/parallel/__init__.py b/torch/nn/parallel/__init__.py index 00d942abf35e0e..066f415a73409a 100644 --- a/torch/nn/parallel/__init__.py +++ b/torch/nn/parallel/__init__.py @@ -4,8 +4,7 @@ from .scatter_gather import scatter, gather from .distributed import DistributedDataParallel from .distributed_cpu import DistributedDataParallelCPU -from .distributed_c10d import _DistributedDataParallelC10d -from .distributed_c10d_cpu import _DistributedDataParallelC10dCPU +import torch.nn.parallel.deprecated __all__ = ['replicate', 'scatter', 'parallel_apply', 'gather', 'data_parallel', 'DataParallel', 'DistributedDataParallel', 'DistributedDataParallelCPU'] diff --git a/torch/nn/parallel/deprecated/__init__.py b/torch/nn/parallel/deprecated/__init__.py new file mode 100644 index 00000000000000..262827cf5c919d --- /dev/null +++ b/torch/nn/parallel/deprecated/__init__.py @@ -0,0 +1,4 @@ +from .distributed import DistributedDataParallel +from .distributed_cpu import DistributedDataParallelCPU + +__all__ = ['DistributedDataParallel', 'DistributedDataParallelCPU'] diff --git a/torch/nn/parallel/deprecated/distributed.py b/torch/nn/parallel/deprecated/distributed.py new file mode 100644 index 00000000000000..655cdce0102bb5 --- /dev/null +++ b/torch/nn/parallel/deprecated/distributed.py @@ -0,0 +1,482 @@ +import sys +import math +import threading +import copy + +import torch +from torch.autograd import Variable +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors, \ + _take_tensors + +from torch.cuda.comm import broadcast_coalesced +from torch.cuda import nccl +import torch.distributed.deprecated as dist + +from ...modules import Module +from ..replicate import replicate +from ..scatter_gather import scatter_kwargs, gather +from ..parallel_apply import parallel_apply + +if sys.version_info[0] == 3: + import queue +else: + import Queue as queue + + +class DistributedDataParallel(Module): + r"""Implements distributed data parallelism at the module level. 
+ + This container parallelizes the application of the given module by + splitting the input across the specified devices by chunking in the batch + dimension. The module is replicated on each machine and each device, and + each such replica handles a portion of the input. During the backwards + pass, gradients from each node are averaged. + + The batch size should be larger than the number of GPUs used locally. It + should also be an integer multiple of the number of GPUs so that each chunk + is the same size (so that each GPU processes the same number of samples). + + See also: :ref:`distributed-basics` and :ref:`cuda-nn-dataparallel-instead`. + The same constraints on input as in :class:`torch.nn.DataParallel` apply. + + Creation of this class requires the distributed package to be already + initialized in the process group mode + (see :func:`torch.distributed.deprecated.init_process_group`). + + .. warning:: + This module works only with the ``nccl`` and ``gloo`` backends. + + .. warning:: + Constructor, forward method, and differentiation of the output (or a + function of the output of this module) is a distributed synchronization + point. Take that into account in case different processes might be + executing different code. + + .. warning:: + This module assumes all parameters are registered in the model by the + time it is created. No parameters should be added nor removed later. + Same applies to buffers. + + .. warning:: + This module assumes all buffers and gradients are dense. + + .. warning:: + This module doesn't work with :func:`torch.autograd.grad` (i.e. it will + only work if gradients are to be accumulated in ``.grad`` attributes of + parameters). + + .. warning:: + If you plan on using this module with a ``nccl`` backend or a ``gloo`` + backend (that uses Infiniband), together with a DataLoader that uses + multiple workers, please change the multiprocessing start method to + ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately + Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will + likely experience deadlocks if you don't change this setting. + + .. note:: + Parameters are never broadcast between processes. The module performs + an all-reduce step on gradients and assumes that they will be modified + by the optimizer in all processes in the same way. Buffers + (e.g. BatchNorm stats) are broadcast from the module in process of rank + 0, to all other replicas in the system in every iteration. + + .. warning:: + Forward and backward hooks defined on :attr:`module` and its submodules + won't be invoked anymore, unless the hooks are initialized in the + :meth:`forward` method. + + Args: + module: module to be parallelized + device_ids: CUDA devices (default: all devices) + output_device: device location of output (default: device_ids[0]) + broadcast_buffers: flag that enables syncing (broadcasting) buffers of + the module at beginning of the forward function. 
+ (default: True) + + Attributes: + module (Module): the module to be parallelized + + Example:: + + >>> torch.distributed.deprecated.init_process_group(world_size=4, init_method='...') + >>> net = torch.nn.DistributedDataParallel(model) + """ + + def __init__(self, module, device_ids=None, output_device=None, dim=0, + broadcast_buffers=True): + super(DistributedDataParallel, self).__init__() + if dist._backend not in (dist.dist_backend.NCCL, dist.dist_backend.GLOO): + raise ValueError('Invalid backend, only NCCL and GLOO backends are supported by DistributedDataParallel') + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + if output_device is None: + output_device = device_ids[0] + self.dim = dim + self.module = module + self.device_ids = device_ids + self.output_device = output_device + self.broadcast_buffers = broadcast_buffers + + # Flag used by the NCCL backend to make sure we only reduce gradients + # one time in the execution engine + self.need_reduction = False + + MB = 1024 * 1024 + # used for intra-node param sync and inter-node sync as well + self.broadcast_bucket_size = 10 * MB + self.nccl_reduce_bucket_size = 256 * MB + + # Sync params and buffers + module_states = list(self.module.state_dict().values()) + if len(module_states) > 0: + self._dist_broadcast_coalesced(module_states, + self.broadcast_bucket_size) + + if len(device_ids) > 1: + # TODO: we don't need to replicate params in here. they're always going to + # be broadcasted using larger blocks in broadcast_coalesced, so it might be + # better to not pollute the caches with these small blocks + self._module_copies = replicate(self.module, self.device_ids, detach=True) + self._module_copies[0] = self.module + + for module_copy in self._module_copies[1:]: + for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): + copy_param.requires_grad = param.requires_grad + + else: + self._module_copies = [self.module] + + # For NCCL backend, since every single NCCL call is asynchoronous, we + # therefore directly enqueue all the NCCL reduction calls to the + # default CUDA stream without spawning up other reduction threads. + # This achieves the best performance. + if dist._backend == dist.dist_backend.NCCL: + self._register_nccl_grad_hook() + return + + bucket_bytes_cap = 1 * MB + + # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems + param_buckets = [] + # Split the parameters into buckets and by types as well + for dev_idx, module in enumerate(self._module_copies): + param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap))) + + self.bucket_sizes = [] + self.bucket_map = {} + + # We transpose param_buckets, so the loop is over buckets. + # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems + for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)): + self.bucket_sizes.append(0) + # Now, we transpose again, so we iterate over bucket_elems, but getting tuples + # of params from each device. 
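+            # For example, with 2 devices and a bucket holding params (w, b):
+            #   param_buckets_tuple == ([w_dev0, b_dev0], [w_dev1, b_dev1])
+            #   zip(*param_buckets_tuple) -> (w_dev0, w_dev1), (b_dev0, b_dev1)
+            # (the names here are purely illustrative)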
+ for idx, param_tuple in enumerate(zip(*param_buckets_tuple)): + if idx == 0: + # Bucket parameter type tracking + bucket_param_type = param_tuple[0].type() + # Only gloo and nccl support half-precision + if bucket_param_type == torch.cuda.HalfTensor and \ + dist._backend != dist.dist_backend.GLOO: + raise RuntimeError("DistributedDataParallel currently only " + "supports half precision parameters " + "with Nccl and Gloo backend") + if not param_tuple[0].requires_grad: + continue + for p in param_tuple: + self.bucket_map[p] = bucket_idx + self.bucket_sizes[bucket_idx] += 1 + + self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))] + self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))] + self.reduced = [False] * len(self.bucket_sizes) + + self._register_grad_hooks() + + self.dispatch_lock = threading.Lock() + self._start_reduction_threads() + + def __getstate__(self): + attrs = copy.copy(self.__dict__) + if dist._backend != dist.dist_backend.NCCL: + del attrs['_grad_accs'], attrs['_reduction_queues'], \ + attrs['_reduction_streams'], attrs['_reduction_threads'], \ + attrs['_nccl_streams'], attrs['_default_streams'] + return attrs + + def __setstate__(self, state): + super(DistributedDataParallel, self).__setstate__(state) + if dist._backend == dist.dist_backend.NCCL: + self._register_nccl_grad_hook() + else: + self._register_grad_hooks() + self._start_reduction_threads() + + def forward(self, *inputs, **kwargs): + self.need_reduction = True + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + self._sync_params() + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs) + return self.gather(outputs, self.output_device) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def parallel_apply(self, replicas, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + def train(self, mode=True): + super(DistributedDataParallel, self).train(mode) + for module in self._module_copies[1:]: + module.train(mode) + + def _dist_broadcast_coalesced(self, tensors, buffer_size): + """ + Broadcast a sequence of tensors to the default group from rank 0. + Small tensors are first coalesced into a buffer to reduce the number of + broadcasts. + + tensors (sequence): tensors to broadcast. Each tensor needs to be on the + same GPU. 
+ buffer_size (int): maximum size of the buffer for coalescing + """ + for tensors in _take_tensors(tensors, buffer_size): + flat_tensors = _flatten_dense_tensors(tensors) + dist.broadcast(flat_tensors, 0) + for tensor, synced in zip(tensors, + _unflatten_dense_tensors(flat_tensors, tensors)): + tensor.copy_(synced) + + def _sync_params(self): + if len(self.device_ids) > 1: + # intra-node parameter sync + params = [p.data for p in self.module.parameters()] + result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size) + for tensors, module in zip(result[1:], self._module_copies[1:]): + for tensor, param in zip(tensors, module.parameters()): + param.data.set_(tensor) + + # module buffer sync + if self.broadcast_buffers: + buffers = [b.data for b in self.module.buffers()] + if len(buffers) > 0: + # cross-node buffer sync + self._dist_broadcast_coalesced(buffers, self.broadcast_bucket_size) + + if len(self.device_ids) > 1: + # intra-node buffer sync + result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size) + for tensors, module in zip(result[1:], self._module_copies[1:]): + for tensor, buf in zip(tensors, module.buffers()): + buf.data.set_(tensor) + + def _register_grad_hooks(self): + self._grad_accs = [] # need to keep them in scope + for device_idx, module in enumerate(self._module_copies): + for p in module.parameters(): + if p.requires_grad: + p_tmp = p.expand_as(p) + grad_acc = p_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_param_hook(p, device_idx)) + self._grad_accs.append(grad_acc) + + def _register_nccl_grad_hook(self): + """ + This function registers the callback all-reduction function for the + NCCL backend. All gradients will be all reduced in one single step. + The NCCL reduction will directly be enqueued into the + default CUDA stream. Therefore, no synchronization is needed. 
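The mechanism this docstring describes, where every parameter hook enqueues one shared reduction callback on the autograd engine and a flag keeps it from running more than once per backward pass, can be sketched in isolation. The sketch below is illustrative only: the real NCCL all-reduce is replaced by a print, and the ``state`` dict plays the role of ``self.need_reduction``.

    # Standalone sketch of the one-shot callback pattern used above.
    import torch
    from torch.autograd import Variable

    model = torch.nn.Linear(4, 2)
    state = {"need_reduction": False}

    def reduction_fn():
        # Queued once per parameter hook, but the flag makes it run only once.
        if not state["need_reduction"]:
            return
        state["need_reduction"] = False
        total = sum(p.grad.sum().item() for p in model.parameters())
        print("would all-reduce gradients here, grad sum =", total)

    def allreduce_hook(*unused):
        Variable._execution_engine.queue_callback(reduction_fn)

    for p in model.parameters():
        if p.requires_grad:
            p.register_hook(allreduce_hook)

    state["need_reduction"] = True   # set by forward() in the real module
    model(torch.randn(8, 4)).sum().backward()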
+ """ + # Creating a new group + self.nccl_reduction_group_id = dist.new_group() + + def reduction_fn_nccl(): + # This function only needs to be called once + if not self.need_reduction: + return + + self.need_reduction = False + all_grads = [[] for _ in range(len(self._module_copies))] + all_grads_buckets_iters = [] + + # Bucketing all the gradients + for dev_idx, module in enumerate(self._module_copies): + for param in module.parameters(): + if not param.requires_grad or param.grad is None: + continue + if param.grad.requires_grad: + raise RuntimeError("DistributedDataParallel only works " + "with gradients that don't require " + "grad") + # Adding the gradients for reduction + all_grads[dev_idx].append(param.grad.data) + + # Now bucketing the parameters + dev_grads_buckets = _take_tensors(all_grads[dev_idx], + self.nccl_reduce_bucket_size) + + all_grads_buckets_iters.append(dev_grads_buckets) + + # Now reduce each bucket one after another + for grads_batch in zip(*all_grads_buckets_iters): + grads_batch_coalesced = [] + # Coalesce each bucket + for dev_idx, dev_grads_batch in enumerate(grads_batch): + dev_id = self.device_ids[dev_idx] + with torch.cuda.device(dev_id): + dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) + grads_batch_coalesced.append(dev_grads_batch_coalesced) + + # We will only use device 0's results, but this single op should be + # faster than doing the following two operation sequentially: + # (1) intra-node reduce to lead GPU, followed by + # (2) inter-node allreduce for all the first lead GPUs in all nodes + dist.all_reduce_multigpu(grads_batch_coalesced, + group=self.nccl_reduction_group_id) + + # Now only work on the first device of self.device_ids, uncoalesce + # the gradients for each bucket + grads_batch_coalesced[0] /= dist.get_world_size() + grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0]) + for grad, reduced in zip(grads_batch[0], grads_batch_reduced): + grad.copy_(reduced) + + # clear the gradients and save memory for replicas + for module in self._module_copies[1:]: + for param in module.parameters(): + if param.requires_grad: + param.grad = None + param.data.set_() + + # Now register the reduction hook on the parameters + for p in self.module.parameters(): + if not p.requires_grad: + continue + + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(reduction_fn_nccl) + + p.register_hook(allreduce_hook) + + def _make_param_hook(self, param, device_idx): + + bucket_idx = self.bucket_map[param] + + def distributed_data_parallel_hook(*unused): + if param.grad.requires_grad: + raise RuntimeError("DistributedDataParallel only works with " + "gradients that don't require grad") + bucket = self.buckets[bucket_idx][device_idx] + bucket.append(param.grad.data) + + # We can flush these and save memory for replicas + if device_idx > 0: + param.grad = None + param.data.set_() + + # Current device's bucket is full + if len(bucket) == self.bucket_sizes[bucket_idx]: + with torch.cuda.device(self.device_ids[device_idx]): + event = torch.cuda.Event() + event.record() + with self.dispatch_lock: + self.bucket_events[bucket_idx][device_idx] = event + self._queue_reduction(bucket_idx) + + return distributed_data_parallel_hook + + def _queue_reduction(self, bucket_idx): + dev_buckets = self.buckets[bucket_idx] + dev_events = self.bucket_events[bucket_idx] + + # Check if it's ready + if any(evt is None for evt in dev_events): + return + + # Queue the reduction and make sure backward waits for it + 
event = threading.Event() + self._reduction_queues[bucket_idx].put((dev_buckets, dev_events, event)) + Variable._execution_engine.queue_callback(lambda: event.wait()) + + # Reset bucket state + self.buckets[bucket_idx] = [[] for _ in range(len(self.device_ids))] + self.bucket_events[bucket_idx] = [None] * len(self.device_ids) + self.reduced[bucket_idx] = True + if all(self.reduced): + self.reduced = [False] * len(self.bucket_sizes) + + def sync_reduction_streams(): + # We only have to sync with the first one, but it's safer to do it this way + # in case we change the way in which we paralellize work + r_streams = zip(*self._reduction_streams) + for dev_id, default_stream, dev_r_streams in zip(self.device_ids, self._default_streams, r_streams): + with torch.cuda.device(dev_id): + for reduction_stream in dev_r_streams: + default_stream.wait_stream(reduction_stream) + Variable._execution_engine.queue_callback(sync_reduction_streams) + + def _start_reduction_threads(self): + num_buckets = len(self.bucket_sizes) + self._reduction_queues = [queue.Queue() for _ in range(num_buckets)] + self._reduction_threads = [] + self._reduction_streams = [[] for _ in range(num_buckets)] + self._nccl_streams = [] + self._default_streams = [] + for dev_id in self.device_ids: + with torch.cuda.device(dev_id): + # TODO: don't assume we're on a default stream + self._default_streams.append(torch.cuda.current_stream()) + self._nccl_streams.append(torch.cuda.Stream()) + for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams): + for dev_id in self.device_ids: + with torch.cuda.device(dev_id): + reduction_streams.append(torch.cuda.Stream()) + # We only use the first device for distributed reductions + dist._register_stream(reduction_streams[0]) + + group_id = dist.new_group() + + self._reduction_threads.append(threading.Thread( + target=self._reduction_thread_fn, + args=(reduction_queue, group_id, self.device_ids, reduction_streams, self._nccl_streams))) + self._reduction_threads[-1].daemon = True + self._reduction_threads[-1].start() + + @staticmethod + def _reduction_thread_fn(queue, group_id, device_ids, reduction_streams, nccl_streams): + + def _process_batch(): + dev_grad_batch, dev_events, job_event = queue.get() + dev_coalesced = [] + # Coalesce the tensors on all devices and start a local reduction + for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams): + with torch.cuda.device(dev_id), torch.cuda.stream(stream): + stream.wait_event(event) + coalesced = _flatten_dense_tensors(grad_batch) + dev_coalesced.append(coalesced) + # Wait for all copies to complete before starting the NCCL kernel + for stream in reduction_streams: + stream.synchronize() + nccl.reduce(dev_coalesced, root=0, streams=nccl_streams) + + # From now on we're only going to work on the first device (from device_ids) + grad_batch = dev_grad_batch[0] + coalesced = dev_coalesced[0] + reduce_stream = reduction_streams[0] + with torch.cuda.stream(reduce_stream): + reduce_stream.wait_stream(nccl_streams[0]) + coalesced /= dist.get_world_size() + dist.all_reduce(coalesced, group=group_id) + for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)): + grad.copy_(reduced) + job_event.set() + + with torch.cuda.device(device_ids[0]): + while True: + _process_batch() # just to have a clear scope diff --git a/torch/nn/parallel/distributed_c10d_cpu.py b/torch/nn/parallel/deprecated/distributed_cpu.py similarity index 88% rename from 
torch/nn/parallel/distributed_c10d_cpu.py rename to torch/nn/parallel/deprecated/distributed_cpu.py index d7cdeb45cae7f0..e93ed2a9b816f1 100644 --- a/torch/nn/parallel/distributed_c10d_cpu.py +++ b/torch/nn/parallel/deprecated/distributed_cpu.py @@ -1,16 +1,15 @@ import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -import torch.distributed.c10d as dist +import torch.distributed.deprecated as dist from torch.nn.modules import Module from collections import defaultdict from torch.autograd import Variable -class _DistributedDataParallelC10dCPU(Module): - r"""Implements distributed data parallelism for CPU at the module level - and it will use PyTorch's new distributed package: c10d. +class DistributedDataParallelCPU(Module): + r"""Implements distributed data parallelism for CPU at the module level. - This module support the ``mpi``, ``gloo``, backends. + This module support the ``mpi``, ``gloo``, ``tcp`` backends. This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch @@ -29,7 +28,7 @@ class _DistributedDataParallelC10dCPU(Module): Creation of this class requires the distributed package to be already initialized in the process group mode - (see :func:`torch.distributed.init_process_group`). + (see :func:`torch.distributed.deprecated.init_process_group`). .. warning:: Constructor, forward method, and differentiation of the output (or a @@ -64,12 +63,12 @@ class _DistributedDataParallelC10dCPU(Module): Example:: - >>> torch.distributed.init_process_group(world_size=4, init_method='...') - >>> net = torch.nn._DistributedDataParallelC10dCPU(model) + >>> torch.distributed.deprecated.init_process_group(world_size=4, init_method='...') + >>> net = torch.nn.DistributedDataParallelCPU(model) """ def __init__(self, module): - super(_DistributedDataParallelC10dCPU, self).__init__() + super(DistributedDataParallelCPU, self).__init__() self.module = module self.sync_parameters() diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index caf1f533acbec9..175aac70be9f3d 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -1,10 +1,6 @@ -import sys -import math -import threading import copy import torch -from torch.autograd import Variable from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors, \ _take_tensors @@ -17,14 +13,13 @@ from .scatter_gather import scatter_kwargs, gather from .parallel_apply import parallel_apply -if sys.version_info[0] == 3: - import queue -else: - import Queue as queue - class DistributedDataParallel(Module): - r"""Implements distributed data parallelism at the module level. + r"""Implements distributed data parallelism that is based on c10d at the + module level. + + Currently this module is EXPERIMENTAL ONLY and should not be + used by normal users. Instead, please use DistributedDataParallel. This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch @@ -39,12 +34,12 @@ class DistributedDataParallel(Module): See also: :ref:`distributed-basics` and :ref:`cuda-nn-dataparallel-instead`. The same constraints on input as in :class:`torch.nn.DataParallel` apply. - Creation of this class requires the distributed package to be already - initialized in the process group mode - (see :func:`torch.distributed.init_process_group`). 
+    Creation of this class requires the c10d process group to be already
+    initialized. This class will operate on the provided c10d process
+    group.
 
     .. warning::
-        This module works only with the ``nccl`` and ``gloo`` backends.
+        This module works only with the ``gloo`` and ``nccl`` process groups.
 
     .. warning::
         Constructor, forward method, and differentiation of the output (or a
@@ -57,6 +52,14 @@ class DistributedDataParallel(Module):
         time it is created. No parameters should be added nor removed later.
         Same applies to buffers.
 
+    .. warning::
+        This module assumes that the parameters of the model are registered in
+        the same order in every distributed process. The module itself will
+        conduct gradient all-reduction following the reverse order of the
+        registered parameters of the model. In other words, it is the user's
+        responsibility to ensure that each distributed process has the exact
+        same model and thus the exact same parameter registration order.
+
     .. warning::
         This module assumes all buffers and gradients are dense.
 
@@ -66,12 +69,13 @@ class DistributedDataParallel(Module):
         parameters).
 
     .. warning::
-        If you plan on using this module with a ``nccl`` backend or a ``gloo``
-        backend (that uses Infiniband), together with a DataLoader that uses
-        multiple workers, please change the multiprocessing start method to
-        ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately
-        Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will
-        likely experience deadlocks if you don't change this setting.
+        If you plan on using this module with a ``nccl`` process group or
+        a ``gloo`` process group (that uses Infiniband), together with a
+        DataLoader that uses multiple workers, please change the multiprocessing
+        start method to ``forkserver`` (Python 3 only) or ``spawn``.
+        Unfortunately Gloo (that uses Infiniband) and NCCL2 are not fork safe,
+        and you will likely experience deadlocks if you don't change this
+        setting.
 
     .. note::
         Parameters are never broadcast between processes. The module performs
@@ -92,40 +96,53 @@ class DistributedDataParallel(Module):
         broadcast_buffers: flag that enables syncing (broadcasting) buffers of
                            the module at beginning of the forward function.
                            (default: True)
+        process_group: the c10d process group to be used for distributed data
+                       all-reduction. If None, the default process group will
+                       be used. (default: None)
+        bucket_cap_mb: DistributedDataParallel will bucket parameters into
+                       multiple buckets so that gradient reduction of each
+                       bucket can potentially overlap with backward computation.
+ bucket_cap_mb controls the bucket size in MegaBytes (MB) + (default: 25) Attributes: module (Module): the module to be parallelized Example:: - - >>> torch.distributed.init_process_group(world_size=4, init_method='...') - >>> net = torch.nn.DistributedDataParallel(model) + >>> store = torch.distributed.FileStore("/tmp/tempfile.txt") + >>> pg = torch.distributed.ProcessGroupGloo(store, rank, world_size) + >>> net = torch.nn.DistributedDataParallel(model, pg) """ + def __init__(self, module, device_ids=None, + output_device=None, dim=0, broadcast_buffers=True, + process_group=None, bucket_cap_mb=25): - def __init__(self, module, device_ids=None, output_device=None, dim=0, - broadcast_buffers=True): super(DistributedDataParallel, self).__init__() - if dist._backend not in (dist.dist_backend.NCCL, dist.dist_backend.GLOO): - raise ValueError('Invalid backend, only NCCL and GLOO backends are supported by DistributedDataParallel') + # Use all devices by default if device_ids is None: device_ids = list(range(torch.cuda.device_count())) + if output_device is None: output_device = device_ids[0] + + if process_group is None: + self.process_group = dist.get_default_group() + else: + self.process_group = process_group + self.dim = dim self.module = module self.device_ids = device_ids self.output_device = output_device self.broadcast_buffers = broadcast_buffers - # Flag used by the NCCL backend to make sure we only reduce gradients - # one time in the execution engine - self.need_reduction = False + self.allreduce_opts = dist.AllreduceOptions() MB = 1024 * 1024 + # used for intra-node param sync and inter-node sync as well - self.broadcast_bucket_size = 10 * MB - self.nccl_reduce_bucket_size = 256 * MB + self.broadcast_bucket_size = 25 * MB # Sync params and buffers module_states = list(self.module.state_dict().values()) @@ -147,21 +164,19 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0, else: self._module_copies = [self.module] - # For NCCL backend, since every single NCCL call is asynchoronous, we - # therefore directly enqueue all the NCCL reduction calls to the - # default CUDA stream without spawning up other reduction threads. - # This achieves the best performance. - if dist._backend == dist.dist_backend.NCCL: - self._register_nccl_grad_hook() - return + self.modules_params_data = [[] for _ in range(len(self.device_ids))] + self.modules_buffers_data = [[] for _ in range(len(self.device_ids))] + + for dev_idx, module in enumerate(self._module_copies): + self.modules_params_data[dev_idx] = [p.data for p in module.parameters()] + self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()] - bucket_bytes_cap = 1 * MB + bucket_bytes_cap = bucket_cap_mb * MB # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems param_buckets = [] # Split the parameters into buckets and by types as well - for dev_idx, module in enumerate(self._module_copies): - param_buckets.append(list(_take_tensors(module.parameters(), bucket_bytes_cap))) + param_buckets = [list(_take_tensors(m.parameters(), bucket_bytes_cap)) for m in self._module_copies] self.bucket_sizes = [] self.bucket_map = {} @@ -173,48 +188,45 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0, # Now, we transpose again, so we iterate over bucket_elems, but getting tuples # of params from each device. 
for idx, param_tuple in enumerate(zip(*param_buckets_tuple)): - if idx == 0: - # Bucket parameter type tracking - bucket_param_type = param_tuple[0].type() - # Only gloo and nccl support half-precision - if bucket_param_type == torch.cuda.HalfTensor and \ - dist._backend != dist.dist_backend.GLOO: - raise RuntimeError("DistributedDataParallel currently only " - "supports half precision parameters " - "with Nccl and Gloo backend") if not param_tuple[0].requires_grad: continue for p in param_tuple: - self.bucket_map[p] = bucket_idx + self.bucket_map[p] = (bucket_idx, idx) self.bucket_sizes[bucket_idx] += 1 - self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))] - self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))] - self.reduced = [False] * len(self.bucket_sizes) + self.buckets = [[[None for _ in range(self.bucket_sizes[i])] + for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] + # The number of params ready in each bucket + self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - self._register_grad_hooks() + # coalesced bucket for only device 0 + self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] + # We will always reduce the bucket following the reverse order + # that is, alway reduces following the order of: n - 1, n - 2, ..., 0 + self.next_bucket = len(self.bucket_sizes) - 1 + self.ready_buckets_not_reduced = set() + self.reduction_works = [None for _ in range(len(self.bucket_sizes))] + + self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] - self.dispatch_lock = threading.Lock() - self._start_reduction_threads() + # default stream tracking to launch nccl reduce kernels + self.default_streams = [] + for dev_id in self.device_ids: + with torch.cuda.device(dev_id): + self.default_streams.append(torch.cuda.current_stream()) + + self._register_grad_hooks() def __getstate__(self): attrs = copy.copy(self.__dict__) - if dist._backend != dist.dist_backend.NCCL: - del attrs['_grad_accs'], attrs['_reduction_queues'], \ - attrs['_reduction_streams'], attrs['_reduction_threads'], \ - attrs['_nccl_streams'], attrs['_default_streams'] + del attrs['_grad_accs'] return attrs def __setstate__(self, state): super(DistributedDataParallel, self).__setstate__(state) - if dist._backend == dist.dist_backend.NCCL: - self._register_nccl_grad_hook() - else: - self._register_grad_hooks() - self._start_reduction_threads() + self._register_grad_hooks() def forward(self, *inputs, **kwargs): - self.need_reduction = True inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) self._sync_params() if len(self.device_ids) == 1: @@ -237,44 +249,32 @@ def train(self, mode=True): module.train(mode) def _dist_broadcast_coalesced(self, tensors, buffer_size): - """ - Broadcast a sequence of tensors to the default group from rank 0. - Small tensors are first coalesced into a buffer to reduce the number of - broadcasts. - - tensors (sequence): tensors to broadcast. Each tensor needs to be on the - same GPU. 
- buffer_size (int): maximum size of the buffer for coalescing - """ - for tensors in _take_tensors(tensors, buffer_size): - flat_tensors = _flatten_dense_tensors(tensors) - dist.broadcast(flat_tensors, 0) - for tensor, synced in zip(tensors, - _unflatten_dense_tensors(flat_tensors, tensors)): - tensor.copy_(synced) + dist._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) def _sync_params(self): if len(self.device_ids) > 1: # intra-node parameter sync - params = [p.data for p in self.module.parameters()] - result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size) - for tensors, module in zip(result[1:], self._module_copies[1:]): - for tensor, param in zip(tensors, module.parameters()): - param.data.set_(tensor) + result = broadcast_coalesced(self.modules_params_data[0], + self.device_ids, + self.broadcast_bucket_size) + for tensors, module_params_data in zip(result[1:], self.modules_params_data[1:]): + for tensor, param_data in zip(tensors, module_params_data): + param_data.set_(tensor) # module buffer sync if self.broadcast_buffers: - buffers = [b.data for b in self.module.buffers()] - if len(buffers) > 0: + if len(self.modules_buffers_data[0]) > 0: # cross-node buffer sync - self._dist_broadcast_coalesced(buffers, self.broadcast_bucket_size) - + self._dist_broadcast_coalesced(self.modules_buffers_data[0], + self.broadcast_bucket_size) if len(self.device_ids) > 1: # intra-node buffer sync - result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size) - for tensors, module in zip(result[1:], self._module_copies[1:]): - for tensor, buf in zip(tensors, module.buffers()): - buf.data.set_(tensor) + result = broadcast_coalesced(self.modules_buffers_data[0], + self.device_ids, + self.broadcast_bucket_size) + for tensors, module_buffers_data in zip(result[1:], self.modules_buffers_data[1:]): + for tensor, buffer_data in zip(tensors, module_buffers_data): + buffer_data.set_(tensor) def _register_grad_hooks(self): self._grad_accs = [] # need to keep them in scope @@ -286,94 +286,16 @@ def _register_grad_hooks(self): grad_acc.register_hook(self._make_param_hook(p, device_idx)) self._grad_accs.append(grad_acc) - def _register_nccl_grad_hook(self): - """ - This function registers the callback all-reduction function for the - NCCL backend. All gradients will be all reduced in one single step. - The NCCL reduction will directly be enqueued into the - default CUDA stream. Therefore, no synchronization is needed. 
- """ - # Creating a new group - self.nccl_reduction_group_id = dist.new_group() - - def reduction_fn_nccl(): - # This function only needs to be called once - if not self.need_reduction: - return - - self.need_reduction = False - all_grads = [[] for _ in range(len(self._module_copies))] - all_grads_buckets_iters = [] - - # Bucketing all the gradients - for dev_idx, module in enumerate(self._module_copies): - for param in module.parameters(): - if not param.requires_grad or param.grad is None: - continue - if param.grad.requires_grad: - raise RuntimeError("DistributedDataParallel only works " - "with gradients that don't require " - "grad") - # Adding the gradients for reduction - all_grads[dev_idx].append(param.grad.data) - - # Now bucketing the parameters - dev_grads_buckets = _take_tensors(all_grads[dev_idx], - self.nccl_reduce_bucket_size) - - all_grads_buckets_iters.append(dev_grads_buckets) - - # Now reduce each bucket one after another - for grads_batch in zip(*all_grads_buckets_iters): - grads_batch_coalesced = [] - # Coalesce each bucket - for dev_idx, dev_grads_batch in enumerate(grads_batch): - dev_id = self.device_ids[dev_idx] - with torch.cuda.device(dev_id): - dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) - grads_batch_coalesced.append(dev_grads_batch_coalesced) - - # We will only use device 0's results, but this single op should be - # faster than doing the following two operation sequentially: - # (1) intra-node reduce to lead GPU, followed by - # (2) inter-node allreduce for all the first lead GPUs in all nodes - dist.all_reduce_multigpu(grads_batch_coalesced, - group=self.nccl_reduction_group_id) - - # Now only work on the first device of self.device_ids, uncoalesce - # the gradients for each bucket - grads_batch_coalesced[0] /= dist.get_world_size() - grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0]) - for grad, reduced in zip(grads_batch[0], grads_batch_reduced): - grad.copy_(reduced) - - # clear the gradients and save memory for replicas - for module in self._module_copies[1:]: - for param in module.parameters(): - if param.requires_grad: - param.grad = None - param.data.set_() - - # Now register the reduction hook on the parameters - for p in self.module.parameters(): - if not p.requires_grad: - continue - - def allreduce_hook(*unused): - Variable._execution_engine.queue_callback(reduction_fn_nccl) - - p.register_hook(allreduce_hook) - def _make_param_hook(self, param, device_idx): - - bucket_idx = self.bucket_map[param] + bucket_idx, bucket_offset = self.bucket_map[param] def distributed_data_parallel_hook(*unused): if param.grad.requires_grad: - raise RuntimeError("DistributedDataParallel only works with " - "gradients that don't require grad") + raise RuntimeError("DistributedDataParallel only works " + "with gradients that don't require grad") bucket = self.buckets[bucket_idx][device_idx] - bucket.append(param.grad.data) + bucket[bucket_offset] = param.grad.data + self.buckets_ready_size[bucket_idx][device_idx] += 1 # We can flush these and save memory for replicas if device_idx > 0: @@ -381,102 +303,79 @@ def distributed_data_parallel_hook(*unused): param.data.set_() # Current device's bucket is full - if len(bucket) == self.bucket_sizes[bucket_idx]: - with torch.cuda.device(self.device_ids[device_idx]): - event = torch.cuda.Event() - event.record() - with self.dispatch_lock: - self.bucket_events[bucket_idx][device_idx] = event + if self.buckets_ready_size[bucket_idx][device_idx] == 
self.bucket_sizes[bucket_idx]: + self.devs_ready[bucket_idx] += 1 + if self.devs_ready[bucket_idx] < len(self.device_ids): + return + + # Now all devices's buckets with index: bucket_idx are ready + if bucket_idx == self.next_bucket: self._queue_reduction(bucket_idx) + self.next_bucket -= 1 + # Now reduce anything that is ready but not yet reduced + if len(self.ready_buckets_not_reduced) > 0: + sorted_todo = sorted(self.ready_buckets_not_reduced, reverse=True) + for i in sorted_todo: + # Nothing can be reduced now + if i < self.next_bucket: + break + self._queue_reduction(i) + self.ready_buckets_not_reduced.remove(i) + if i == self.next_bucket: + self.next_bucket -= 1 + else: + self.ready_buckets_not_reduced.add(bucket_idx) + + # When all devices' buckets + if self.next_bucket == -1: + # A final sync for all the reduction works + self._sync_reduction_works() return distributed_data_parallel_hook def _queue_reduction(self, bucket_idx): - dev_buckets = self.buckets[bucket_idx] - dev_events = self.bucket_events[bucket_idx] - - # Check if it's ready - if any(evt is None for evt in dev_events): - return - - # Queue the reduction and make sure backward waits for it - event = threading.Event() - self._reduction_queues[bucket_idx].put((dev_buckets, dev_events, event)) - Variable._execution_engine.queue_callback(lambda: event.wait()) - - # Reset bucket state - self.buckets[bucket_idx] = [[] for _ in range(len(self.device_ids))] - self.bucket_events[bucket_idx] = [None] * len(self.device_ids) - self.reduced[bucket_idx] = True - if all(self.reduced): - self.reduced = [False] * len(self.bucket_sizes) - - def sync_reduction_streams(): - # We only have to sync with the first one, but it's safer to do it this way - # in case we change the way in which we paralellize work - r_streams = zip(*self._reduction_streams) - for dev_id, default_stream, dev_r_streams in zip(self.device_ids, self._default_streams, r_streams): - with torch.cuda.device(dev_id): - for reduction_stream in dev_r_streams: - default_stream.wait_stream(reduction_stream) - Variable._execution_engine.queue_callback(sync_reduction_streams) - - def _start_reduction_threads(self): - num_buckets = len(self.bucket_sizes) - self._reduction_queues = [queue.Queue() for _ in range(num_buckets)] - self._reduction_threads = [] - self._reduction_streams = [[] for _ in range(num_buckets)] - self._nccl_streams = [] - self._default_streams = [] - for dev_id in self.device_ids: + grads_batch = self.buckets[bucket_idx] + grads_batch_coalesced = [] + + # coalesce the bucket + for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch): with torch.cuda.device(dev_id): - # TODO: don't assume we're on a default stream - self._default_streams.append(torch.cuda.current_stream()) - self._nccl_streams.append(torch.cuda.Stream()) - for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams): - for dev_id in self.device_ids: - with torch.cuda.device(dev_id): - reduction_streams.append(torch.cuda.Stream()) - # We only use the first device for distributed reductions - dist._register_stream(reduction_streams[0]) - - group_id = dist.new_group() - - self._reduction_threads.append(threading.Thread( - target=self._reduction_thread_fn, - args=(reduction_queue, group_id, self.device_ids, reduction_streams, self._nccl_streams))) - self._reduction_threads[-1].daemon = True - self._reduction_threads[-1].start() - - @staticmethod - def _reduction_thread_fn(queue, group_id, device_ids, reduction_streams, nccl_streams): - - def 
_process_batch(): - dev_grad_batch, dev_events, job_event = queue.get() - dev_coalesced = [] - # Coalesce the tensors on all devices and start a local reduction - for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams): - with torch.cuda.device(dev_id), torch.cuda.stream(stream): - stream.wait_event(event) - coalesced = _flatten_dense_tensors(grad_batch) - dev_coalesced.append(coalesced) - # Wait for all copies to complete before starting the NCCL kernel - for stream in reduction_streams: - stream.synchronize() - nccl.reduce(dev_coalesced, root=0, streams=nccl_streams) - - # From now on we're only going to work on the first device (from device_ids) - grad_batch = dev_grad_batch[0] - coalesced = dev_coalesced[0] - reduce_stream = reduction_streams[0] - with torch.cuda.stream(reduce_stream): - reduce_stream.wait_stream(nccl_streams[0]) - coalesced /= dist.get_world_size() - dist.all_reduce(coalesced, group=group_id) - for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)): - grad.copy_(reduced) - job_event.set() - - with torch.cuda.device(device_ids[0]): - while True: - _process_batch() # just to have a clear scope + dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) + grads_batch_coalesced.append(dev_grads_batch_coalesced) + + # reduce to the first GPU in self.device_ids + if len(self.device_ids) > 1: + nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) + + # divide by the number of processes here to reduce chances of overflow + grads_batch_coalesced[0] /= self.process_group.size() + + # now work on the first gpu + reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], + self.allreduce_opts) + self.reduction_works[bucket_idx] = reduction_work + self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] + + def _sync_reduction_works(self): + # Now only work on the first GPU of self.device_ids, uncoalesce + # the gradients for each bucket + for bucket_idx, grads_batch in enumerate(self.buckets): + # wait will let current stream wait on the c10d reduction stream + self.reduction_works[bucket_idx].wait() + + grads_batch_reduced = _unflatten_dense_tensors( + self.buckets_coalesced[bucket_idx], grads_batch[0]) + + for grad, reduced in zip(grads_batch[0], grads_batch_reduced): + grad.copy_(reduced) + + # Reset the module states + self.next_bucket = len(self.bucket_sizes) - 1 + self.ready_buckets_not_reduced = set() + self.reduction_works = [None for _ in range(len(self.bucket_sizes))] + self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] + + self.buckets = [[[None for _ in range(self.bucket_sizes[i])] + for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] + self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] + self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py deleted file mode 100644 index 61acf2c8784f04..00000000000000 --- a/torch/nn/parallel/distributed_c10d.py +++ /dev/null @@ -1,366 +0,0 @@ -import copy - -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors, \ - _take_tensors - -from torch.cuda.comm import broadcast_coalesced -from torch.cuda import nccl -import torch.distributed.c10d as c10d - -from ..modules import Module -from .replicate import replicate -from .scatter_gather import scatter_kwargs, gather -from 
.parallel_apply import parallel_apply - - -class _DistributedDataParallelC10d(Module): - r"""Implements distributed data parallelism that is based on c10d at the - module level. - - Currently this module is EXPERIMENTAL ONLY and should not be - used by normal users. Instead, please use DistributedDataParallel. - - This container parallelizes the application of the given module by - splitting the input across the specified devices by chunking in the batch - dimension. The module is replicated on each machine and each device, and - each such replica handles a portion of the input. During the backwards - pass, gradients from each node are averaged. - - The batch size should be larger than the number of GPUs used locally. It - should also be an integer multiple of the number of GPUs so that each chunk - is the same size (so that each GPU processes the same number of samples). - - See also: :ref:`distributed-basics` and :ref:`cuda-nn-dataparallel-instead`. - The same constraints on input as in :class:`torch.nn.DataParallel` apply. - - Creation of this class requires the c10d process group to be already - initialized. This class will basically operate on the provided c10d - process group. - - .. warning:: - This module works only with the ``gloo`` and ``nccl`` process groups. - - .. warning:: - Constructor, forward method, and differentiation of the output (or a - function of the output of this module) is a distributed synchronization - point. Take that into account in case different processes might be - executing different code. - - .. warning:: - This module assumes all parameters are registered in the model by the - time it is created. No parameters should be added nor removed later. - Same applies to buffers. - - -- warning:: - This module assumes all parameters are registered in the model of each - distributed processes are in the same order. The module itself will - conduct gradient all-reduction following the reverse order of the - registered parameters of the model. In other wise, it is users' - responsibility to ensure that each distributed process has the exact - same model and thus the exact parameter registeration order. - - .. warning:: - This module assumes all buffers and gradients are dense. - - .. warning:: - This module doesn't work with :func:`torch.autograd.grad` (i.e. it will - only work if gradients are to be accumulated in ``.grad`` attributes of - parameters). - - .. warning:: - If you plan on using this module with a ``nccl`` process group or - a ``gloo`` process group (that uses Infiniband), together with a - DataLoader that uses multiple workers, please change the multiprocessing - start method to ``forkserver`` (Python 3 only) or ``spawn``. - Unfortunately Gloo (that uses Infiniband) and NCCL2 are not fork safe, - and you will likely experience deadlocks if you don't change this - setting. - - .. note:: - Parameters are never broadcast between processes. The module performs - an all-reduce step on gradients and assumes that they will be modified - by the optimizer in all processes in the same way. Buffers - (e.g. BatchNorm stats) are broadcast from the module in process of rank - 0, to all other replicas in the system in every iteration. - - .. warning:: - Forward and backward hooks defined on :attr:`module` and its submodules - won't be invoked anymore, unless the hooks are initialized in the - :meth:`forward` method. 
- - Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) - broadcast_buffers: flag that enables syncing (broadcasting) buffers of - the module at beginning of the forward function. - (default: True) - process_group: the c10d process group to be used for distributed data - all-reduction. If None, the default process group will - be used - bucket_cap_mb: DistributedDataParallelC10d will bucket parameters into - multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward computation. - bucket_cap_mb controls the bucket size in MegaBytes (MB) - (default: 25) - - Attributes: - module (Module): the module to be parallelized - - Example:: - >>> store = torch.distributed.c10d.FileStore("/tmp/tempfile.txt") - >>> pg = torch.distributed.c10d.ProcessGroupGloo(store, rank, world_size) - >>> net = torch.nn._DistributedDataParallelC10d(model, pg) - """ - def __init__(self, module, device_ids=None, - output_device=None, dim=0, broadcast_buffers=True, - process_group=None, bucket_cap_mb=25): - - super(_DistributedDataParallelC10d, self).__init__() - - # Use all devices by default - if device_ids is None: - device_ids = list(range(torch.cuda.device_count())) - - if output_device is None: - output_device = device_ids[0] - - if process_group is None: - self.process_group = c10d.get_default_group() - else: - self.process_group = process_group - - self.dim = dim - self.module = module - self.device_ids = device_ids - self.output_device = output_device - self.broadcast_buffers = broadcast_buffers - - self.allreduce_opts = c10d.AllreduceOptions() - - MB = 1024 * 1024 - - # used for intra-node param sync and inter-node sync as well - self.broadcast_bucket_size = 25 * MB - - # Sync params and buffers - module_states = list(self.module.state_dict().values()) - if len(module_states) > 0: - self._dist_broadcast_coalesced(module_states, - self.broadcast_bucket_size) - - if len(device_ids) > 1: - # TODO: we don't need to replicate params in here. they're always going to - # be broadcasted using larger blocks in broadcast_coalesced, so it might be - # better to not pollute the caches with these small blocks - self._module_copies = replicate(self.module, self.device_ids, detach=True) - self._module_copies[0] = self.module - - for module_copy in self._module_copies[1:]: - for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): - copy_param.requires_grad = param.requires_grad - - else: - self._module_copies = [self.module] - - # .data() of each parameter for each model replica - self.modules_params_data = [[] for _ in range(len(self.device_ids))] - # .data() of each buffer for each model replica - self.modules_buffers_data = [[] for _ in range(len(self.device_ids))] - - for dev_idx, module in enumerate(self._module_copies): - self.modules_params_data[dev_idx] = [p.data for p in module.parameters()] - self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()] - - bucket_bytes_cap = bucket_cap_mb * MB - - # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems - param_buckets = [] - # Split the parameters into buckets and by types as well - param_buckets = [list(_take_tensors(m.parameters(), bucket_bytes_cap)) for m in self._module_copies] - - self.bucket_sizes = [] - self.bucket_map = {} - - # We transpose param_buckets, so the loop is over buckets. 
- # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems - for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)): - self.bucket_sizes.append(0) - # Now, we transpose again, so we iterate over bucket_elems, but getting tuples - # of params from each device. - for idx, param_tuple in enumerate(zip(*param_buckets_tuple)): - if not param_tuple[0].requires_grad: - continue - for p in param_tuple: - self.bucket_map[p] = (bucket_idx, idx) - self.bucket_sizes[bucket_idx] += 1 - - self.buckets = [[[None for _ in range(self.bucket_sizes[i])] - for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - # The number of params ready in each bucket - self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - - # coalesced bucket for only device 0 - self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] - # We will always reduce the bucket following the reverse order - # that is, alway reduces following the order of: n - 1, n - 2, ..., 0 - self.next_bucket = len(self.bucket_sizes) - 1 - self.ready_buckets_not_reduced = set() - self.reduction_works = [None for _ in range(len(self.bucket_sizes))] - - self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] - - # default stream tracking to launch nccl reduce kernels - self.default_streams = [] - for dev_id in self.device_ids: - with torch.cuda.device(dev_id): - self.default_streams.append(torch.cuda.current_stream()) - - self._register_grad_hooks() - - def __getstate__(self): - attrs = copy.copy(self.__dict__) - del attrs['_grad_accs'] - return attrs - - def __setstate__(self, state): - super(_DistributedDataParallelC10d, self).__setstate__(state) - self._register_grad_hooks() - - def forward(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - self._sync_params() - if len(self.device_ids) == 1: - return self.module(*inputs[0], **kwargs[0]) - outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs) - return self.gather(outputs, self.output_device) - - def scatter(self, inputs, kwargs, device_ids): - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def parallel_apply(self, replicas, inputs, kwargs): - return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) - - def gather(self, outputs, output_device): - return gather(outputs, output_device, dim=self.dim) - - def train(self, mode=True): - super(_DistributedDataParallelC10d, self).train(mode) - for module in self._module_copies[1:]: - module.train(mode) - - def _dist_broadcast_coalesced(self, tensors, buffer_size): - c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) - - def _sync_params(self): - c10d._sync_params(self.process_group, - self.modules_params_data, - self.modules_buffers_data, - self.device_ids, - self.broadcast_bucket_size, - self.broadcast_buffers) - - def _register_grad_hooks(self): - self._grad_accs = [] # need to keep them in scope - for device_idx, module in enumerate(self._module_copies): - for p in module.parameters(): - if p.requires_grad: - p_tmp = p.expand_as(p) - grad_acc = p_tmp.grad_fn.next_functions[0][0] - grad_acc.register_hook(self._make_param_hook(p, device_idx)) - self._grad_accs.append(grad_acc) - - def _make_param_hook(self, param, device_idx): - bucket_idx, bucket_offset = self.bucket_map[param] - - def distributed_data_parallel_hook(*unused): - if param.grad.requires_grad: - raise 
RuntimeError("DistributedDataParallelC10d only works " - "with gradients that don't require grad") - bucket = self.buckets[bucket_idx][device_idx] - bucket[bucket_offset] = param.grad.data - self.buckets_ready_size[bucket_idx][device_idx] += 1 - - # We can flush these and save memory for replicas - if device_idx > 0: - param.grad = None - param.data.set_() - - # Current device's bucket is full - if self.buckets_ready_size[bucket_idx][device_idx] == self.bucket_sizes[bucket_idx]: - self.devs_ready[bucket_idx] += 1 - if self.devs_ready[bucket_idx] < len(self.device_ids): - return - - # Now all devices's buckets with index: bucket_idx are ready - if bucket_idx == self.next_bucket: - self._queue_reduction(bucket_idx) - self.next_bucket -= 1 - # Now reduce anything that is ready but not yet reduced - if len(self.ready_buckets_not_reduced) > 0: - sorted_todo = sorted(self.ready_buckets_not_reduced, reverse=True) - for i in sorted_todo: - # Nothing can be reduced now - if i < self.next_bucket: - break - self._queue_reduction(i) - self.ready_buckets_not_reduced.remove(i) - if i == self.next_bucket: - self.next_bucket -= 1 - else: - self.ready_buckets_not_reduced.add(bucket_idx) - - # When all devices' buckets - if self.next_bucket == -1: - # A final sync for all the reduction works - self._sync_reduction_works() - - return distributed_data_parallel_hook - - def _queue_reduction(self, bucket_idx): - grads_batch = self.buckets[bucket_idx] - grads_batch_coalesced = [] - - # coalesce the bucket - for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch): - with torch.cuda.device(dev_id): - dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch) - grads_batch_coalesced.append(dev_grads_batch_coalesced) - - # reduce to the first GPU in self.device_ids - if len(self.device_ids) > 1: - nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) - - # divide by the number of processes here to reduce chances of overflow - grads_batch_coalesced[0] /= self.process_group.size() - - # now work on the first gpu - reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], - self.allreduce_opts) - self.reduction_works[bucket_idx] = reduction_work - self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] - - def _sync_reduction_works(self): - # Now only work on the first GPU of self.device_ids, uncoalesce - # the gradients for each bucket - for bucket_idx, grads_batch in enumerate(self.buckets): - # wait will let current stream wait on the c10d reduction stream - self.reduction_works[bucket_idx].wait() - - grads_batch_reduced = _unflatten_dense_tensors( - self.buckets_coalesced[bucket_idx], grads_batch[0]) - - for grad, reduced in zip(grads_batch[0], grads_batch_reduced): - grad.copy_(reduced) - - # Reset the module states - self.next_bucket = len(self.bucket_sizes) - 1 - self.ready_buckets_not_reduced = set() - self.reduction_works = [None for _ in range(len(self.bucket_sizes))] - self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] - - self.buckets = [[[None for _ in range(self.bucket_sizes[i])] - for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] - self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] - self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] diff --git a/torch/nn/parallel/distributed_cpu.py b/torch/nn/parallel/distributed_cpu.py index 07b59c528d25b4..d5a3b1235f9678 100644 --- a/torch/nn/parallel/distributed_cpu.py +++ 
b/torch/nn/parallel/distributed_cpu.py @@ -7,9 +7,10 @@ class DistributedDataParallelCPU(Module): - r"""Implements distributed data parallelism for CPU at the module level. + r"""Implements distributed data parallelism for CPU at the module level + and it will use PyTorch's new distributed package: c10d. - This module support the ``mpi``, ``gloo``, ``tcp`` backends. + This module support the ``mpi``, ``gloo``, backends. This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch diff --git a/torch/utils/data/distributed.py b/torch/utils/data/distributed.py index d3e871353ece96..ee58c1afc5a74d 100644 --- a/torch/utils/data/distributed.py +++ b/torch/utils/data/distributed.py @@ -1,7 +1,7 @@ import math import torch from . import Sampler -from torch.distributed import get_world_size, get_rank +import torch.distributed as dist class DistributedSampler(Sampler): @@ -24,9 +24,13 @@ class DistributedSampler(Sampler): def __init__(self, dataset, num_replicas=None, rank=None): if num_replicas is None: - num_replicas = get_world_size() + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() if rank is None: - rank = get_rank() + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() self.dataset = dataset self.num_replicas = num_replicas self.rank = rank From 3d5fd12488103c9259a9d20f804102fe65597126 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Tue, 11 Sep 2018 02:06:05 -0700 Subject: [PATCH 025/237] Documentation for c10d: torch.distributed and deprecate the old distributed doc (#11450) Summary: This is the new documentation for c10d release, and it also deprecates the old torch.distributed document. This PR depends on https://github.com/pytorch/pytorch/pull/11405 and should only be landed after https://github.com/pytorch/pytorch/pull/11405 is landed Pull Request resolved: https://github.com/pytorch/pytorch/pull/11450 Differential Revision: D9765504 Pulled By: teng-li fbshipit-source-id: 48f38b27b8c270baf389f8e478ea226b9ecc63db --- docs/source/distributed.rst | 114 ++++++---- docs/source/distributed_deprecated.rst | 280 +++++++++++++++++++++++++ docs/source/index.rst | 1 + 3 files changed, 353 insertions(+), 42 deletions(-) create mode 100644 docs/source/distributed_deprecated.rst diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 9dbb9be74709bb..49d9d96ee62817 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -7,35 +7,35 @@ Distributed communication package - torch.distributed .. automodule:: torch.distributed .. currentmodule:: torch.distributed -Currently torch.distributed supports four backends, each with +Currently torch.distributed supports three backends, each with different capabilities. The table below shows which functions are available for use with CPU / CUDA tensors. MPI supports cuda only if the implementation used to build PyTorch supports it. -+------------+-----------+-----------+-----------+-----------+ -| Backend | ``tcp`` | ``gloo`` | ``mpi`` | ``nccl`` | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| Device | CPU | GPU | CPU | GPU | CPU | GPU | CPU | GPU | -+============+=====+=====+=====+=====+=====+=====+=====+=====+ -| send | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| recv | ✓ | ✘ | ✘ | ✘ | ✓ | ? 
| ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| broadcast | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| all_reduce | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| reduce | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| all_gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| scatter | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ -| barrier | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✘ | -+------------+-----+-----+-----+-----+-----+-----+-----+-----+ ++------------+-----------+-----------+-----------+ +| Backend | ``gloo`` | ``mpi`` | ``nccl`` | ++------------+-----+-----+-----+-----+-----+-----+ +| Device | CPU | GPU | CPU | GPU | CPU | GPU | ++============+=====+=====+=====+=====+=====+=====+ +| send | ✓ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| recv | ✓ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| broadcast | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| all_reduce | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| reduce | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| all_gather | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+ +| gather | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| scatter | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ +| barrier | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+ .. _distributed-basics: @@ -79,6 +79,14 @@ joined. .. autofunction:: get_world_size +.. autofunction:: is_initialized + +.. autofunction:: get_default_group + +.. autofunction:: is_mpi_available + +.. autofunction:: is_nccl_available + -------------------------------------------------------------------------------- Currently three initialization methods are supported: @@ -91,10 +99,8 @@ reachable from all processes and a desired ``world_size``. The first way requires specifying an address that belongs to the rank 0 process. This initialization method requires that all processes have manually specified ranks. -Alternatively, the address has to be a valid IP multicast address, in which case -ranks can be assigned automatically. Multicast initialization also supports -a ``group_name`` argument, which allows you to use the same address for multiple -jobs, as long as they use different group names. +Note that multicast address is not supported anymore in the latest distributed +package. ``group_name`` is deprecated as well. :: @@ -103,31 +109,40 @@ jobs, as long as they use different group names. 
# Use address of one of the machines dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4) - # or a multicast address - rank will be assigned automatically if unspecified - dist.init_process_group(backend, init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456', - world_size=4) - Shared file-system initialization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Another initialization method makes use of a file system that is shared and visible from all machines in a group, along with a desired ``world_size``. The URL should start with ``file://`` and contain a path to a non-existent file (in an existing -directory) on a shared file system. This initialization method also supports a -``group_name`` argument, which allows you to use the same shared file path for -multiple jobs, as long as they use different group names. +directory) on a shared file system. File-system initialization will automatically +create that file if it doesn't exist, but will not delete the file. Therefore, it +is your responsibility to make sure that the file is cleaned up before the next +init_process_group call on the same file path/name. + +Note that automatic rank assignment is not supported anymore in the latest +distributed package and ``group_name`` is deprecated as well. .. warning:: This method assumes that the file system supports locking using ``fcntl`` - most local systems and NFS support it. +.. warning:: + This method does not clean up and remove the file and it is your responsibility + to remove the file at the end of the training. This is especially important + if you plan to call init_process_group multiple times on the same file name. + In other words, if the file is not removed/cleaned up and you call + init_process_group again on that file, it is unexpected behavior and will cause + failures. The rule of thumb here is to make sure that the file is non-existent or + empty every time init_process_group is called. + :: import torch.distributed as dist - # Rank will be assigned automatically if unspecified + # rank should always be specified dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile', - world_size=4, group_name=args.group) + world_size=4, rank=args.rank) Environment variable initialization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -174,14 +189,29 @@ as they should never be created manually, but they are guaranteed to support two * ``wait()`` - will block the process until the operation is finished. ``is_completed()`` is guaranteed to return True once it returns. -When using the MPI backend, :func:`~torch.distributed.isend` and :func:`~torch.distributed.irecv` -support non-overtaking, which has some guarantees on supporting message order. For more detail, see -http://mpi-forum.org/docs/mpi-2.2/mpi22-report/node54.htm#Node54 - .. autofunction:: isend .. autofunction:: irecv +Synchronous and asynchronous collective operations +-------------------------------------------------- +Every collective operation function supports the following two kinds of operations: + +synchronous operation - the default mode, when ``async_op`` is set to False. +When the function returns, it is guaranteed that +the collective operation is performed (not necessarily completed if it's a CUDA op since all +CUDA ops are asynchronous), and any further function calls depending on the data of the +collective operation can be called.
In the synchronous mode, the collective function does not +return anything + +asynchornous operation - when ``async_op`` is set to True. The collective operation function +returns a distributed request object. In general, you don't need to create it manually and it +is guaranteed to support two methods: + +* ``is_completed()`` - returns True if the operation has finished +* ``wait()`` - will block the process until the operation is finished. + + Collective functions -------------------- @@ -202,7 +232,7 @@ Collective functions Multi-GPU collective functions ------------------------------ -If you have more than one GPU on each node, when using the NCCL backend, +If you have more than one GPU on each node, when using the NCCL and Gloo backend, :func:`~torch.distributed.broadcast_multigpu` :func:`~torch.distributed.all_reduce_multigpu` :func:`~torch.distributed.reduce_multigpu` and diff --git a/docs/source/distributed_deprecated.rst b/docs/source/distributed_deprecated.rst new file mode 100644 index 00000000000000..303ce7b8a54922 --- /dev/null +++ b/docs/source/distributed_deprecated.rst @@ -0,0 +1,280 @@ +.. role:: hidden + :class: hidden-section + +Distributed communication package (deprecated) - torch.distributed.deprecated +============================================================================= + +.. warning:: + torch.distributed.deprecated is the older version of torch.distributed and + currently deprecated. It will be removed soon. Please use and refer the doc + for torch.distributed, which is the latest distributed communication + package for PyTorch + +.. automodule:: torch.distributed.deprecated +.. currentmodule:: torch.distributed.deprecated + +Currently torch.distributed.deprecated supports four backends, each with +different capabilities. The table below shows which functions are available +for use with CPU / CUDA tensors. +MPI supports cuda only if the implementation used to build PyTorch supports it. + + ++------------+-----------+-----------+-----------+-----------+ +| Backend | ``tcp`` | ``gloo`` | ``mpi`` | ``nccl`` | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| Device | CPU | GPU | CPU | GPU | CPU | GPU | CPU | GPU | ++============+=====+=====+=====+=====+=====+=====+=====+=====+ +| send | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| recv | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| broadcast | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| all_reduce | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| reduce | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| all_gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✓ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| scatter | ✓ | ✘ | ✘ | ✘ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ +| barrier | ✓ | ✘ | ✓ | ✓ | ✓ | ? | ✘ | ✘ | ++------------+-----+-----+-----+-----+-----+-----+-----+-----+ + +.. _distributed-deprecated-basics: + +Basics +------ + +The `torch.distributed.deprecated` package provides PyTorch support and communication primitives +for multiprocess parallelism across several computation nodes running on one or more +machines. 
The class :func:`torch.nn.parallel.deprecated.DistributedDataParallel` builds on this +functionality to provide synchronous distributed training as a wrapper around any +PyTorch model. This differs from the kinds of parallelism provided by +:doc:`multiprocessing` and :func:`torch.nn.DataParallel` in that it supports +multiple network-connected machines and in that the user must explicitly launch a separate +copy of the main training script for each process. + +In the single-machine synchronous case, `torch.distributed.deprecated` or the +:func:`torch.nn.parallel.deprecated.DistributedDataParallel` wrapper may still have advantages over other +approaches to data-parallelism, including :func:`torch.nn.DataParallel`: + +* Each process maintains its own optimizer and performs a complete optimization step with each + iteration. While this may appear redundant, since the gradients have already been gathered + together and averaged across processes and are thus the same for every process, this means + that no parameter broadcast step is needed, reducing time spent transferring tensors between + nodes. +* Each process contains an independent Python interpreter, eliminating the extra interpreter + overhead and "GIL-thrashing" that comes from driving several execution threads, model + replicas, or GPUs from a single Python process. This is especially important for models that + make heavy use of the Python runtime, including models with recurrent layers or many small + components. + +Initialization +-------------- + +The package needs to be initialized using the :func:`torch.distributed.deprecated.init_process_group` +function before calling any other methods. This blocks until all processes have +joined. + +.. autofunction:: init_process_group + +.. autofunction:: get_rank + +.. autofunction:: get_world_size + +-------------------------------------------------------------------------------- + +Currently three initialization methods are supported: + +TCP initialization +^^^^^^^^^^^^^^^^^^ + +There are two ways to initialize using TCP, both requiring a network address +reachable from all processes and a desired ``world_size``. The first way +requires specifying an address that belongs to the rank 0 process. This +initialization method requires that all processes have manually specified ranks. + +Alternatively, the address has to be a valid IP multicast address, in which case +ranks can be assigned automatically. Multicast initialization also supports +a ``group_name`` argument, which allows you to use the same address for multiple +jobs, as long as they use different group names. + +:: + + import torch.distributed.deprecated as dist + + # Use address of one of the machines + dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4) + + # or a multicast address - rank will be assigned automatically if unspecified + dist.init_process_group(backend, init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456', + world_size=4) + +Shared file-system initialization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Another initialization method makes use of a file system that is shared and +visible from all machines in a group, along with a desired ``world_size``. The URL should start +with ``file://`` and contain a path to a non-existent file (in an existing +directory) on a shared file system. 
This initialization method also supports a +``group_name`` argument, which allows you to use the same shared file path for +multiple jobs, as long as they use different group names. + +.. warning:: + This method assumes that the file system supports locking using ``fcntl`` - most + local systems and NFS support it. + +:: + + import torch.distributed.deprecated as dist + + # Rank will be assigned automatically if unspecified + dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile', + world_size=4, group_name=args.group) + +Environment variable initialization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This method will read the configuration from environment variables, allowing +one to fully customize how the information is obtained. The variables to be set +are: + +* ``MASTER_PORT`` - required; has to be a free port on machine with rank 0 +* ``MASTER_ADDR`` - required (except for rank 0); address of rank 0 node +* ``WORLD_SIZE`` - required; can be set either here, or in a call to init function +* ``RANK`` - required; can be set either here, or in a call to init function + +The machine with rank 0 will be used to set up all connections. + +This is the default method, meaning that ``init_method`` does not have to be specified (or +can be ``env://``). + +Groups +------ + +By default collectives operate on the default group (also called the world) and +require all processes to enter the distributed function call. However, some workloads can benefit +from more fine-grained communication. This is where distributed groups come +into play. :func:`~torch.distributed.deprecated.new_group` function can be +used to create new groups, with arbitrary subsets of all processes. It returns +an opaque group handle that can be given as a ``group`` argument to all collectives +(collectives are distributed functions to exchange information in certain well-known programming patterns). + +.. autofunction:: new_group + +Point-to-point communication +---------------------------- + +.. autofunction:: send + +.. autofunction:: recv + +:func:`~torch.distributed.deprecated.isend` and :func:`~torch.distributed.deprecated.irecv` +return distributed request objects when used. In general, the type of this object is unspecified +as they should never be created manually, but they are guaranteed to support two methods: + +* ``is_completed()`` - returns True if the operation has finished +* ``wait()`` - will block the process until the operation is finished. + ``is_completed()`` is guaranteed to return True once it returns. + +When using the MPI backend, :func:`~torch.distributed.deprecated.isend` and :func:`~torch.distributed.deprecated.irecv` +support non-overtaking, which has some guarantees on supporting message order. For more detail, see +http://mpi-forum.org/docs/mpi-2.2/mpi22-report/node54.htm#Node54 + +.. autofunction:: isend + +.. autofunction:: irecv + +Collective functions +-------------------- + +.. autofunction:: broadcast + +.. autofunction:: all_reduce + +.. autofunction:: reduce + +.. autofunction:: all_gather + +.. autofunction:: gather + +.. autofunction:: scatter + +.. 
autofunction:: barrier + +Multi-GPU collective functions +------------------------------ + +If you have more than one GPU on each node, when using the NCCL backend, +:func:`~torch.distributed.deprecated.broadcast_multigpu` +:func:`~torch.distributed.deprecated.all_reduce_multigpu` +:func:`~torch.distributed.deprecated.reduce_multigpu` and +:func:`~torch.distributed.deprecated.all_gather_multigpu` support distributed collective +operations among multiple GPUs within each node. These functions can potentially +improve the overall distributed training performance and be easily used by +passing a list of tensors. Each Tensor in the passed tensor list needs +to be on a separate GPU device of the host where the function is called. Note +that the length of the tensor list needs to be identical among all the +distributed processes. Also note that currently the multi-GPU collective +functions are only supported by the NCCL backend. + +For example, if the system we use for distributed training has 2 nodes, each +of which has 8 GPUs. On each of the 16 GPUs, there is a tensor that we would +like to all-reduce. The following code can serve as a reference: + +Code running on Node 0 + +:: + + import torch + import torch.distributed.deprecated as dist + + dist.init_process_group(backend="nccl", + init_method="file:///distributed_test", + world_size=2, + rank=0) + tensor_list = [] + for dev_idx in range(torch.cuda.device_count()): + tensor_list.append(torch.FloatTensor([1]).cuda(dev_idx)) + + dist.all_reduce_multigpu(tensor_list) + +Code running on Node 1 + +:: + + import torch + import torch.distributed.deprecated as dist + + dist.init_process_group(backend="nccl", + init_method="file:///distributed_test", + world_size=2, + rank=1) + tensor_list = [] + for dev_idx in range(torch.cuda.device_count()): + tensor_list.append(torch.FloatTensor([1]).cuda(dev_idx)) + + dist.all_reduce_multigpu(tensor_list) + +After the call, all 16 tensors on the two nodes will have the all-reduced value +of 16 + +.. autofunction:: broadcast_multigpu + +.. autofunction:: all_reduce_multigpu + +.. autofunction:: reduce_multigpu + +.. autofunction:: all_gather_multigpu + + +Launch utility +-------------- + +The `torch.distributed.deprecated` package also provides a launch utility in +`torch.distributed.deprecated.launch`. + +.. automodule:: torch.distributed.launch diff --git a/docs/source/index.rst b/docs/source/index.rst index 5b37b3301f3db2..7732366d38a61c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. optim torch.autograd torch.distributed + torch.distributed.deprecated torch.distributions torch.multiprocessing bottleneck From f84693efa96317ffb3a1c68045fa924d2d6bf2b2 Mon Sep 17 00:00:00 2001 From: Duc Ngo Date: Tue, 11 Sep 2018 04:35:44 -0700 Subject: [PATCH 026/237] nomnigraph - Improvements to subgraph matching APIs (#11418) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11418 Several improvements that aim to make the APIs more straightforward to use - Get rid of helper methods subgraph and nonTerminal . Users now should create a NNMatchGraph directly via graph's createNode and createEdge API - Get rid of operatorSubgraph helper method - invertGraphTraversal flag applies to both the match graph and the scanned graph. This allows user to create match graph in the same direction as the scanned graph, thus reduce confusion. 
- additional parameters of matchNode (count, includeInSubgraph, nonTerminal) are removed from the constructors and moved into setter methods. (We no longer enforce that MatchNode is immutable but this helps improve code clarity). - Tests are updated to reflect the changes Follow up changes: - Possibly clean up the tests further. This change aims to minimally modify the unit tests. - Help a validity check that enforce the current limitation of the match graph (single source node), and throws if the match graph does not satisfy the criteria. - Have the single source node be detected automatically and callers just need to pass in the matchGraph instead of the source node reference. Differential Revision: D9732565 fbshipit-source-id: ae8320e2bc89b867f6bb4b1c1aad635f4b219fa1 --- .../nomnigraph/Representations/NeuralNet.cc | 15 ++- .../nomnigraph/Representations/NeuralNet.h | 17 ++- .../Transformations/SubgraphMatcher.h | 100 +++++++++--------- caffe2/core/nomnigraph/tests/NeuralNetTest.cc | 31 +++--- .../nomnigraph/tests/SubgraphMatcherTest.cc | 62 ++++++----- caffe2/python/pybind_state_nomni.cc | 24 +++-- 6 files changed, 127 insertions(+), 122 deletions(-) diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index 2765b48435e7bd..28f33a43cbff2f 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -216,17 +216,14 @@ NNNodeMatchCriteria criteriaSingleConsumer() { "Single consumer"); } -NNNodeMatchCriteria matchTensor() { - return matchOp("matchTensor"); +NNNodeMatchCriteria matchTensor(const std::string& debugString) { + return matchOp(debugString); } -NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria, - int count) { - return subgraph( - g, matchTensor(), {subgraph(g, root, childrenCriteria)}, count); +NNMatchNode matchExternalTensorNode(const std::string& debugString) { + return NNMatchNode(matchTensor(debugString)) + .nonTerminal() + .excludeFromSubgraph(); } } // namespace nn diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index f4c3940acd0711..09273a5fe15981 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -474,8 +474,6 @@ NNNodeMatchCriteria matchOp(const std::string& debugString = "matchOp") { debugString); } -CAFFE2_API NNNodeMatchCriteria matchTensor(); - template NNNodeMatchCriteria matchOp( const std::function predicate, @@ -489,6 +487,12 @@ NNNodeMatchCriteria matchOp( debugString); }; +CAFFE2_API NNNodeMatchCriteria +matchTensor(const std::string& debugString = "matchTensor"); + +CAFFE2_API NNMatchNode +matchExternalTensorNode(const std::string& debugString = "matchExternalTensor"); + struct CAFFE2_API NNNodeMatch { static bool isMatch( const NNGraph::NodeRef& node, @@ -500,15 +504,6 @@ struct CAFFE2_API NNNodeMatch { using NNSubgraphMatcher = nom::matcher::SubgraphMatcher; -// This helper method makes it easy to create matching criteria in NNGraph. -// For example, operatorSubgraph(opMatch, ...) will refer to a tree like this: -// ... 
-> opMatch -> opMatch_Output -CAFFE2_API NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria = {}, - int count = 1); - } // namespace nn } // namespace repr diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index f11975284b217c..66494dba3275ce 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -32,15 +32,8 @@ template class CAFFE2_API MatchNode { public: static const int kStarCount = -1; - MatchNode( - const NodeMatchCriteria& criteria, - bool includeInSubgraph = true, - int count = 1, - bool nonTerminal = false) - : criteria_(criteria), - includeInSubgraph_(includeInSubgraph), - count_(count), - nonTerminal_(nonTerminal) {} + + MatchNode(const NodeMatchCriteria& criteria) : criteria_(criteria) {} MatchNode() = default; MatchNode(const MatchNode&) = default; @@ -55,6 +48,25 @@ class CAFFE2_API MatchNode { return count_; } + MatchNode& count(int count) { + count_ = count; + return *this; + } + + MatchNode& starCount() { + return count(kStarCount); + } + + MatchNode& nonTerminal() { + nonTerminal_ = true; + return *this; + } + + MatchNode& excludeFromSubgraph() { + includeInSubgraph_ = false; + return *this; + } + bool isNonTerminal() const { return nonTerminal_; } @@ -65,9 +77,9 @@ class CAFFE2_API MatchNode { private: NodeMatchCriteria criteria_; - bool includeInSubgraph_; - int count_; - bool nonTerminal_; + int count_ = 1; + bool includeInSubgraph_ = true; + bool nonTerminal_ = false; }; template @@ -76,38 +88,12 @@ using MatchGraph = Graph>; template using MatchNodeRef = typename MatchGraph::NodeRef; -template -MatchNodeRef subgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - const std::vector>& children, - int count = 1, - bool includeInSubgraph = true) { - auto result = graph.createNode( - MatchNode(root, includeInSubgraph, count, false)); - for (auto child : children) { - graph.createEdge(result, child); - } - return result; -} - -// Note that for nonTerminalSubgraph, the default value for includeInSubgraph -// is false since we typically do not want to include a nonTerminal node -// in the matched subgraph. -template -MatchNodeRef nonTerminalSubgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - int count = 1, - bool includeInSubgraph = false) { - return graph.createNode( - MatchNode(root, includeInSubgraph, count, true)); -} - // TODO: Reuse convertToDotString once convertToDotString can work // with subgraph. template -std::string debugString(MatchNodeRef rootCriteriaRef) { +std::string debugString( + MatchNodeRef rootCriteriaRef, + bool invertGraphTraversal) { std::ostringstream out; auto rootNode = rootCriteriaRef->data(); out << "{rootCriteria = '" << rootNode.getCriteria() << "'"; @@ -117,11 +103,14 @@ std::string debugString(MatchNodeRef rootCriteriaRef) { if (rootNode.isNonTerminal()) { out << ", nonTerminal = " << rootNode.isNonTerminal(); } - auto outEdges = rootCriteriaRef->getOutEdges(); - if (!outEdges.empty()) { + auto edges = invertGraphTraversal ? rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + if (!edges.empty()) { out << ", childrenCriteria = ["; - for (auto& child : outEdges) { - out << debugString(child->head()) << ", "; + for (auto& child : edges) { + auto nextNode = invertGraphTraversal ? 
child->tail() : child->head(); + out << debugString(nextNode, invertGraphTraversal) + << ", "; } out << "]"; } @@ -294,7 +283,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " is not the same as " << matchedNode << " which previously matched criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -307,7 +297,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " does not match criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -326,8 +317,10 @@ struct SubgraphMatcher { invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); int numEdges = edges.size(); - const auto outEdges = rootCriteriaRef->getOutEdges(); - int numChildrenCriteria = outEdges.size(); + const auto criteriaEdges = invertGraphTraversal + ? rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + int numChildrenCriteria = criteriaEdges.size(); // The current algorithm implies that the ordering of the children is // important. The children nodes will be matched with the children subgraph @@ -336,7 +329,9 @@ struct SubgraphMatcher { int currentEdgeIdx = 0; for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; criteriaIdx++) { - auto childrenCriteriaRef = outEdges[criteriaIdx]->head(); + auto childrenCriteriaRef = invertGraphTraversal + ? criteriaEdges[criteriaIdx]->tail() + : criteriaEdges[criteriaIdx]->head(); int expectedCount = childrenCriteriaRef->data().getCount(); bool isStarCount = @@ -374,7 +369,7 @@ struct SubgraphMatcher { debugMessage << "Child node at " << child << " does not match child criteria " << debugString( - childrenCriteriaRef) + childrenCriteriaRef, invertGraphTraversal) << ". 
We expected " << expectedCount << " matches but only found " << countMatch << "."; return SubgraphMatchResultType::notMatched(debugMessage.str()); @@ -399,7 +394,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Expected " << expectedCount << " matches for child criteria " - << debugString(childrenCriteriaRef) + << debugString( + childrenCriteriaRef, invertGraphTraversal) << " but only found " << countMatch; return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { diff --git a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc index 32c0f20f3571bf..874da120b5be8f 100644 --- a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc +++ b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc @@ -43,27 +43,28 @@ TEST(NeuralNetGraph, ReplaceGraph) { */ auto mg = NNMatchGraph(); - // clang-format off - auto matchSumOutput = operatorSubgraph(mg, - matchOp(), { - nonTerminalSubgraph(mg, matchTensor(), 2) - });; - auto pattern = subgraph(mg, - matchOp(), { - matchSumOutput - }); - // clang-format on + auto matchSumInput = + mg.createNode(std::move(matchExternalTensorNode().count(2))); + auto matchSum = mg.createNode(matchOp("matchSum")); + mg.createEdge(matchSumInput, matchSum); + + auto matchSumOutput = mg.createNode(matchTensor("matchSumOutput")); + mg.createEdge(matchSum, matchSumOutput); + + auto matchRelu = mg.createNode(matchOp("matchRelu")); + mg.createEdge(matchSumOutput, matchRelu); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, pattern).isMatch()); + auto matchRoot = matchRelu; + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, matchRoot).isMatch()); EXPECT_FALSE( - NNSubgraphMatcher::isSubgraphMatch(reluOutput, pattern).isMatch()); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, pattern).isMatch()); + NNSubgraphMatcher::isSubgraphMatch(reluOutput, matchRoot).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, matchRoot).isMatch()); - EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, pattern).isMatch()); + EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, matchRoot).isMatch()); NNSubgraphMatcher::replaceSubgraph( graph, - pattern, + matchRoot, [&matchSumOutput]( NNGraph& g, NNGraph::NodeRef relu, diff --git a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc index 7ed996aa5f1bdc..ee677665c6546d 100644 --- a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc +++ b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc @@ -41,27 +41,22 @@ TestMatchGraph::NodeRef Tree( const Criteria& root, const std::vector& children = {}, int count = 1) { - return subgraph(graph, root, children, count); + auto result = graph.createNode(std::move(TestMatchNode(root).count(count))); + for (auto& child : children) { + graph.createEdge(result, child); + } + return result; } TestMatchGraph::NodeRef NonTerminal(const Criteria& root, int count = 1) { - return nonTerminalSubgraph(graph, root, count); + return graph.createNode( + std::move(TestMatchNode(root).count(count).nonTerminal())); } Criteria any() { return Criteria("*"); } -// Make it more concise to create matching criteria in dataflow graph. -// For example, operatorTree("opA", ...) will refer to a tree like this: -// ... 
-> opA -> opA_Output -TestMatchGraph::NodeRef operatorTree( - const Criteria& root, - const std::vector& childrenCriteria = {}, - int count = 1) { - return Tree(any(), {Tree(root, childrenCriteria)}, count); -} - std::map TestGraphNodePrinter( TestGraph::NodeRef node) { std::map labelMap; @@ -185,20 +180,35 @@ struct DataFlowTestGraphCriteria { TestMatchGraph::NodeRef matchOpG; DataFlowTestGraphCriteria() { - // clang-format off - matchOpCOutput = operatorTree("opC", { - NonTerminal(Criteria("input"), TestMatchNode::kStarCount) - }); - matchOpG = Tree( - Criteria("opG"),{ - operatorTree("opF", { - operatorTree("opB", { - matchOpCOutput, matchOpCOutput, - }) - }), - NonTerminal(any()) // matches dataI - }); - // clang-format on + auto matchOpCInputs = + graph.createNode(std::move(TestMatchNode(Criteria("input")) + .starCount() + .nonTerminal() + .excludeFromSubgraph())); + auto matchOpC = graph.createNode(Criteria("opC")); + graph.createEdge(matchOpCInputs, matchOpC); + + matchOpCOutput = graph.createNode(any()); + graph.createEdge(matchOpC, matchOpCOutput); + + auto matchOpB = graph.createNode(Criteria("opB")); + graph.createEdge(matchOpCOutput, matchOpB); + graph.createEdge(matchOpCOutput, matchOpB); + + auto matchOpBOutput = graph.createNode(any()); + graph.createEdge(matchOpB, matchOpBOutput); + + auto matchOpF = graph.createNode(Criteria("opF")); + graph.createEdge(matchOpBOutput, matchOpF); + + auto matchOpFOutput = graph.createNode(any()); + graph.createEdge(matchOpF, matchOpFOutput); + + matchOpG = graph.createNode(Criteria("opG")); + auto matchDataI = graph.createNode( + std::move(TestMatchNode(any()).nonTerminal().excludeFromSubgraph())); + graph.createEdge(matchOpFOutput, matchOpG); + graph.createEdge(matchDataI, matchOpG); } }; diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index fbfe143f66cee0..a8cbb661d69c21 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -233,9 +233,11 @@ void addNomnigraphMethods(pybind11::module& m) { auto nnOp = nn::get(node); return opName == nnOp->getName(); }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("node"), @@ -243,9 +245,11 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "createNode", [](nn::NNMatchGraph* g, nom::repr::Tensor& tensor, bool strict) { - return g->createNode( - nom::matcher::MatchNode( - nn::matchTensor(), true, 1, !strict)); + auto node = nn::NNMatchNode(nn::matchTensor()); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("tensor"), @@ -255,9 +259,11 @@ void addNomnigraphMethods(pybind11::module& m) { [](nn::NNMatchGraph* g, bool strict) { auto match = nn::NNNodeMatchCriteria( [](NNGraph::NodeRef node) { return true; }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("strict") = false) From 0ddbe668cda54271aeca99adefcf052f163847eb Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 05:56:17 -0700 Subject: [PATCH 027/237] Improve shape analysis to cover all most commonly used ops (#11358) Summary: 
[Here's a list](https://gist.github.com/apaszke/f0821840bdcc67a977832dc58acc1b85) of ops that are in `register_aten_ops.cpp`, but aren't supported in shape prop. Everything else should work now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11358 Differential Revision: D9753693 Pulled By: apaszke fbshipit-source-id: efeae0126ce16cb56b8797fc5246405588bcae3c --- test/test_jit.py | 58 +- tools/jit/gen_jit_dispatch.py | 6 +- tools/jit/templates/register_aten_ops.cpp | 4 - torch/csrc/jit/graph_executor.cpp | 5 +- torch/csrc/jit/ivalue.h | 29 + torch/csrc/jit/passes/shape_analysis.cpp | 805 ++++++++++++++++++++-- torch/csrc/jit/pybind.h | 7 +- torch/csrc/jit/python_ir.cpp | 8 +- torch/csrc/jit/type.h | 2 +- 9 files changed, 834 insertions(+), 90 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 382daa03ec1614..ffd2c853481eb9 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7059,6 +7059,19 @@ def forward(self, x, y): 'test_split_dim_neg0', } +EXCLUDE_TYPE_CHECK = { + # slogdet tests use itemgetter to select its only differentiable output, + # but this happens outside of the graph we handle, so there are fewer + # reference outputs than graph outputs. + 'test_slogdet_1x1_neg_det', + 'test_slogdet_1x1_pos_det', + 'test_slogdet_distinct_singular_values', + 'test_slogdet_neg_det', + 'test_slogdet_pos_det', + 'test_slogdet_symmetric', + 'test_slogdet_symmetric_pd', +} + # known to be failing in script EXCLUDE_SCRIPT = { # TODO: Fix var/std @@ -7182,7 +7195,9 @@ def traced_fn(*inputs, **kwargs): fn_tensors, inputs_tensors = partial_apply_nontensors(fn, inputs, **kwargs) traced = torch.jit.trace(fn_tensors, inputs_tensors) self.assertExportImport(traced.graph, inputs_tensors) - return traced(*inputs_tensors) + output = traced(*inputs_tensors) + traced_fn.last_graph = traced.graph_for(*inputs_tensors) + return output return traced_fn script_template = ''' @@ -7222,12 +7237,30 @@ def script_fn(*args, **kwargs): script = script_template.format(', '.join(formals), call) CU = torch.jit.CompilationUnit(script) self.assertExportImport(CU.the_method.graph, tensors) - - return output_process_fn(CU.the_method(*tensors)) + output = output_process_fn(CU.the_method(*tensors)) + script_fn.last_graph = CU.the_method.graph_for(*tensors) + return output return script_fn -def check_against_reference(self, func, reference_func, args, kwargs=None, allow_unused=True): +def check_output_types(self, func, ref_outputs, args, kwargs): + graph = getattr(func, 'last_graph', None) + if not isinstance(ref_outputs, tuple): + ref_outputs = (ref_outputs,) + types = [o.type() for o in graph.outputs()] + self.assertEqual(len(types), len(ref_outputs)) + for i, (t, ref_out) in enumerate(zip(types, ref_outputs)): + if isinstance(ref_out, list): + assert len(ref_out) > 0 + elem = ref_out[0] + assert isinstance(elem, torch.Tensor) + self.assertTrue(t.isSubtypeOf(torch._C.ListType.ofTensors())) + else: + ref_type = torch._C.Type.inferFrom(ref_out) + self.assertTrue(ref_type.isSubtypeOf(t)) + + +def check_against_reference(self, func, reference_func, args, kwargs=None, allow_unused=True, check_types=True): kwargs = kwargs if kwargs else {} def allSum(vs): @@ -7252,6 +7285,9 @@ def clone_inputs(requires_grad): outputs_test = func(*nograd_inputs, **kwargs) self.assertEqual(outputs, outputs_test) + if check_types: + check_output_types(self, func, outputs_test, nograd_inputs, kwargs) + # test single grad case outputs = reference_func(*recording_inputs, **kwargs) grads = 
torch.autograd.grad(allSum(outputs), recording_tensors, @@ -7577,15 +7613,19 @@ def fn(*inputs, **kwargs): output = getattr(inputs[0], name)(*inputs[1:], **kwargs) return output_process_fn(output) + check_types = test_name not in EXCLUDE_TYPE_CHECK + if not is_inplace and name not in EXCLUDE_GRADCHECK and not exclude_tensor_method(name, test_name): if test_name not in EXCLUDE_TRACED: check_against_reference(self, create_traced_fn(self, fn), - fn, (self_variable,) + args_variable, kwargs_variable) + fn, (self_variable,) + args_variable, kwargs_variable, + check_types=check_types) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, create_script_fn(self, name, 'method', output_process_fn), - fn, (self_variable,) + args_variable, kwargs_variable) + fn, (self_variable,) + args_variable, kwargs_variable, + check_types=check_types) # functional interface tests if hasattr(torch, name) and name not in EXCLUDE_FUNCTIONAL: @@ -7597,12 +7637,14 @@ def fn(*inputs, **kwargs): f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(self, fn), fn, f_args_variable, kwargs_variable) + check_against_reference(self, create_traced_fn(self, fn), fn, + f_args_variable, kwargs_variable, check_types=check_types) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, create_script_fn(self, name, 'functional', output_process_fn), - fn, f_args_variable, kwargs_variable) + fn, f_args_variable, kwargs_variable, + check_types=check_types) check(name) inplace_name = name + '_' diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index ff7fce56e91552..31ab5ed110f0de 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -56,11 +56,11 @@ def jit_type_of(arg): # map from aten 'simple_type' to the function that will turn a tensor into # that type FROM_IVALUE = { - 'Device': 'as_device({}.toIntList()->elements())', + 'Device': '{}.to()', 'IntList': '{}.toIntList()->elements()', - 'Layout': 'static_cast({}.toInt())', + 'Layout': '{}.to()', 'Scalar': '{}.toScalar()', - 'ScalarType': 'static_cast({}.toInt())', + 'ScalarType': '{}.to()', 'Tensor': '{}.toTensor()', 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 4ac499b99faaed..de205de52a484e 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -50,10 +50,6 @@ std::array as_bool_array(at::ArrayRef vec) { return res; } -at::Device as_device(ArrayRef elements) { - return at::Device(static_cast(elements[0]), elements[1]); -} - RegisterOperators reg({ ${constructors} }); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index cdfeab0e730179..d77f1b2be1aa28 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -379,6 +379,10 @@ struct GraphExecutorImpl { // Phase 2. Propagate detailed information about the spec through the // graph (enabled more specializations in later passes). + // Shape propagation sometimes depends on certain arguments being + // constants, and constant propagation doesn't need shape information + // anyway, so it's better to run it first. + ConstantPropagation(opt_graph); PropagateInputShapes(*opt_graph, spec); // Phase 3. Run differentiable optimizations (i.e. 
simple graph rewrites that @@ -427,7 +431,6 @@ struct GraphExecutorImpl { EliminateDeadCode(graph); EliminateCommonSubexpression(graph); UnrollLoops(graph); - ConstantPropagation(graph); PeepholeOptimize(graph); CheckInplace(graph); BatchMM(graph); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 7d9bea6e678eec..3602e7deb8eb89 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -358,6 +358,35 @@ DEFINE_TO(std::vector, toTensorListRef) #undef DEFINE_TO +#define DEFINE_TO_WITH_BODY(type, body) \ +template<> \ +inline type IValue::to() && { \ + body(std::move(*this)); \ +} \ +template<> \ +inline type IValue::to() const & { \ + body((*this)); \ +} + +#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); +#define LAYOUT_BODY(this) return static_cast(this.toInt()); +#define DEVICE_BODY(this) \ + /* NB: const_list might be a move of the vector, so we need to */ \ + /* assign it to prevent its deallocation. */ \ + auto && const_list = this.toIntList(); \ + const auto & elems = const_list->elements(); \ + JIT_ASSERT(elems.size() == 2); \ + return at::Device(static_cast(elems[0]), elems[1]); + +DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) +DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) +DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) + +#undef DEFINE_TO_WITH_BODY +#undef SCALAR_TYPE_BODY +#undef LAYOUT_BODY +#undef DEVICE_BODY + inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { as_intrusive_ptr = v.release(); diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 7064e2b4b365a1..7499546f27684f 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -208,8 +208,7 @@ bool PropagateShapeOnNodeByRunningIt(Node* node) { // However, we allow primitive returns because we want to support mixed // primitive/tensor outputs. -bool PropagateTensorShapeOnNode( - Node * node, bool insert_expands, std::vector types); +bool PropagateTensorShapeOnNode(Node * node, bool insert_expands); bool PropagateCompleteShapeOnNode( Node * node, bool insert_expands, std::vector types); @@ -306,8 +305,30 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } case prim::ImplicitTensorToNum: case prim::TensorToNum: - case prim::NumToTensor: return; // correct num type is already set + case prim::NumToTensor: { + if (node->input()->type()->isSubtypeOf(IntType::get())) { + node->output()->setType(TensorType::create(at::kLong, -1, 0)); + } else { + JIT_ASSERT(node->input()->type()->isSubtypeOf(FloatType::get())); + node->output()->setType(TensorType::create(at::kDouble, -1, 0)); + } + return; + } + case prim::TupleConstruct: { + // We refresh the tuple type, because the input types could have been refined. 
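+      // (e.g. an input that was still Dynamic when the tuple was constructed may now
+      // carry a concrete TensorType, and the element types should reflect that.)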
+ node->output()->setType(TupleType::create(fmap(node->inputs(), [](Value *v) { return v->type(); }))); + return; + } + case prim::TupleUnpack: { + auto tuple_type = node->input()->type()->cast(); + JIT_ASSERT(tuple_type && tuple_type->elements().size() == node->outputs().size()); + auto elems = tuple_type->elements(); + for (size_t i = 0; i < node->outputs().size(); ++i) { + node->output(i)->setType(elems[i]); + } + return; + } case prim::Constant: { if(node->output()->type()->isSubtypeOf(DynamicType::get())) { node->output()->inferTypeFrom(node->t(attr::value)); @@ -344,10 +365,8 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } } - if (auto maybe_tensor_types = gatherTensorTypes(node)) { - if (PropagateTensorShapeOnNode(node, insert_expands, std::move(*maybe_tensor_types))) { - return; - } + if (PropagateTensorShapeOnNode(node, insert_expands)) { + return; } if (PropagateShapeOnNodeByRunningIt(node)) { @@ -356,9 +375,11 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { return setUnshapedType(node); } -bool PropagateTensorShapeOnNode(Node * node, bool insert_expands, - std::vector tensor_types) { +bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { static const auto broadcast = [](std::vector& tensor_types) -> TensorTypePtr { + if (tensor_types.size() == 1) { + return tensor_types[0]; + } JIT_ASSERT(!tensor_types.empty()); auto any_type = tensor_types[0]; auto max_dims = any_type->dim(); @@ -367,85 +388,727 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands, } return TensorType::create(any_type->scalarType(), any_type->device(), max_dims); }; + + using type_vec_t = std::vector; + // Formula is expected to return a vector of length equal to the number of tensor + // outputs of the node, or an empty vector which implies that it failed to propagate. 
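+  // For example, the formula registered below for simple unary ops just returns the
+  // (possibly refined) TensorType of the node's first input, unchanged.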
+ using formula_t = std::function; + static std::mutex shape_formulas_mutex; + static std::vector> shape_formulas; + struct register_formula_for { + register_formula_for(OperatorSet operators, formula_t formula) { + std::unique_lock lock {shape_formulas_mutex}; + shape_formulas.emplace_back(std::move(operators), std::move(formula)); + } + }; + + // Requirements: + // dims : preserved + // scalar type : preserved + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + static const register_formula_for simple_unary_ops {{ + "aten::abs(Tensor self) -> Tensor", + "aten::acos(Tensor self) -> Tensor", + "aten::neg(Tensor self) -> Tensor", + "aten::t(Tensor self) -> Tensor", + "aten::sigmoid(Tensor self) -> Tensor", + "aten::tanh(Tensor self) -> Tensor", + "aten::exp(Tensor self) -> Tensor", + "aten::relu(Tensor self) -> Tensor", + "aten::asin(Tensor self) -> Tensor", + "aten::atan(Tensor self) -> Tensor", + "aten::ceil(Tensor self) -> Tensor", + "aten::clone(Tensor self) -> Tensor", + "aten::contiguous(Tensor self) -> Tensor", + "aten::bernoulli(Tensor self) -> Tensor", + "aten::celu(Tensor self, Scalar alpha) -> Tensor", + "aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", + "aten::clamp_max(Tensor self, Scalar max) -> Tensor", + "aten::clamp_min(Tensor self, Scalar min) -> Tensor", + "aten::alpha_dropout(Tensor input, float p, int train) -> Tensor", + "aten::bernoulli(Tensor self, float p, Generator generator) -> Tensor", + "aten::cos(Tensor self) -> Tensor", + "aten::cosh(Tensor self) -> Tensor", + "aten::digamma(Tensor self) -> Tensor", + "aten::dropout(Tensor input, float p, int train) -> Tensor", + "aten::elu(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) -> Tensor", + "aten::erf(Tensor self) -> Tensor", + "aten::erfc(Tensor self) -> Tensor", + "aten::erfinv(Tensor self) -> Tensor", + "aten::exp(Tensor self) -> Tensor", + "aten::expm1(Tensor self) -> Tensor", + "aten::log(Tensor self) -> Tensor", + "aten::log10(Tensor self) -> Tensor", + "aten::log1p(Tensor self) -> Tensor", + "aten::log2(Tensor self) -> Tensor", + "aten::log_sigmoid(Tensor self) -> Tensor", + "aten::log_softmax(Tensor self, int dim) -> Tensor", + "aten::floor(Tensor self) -> Tensor", + "aten::frac(Tensor self) -> Tensor", + "aten::flip(Tensor self, int[] dims) -> Tensor", + "aten::feature_alpha_dropout(Tensor input, float p, int train) -> Tensor", + "aten::feature_dropout(Tensor input, float p, int train) -> Tensor", + "aten::hardshrink(Tensor self, Scalar lambd) -> Tensor", + "aten::hardtanh(Tensor self, Scalar min_val, Scalar max_val) -> Tensor", + "aten::glu(Tensor self, int dim) -> Tensor", + "aten::inverse(Tensor self) -> Tensor", + "aten::leaky_relu(Tensor self, Scalar negative_slope) -> Tensor", + "aten::lgamma(Tensor self) -> Tensor", + "aten::mvlgamma(Tensor self, int p) -> Tensor", + "aten::normal(float mean, Tensor std, *, Generator generator) -> Tensor", + "aten::normal(Tensor mean, float std, *, Generator generator) -> Tensor", + "aten::permute(Tensor self, int[] dims) -> Tensor", + "aten::pin_memory(Tensor self) -> Tensor", + "aten::pinverse(Tensor self, float rcond) -> Tensor", + "aten::reciprocal(Tensor self) -> Tensor", + "aten::relu(Tensor self) -> Tensor", + "aten::round(Tensor self) -> Tensor", + "aten::rrelu(Tensor self, Scalar lower, Scalar upper, int training, Generator generator) -> Tensor", + "aten::rsqrt(Tensor self) -> Tensor", + "aten::selu(Tensor self) -> Tensor", + 
"aten::sigmoid(Tensor self) -> Tensor", + "aten::sign(Tensor self) -> Tensor", + "aten::sin(Tensor self) -> Tensor", + "aten::sinh(Tensor self) -> Tensor", + "aten::softmax(Tensor self, int dim) -> Tensor", + "aten::softplus(Tensor self, Scalar beta, Scalar threshold) -> Tensor", + "aten::softshrink(Tensor self, Scalar lambd) -> Tensor", + "aten::sqrt(Tensor self) -> Tensor", + "aten::tan(Tensor self) -> Tensor", + "aten::tanh(Tensor self) -> Tensor", + "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", + "aten::transpose(Tensor self, int dim0, int dim1) -> Tensor", + "aten::tril(Tensor self, int diagonal) -> Tensor", + "aten::triu(Tensor self, int diagonal) -> Tensor", + "aten::trunc(Tensor self) -> Tensor", + "aten::rot90(Tensor self, int k, int[] dims) -> Tensor", + "aten::narrow(Tensor self, int dim, int start, int length) -> Tensor", + "aten::slice(Tensor self, int dim, int start, int end, int step) -> Tensor", + "aten::alias(Tensor self) -> Tensor", + "aten::detach(Tensor self) -> Tensor", + "aten::cumprod(Tensor self, int dim) -> Tensor", + "aten::cumsum(Tensor self, int dim) -> Tensor", + + "aten::empty_like(Tensor self) -> Tensor", + "aten::full_like(Tensor self, Scalar fill_value) -> Tensor", + "aten::ones_like(Tensor self) -> Tensor", + "aten::rand_like(Tensor self) -> Tensor", + "aten::randint_like(Tensor self, int high) -> Tensor", + "aten::randint_like(Tensor self, int low, int high) -> Tensor", + "aten::randn_like(Tensor self) -> Tensor", + "aten::zeros_like(Tensor self) -> Tensor", + }, [](Node * node) -> type_vec_t { + auto input_type = node->input(0)->type()->cast(); + return input_type ? type_vec_t{input_type} : type_vec_t{}; + }}; + + // Requirements: + // dims : broadcast all tensor args + // scalar type : always matching and preserved + // device : always matching and preserved + // tensor inputs : * + // tensor outputs : 1 + static const register_formula_for broadcasting_ops {{ + // Tensor-Tensor operators + "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::mul(Tensor self, Tensor other) -> Tensor", + "aten::div(Tensor self, Tensor other) -> Tensor", + "aten::pow(Tensor self, Tensor exponent) -> Tensor", + "aten::min(Tensor self, Tensor other) -> Tensor", + "aten::max(Tensor self, Tensor other) -> Tensor", + "aten::fmod(Tensor self, Tensor other) -> Tensor", + "aten::remainder(Tensor self, Tensor other) -> Tensor", + "aten::lerp(Tensor self, Tensor end, Scalar weight) -> Tensor", + "aten::max(Tensor self, Tensor other) -> Tensor", + "aten::min(Tensor self, Tensor other) -> Tensor", + "aten::__and__(Tensor self, Tensor other) -> Tensor", + "aten::__or__(Tensor self, Tensor other) -> Tensor", + "aten::__xor__(Tensor self, Tensor other) -> Tensor", + "aten::__lshift__(Tensor self, Tensor other) -> Tensor", + "aten::__rshift__(Tensor self, Tensor other) -> Tensor", + "aten::__iand__(Tensor self, Tensor other) -> Tensor", + "aten::__ior__(Tensor self, Tensor other) -> Tensor", + "aten::__ixor__(Tensor self, Tensor other) -> Tensor", + "aten::__ilshift__(Tensor self, Tensor other) -> Tensor", + "aten::__irshift__(Tensor self, Tensor other) -> Tensor", + + // Tensor-Scalar operators + "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::mul(Tensor self, Scalar other) -> Tensor", + "aten::div(Tensor self, Scalar other) -> Tensor", + "aten::pow(Tensor self, Scalar exponent) 
-> Tensor", + "aten::fmod(Tensor self, Scalar other) -> Tensor", + "aten::remainder(Tensor self, Scalar other) -> Tensor", + "aten::add(Scalar other, Tensor self) -> Tensor", + "aten::sub(Scalar other, Tensor self) -> Tensor", + "aten::mul(Scalar other, Tensor self) -> Tensor", + "aten::div(Scalar other, Tensor self) -> Tensor", + "aten::pow(Scalar base, Tensor self) -> Tensor", + "aten::__and__(Tensor self, Scalar other) -> Tensor", + "aten::__or__(Tensor self, Scalar other) -> Tensor", + "aten::__xor__(Tensor self, Scalar other) -> Tensor", + "aten::__lshift__(Tensor self, Scalar other) -> Tensor", + "aten::__rshift__(Tensor self, Scalar other) -> Tensor", + "aten::__iand__(Tensor self, Scalar other) -> Tensor", + "aten::__ior__(Tensor self, Scalar other) -> Tensor", + "aten::__ixor__(Tensor self, Scalar other) -> Tensor", + "aten::__ilshift__(Tensor self, Scalar other) -> Tensor", + "aten::__irshift__(Tensor self, Scalar other) -> Tensor", + + // Ops with Tensor-Tensor overloads only + "aten::atan2(Tensor self, Tensor other) -> Tensor", + + // Non-binary ops + "aten::where(Tensor condition, Tensor self, Tensor other) -> Tensor", + "aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value) -> Tensor", + "aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto maybe_tensor_types = gatherTensorTypes(node)) { + return {broadcast(*maybe_tensor_types)}; + } + return {}; + }}; + + static const auto any_tensor_type = [](Node * node) -> TensorTypePtr { + for (Value * input : node->inputs()) { + if (auto type = input->type()->cast()) { + return type; + } + } + return nullptr; + }; + + // Requirements: + // dims : always matching and preserved + // scalar type : always matching and preserved + // device : always matching and preserved + // tensor inputs : 2 + // tensor outputs : 1 + static const register_formula_for binary_ops_strict_match {{ + "aten::normal(Tensor mean, Tensor std, *, Generator generator) -> Tensor", + "aten::bernoulli(Tensor self, Tensor p, Generator generator) -> Tensor", + "aten::mm(Tensor self, Tensor mat2) -> Tensor", + "aten::bmm(Tensor self, Tensor mat2) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = any_tensor_type(node)) { + return {type}; + } + return {}; + }}; + + // Requirements: + // dims : all tensor args are broadcast + // scalar type : byte/uint8 + // device : always matching and preserved + // tensor inputs : * + // tensor outputs : 1 + static const register_formula_for comparison_ops {{ + "aten::lt(Tensor self, Tensor other) -> Tensor", + "aten::le(Tensor self, Tensor other) -> Tensor", + "aten::gt(Tensor self, Tensor other) -> Tensor", + "aten::ge(Tensor self, Tensor other) -> Tensor", + "aten::eq(Tensor self, Tensor other) -> Tensor", + "aten::ne(Tensor self, Tensor other) -> Tensor", + "aten::lt(Tensor self, Scalar other) -> Tensor", + "aten::le(Tensor self, Scalar other) -> Tensor", + "aten::gt(Tensor self, Scalar other) -> Tensor", + "aten::ge(Tensor self, Scalar other) -> Tensor", + "aten::eq(Tensor self, Scalar other) -> Tensor", + "aten::ne(Tensor self, Scalar other) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto maybe_tensor_types = gatherTensorTypes(node)) { + return {broadcast(*maybe_tensor_types)->toScalarType(at::kByte)}; + } + return {}; + }}; + + // Requirements: + // dims : preserved from the first argument + // scalar type : preserved from the first argument (doesn't have to match other arguments) + // device : 
always matching and preserved + // tensor inputs : * + // tensor outputs : 1 + // NB: those ops (with slight adjustments) are good candidates for restarts. + // Knowing the type and device of weights or biases is usually enough to + // infer the output type. + static const register_formula_for nn_ops_first_input_preserving {{ + "aten::batch_norm(Tensor input, Tensor weight, Tensor bias, Tensor running_mean, Tensor running_var, int training, float momentum, float eps, int cudnn_enabled) -> Tensor", + "aten::conv1d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", + "aten::conv2d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", + "aten::conv3d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor", + "aten::conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad) -> Tensor", + "aten::conv_transpose1d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] output_padding, int groups, int[] dilation) -> Tensor", + "aten::conv_transpose2d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] output_padding, int groups, int[] dilation) -> Tensor", + "aten::conv_transpose3d(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] output_padding, int groups, int[] dilation) -> Tensor", + "aten::convolution(Tensor input, Tensor weight, Tensor bias, int[] stride, int[] padding, int[] dilation, int transposed, int[] output_padding, int groups) -> Tensor", + "aten::adaptive_avg_pool1d(Tensor self, int[] output_size) -> Tensor", + "aten::adaptive_avg_pool2d(Tensor self, int[] output_size) -> Tensor", + "aten::adaptive_avg_pool3d(Tensor self, int[] output_size) -> Tensor", + "aten::avg_pool1d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int ceil_mode, int count_include_pad) -> Tensor", + "aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int ceil_mode, int count_include_pad) -> Tensor", + "aten::avg_pool3d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int ceil_mode, int count_include_pad) -> Tensor", + "aten::max_pool1d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, int ceil_mode) -> Tensor", + "aten::max_pool2d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, int ceil_mode) -> Tensor", + "aten::max_pool3d(Tensor self, int[] kernel_size, int[] stride, int[] padding, int[] dilation, int ceil_mode) -> Tensor", + "aten::max_unpool2d(Tensor self, Tensor indices, int[] output_size) -> Tensor", + "aten::max_unpool3d(Tensor self, Tensor indices, int[] output_size, int[] stride, int[] padding) -> Tensor", + "aten::reflection_pad1d(Tensor self, int[] padding) -> Tensor", + "aten::reflection_pad2d(Tensor self, int[] padding) -> Tensor", + "aten::replication_pad1d(Tensor self, int[] padding) -> Tensor", + "aten::replication_pad2d(Tensor self, int[] padding) -> Tensor", + "aten::replication_pad3d(Tensor self, int[] padding) -> Tensor", + "aten::upsample_bilinear2d(Tensor self, int[] output_size, int align_corners) -> Tensor", + "aten::upsample_linear1d(Tensor self, int[] output_size, int align_corners) -> Tensor", + "aten::upsample_nearest1d(Tensor self, int[] output_size) -> Tensor", + "aten::upsample_nearest2d(Tensor self, int[] output_size) -> Tensor", + "aten::upsample_nearest3d(Tensor self, int[] output_size) -> Tensor", + 
"aten::upsample_trilinear3d(Tensor self, int[] output_size, int align_corners) -> Tensor", + "aten::prelu(Tensor self, Tensor weight) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->input(0)->type()->cast()) { + return {type}; + } + return {}; + }}; + + // Requirements: + // dims : 0 + // scalar type : preserved + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + static const register_formula_for all_reduce_ops {{ + "aten::argmax(Tensor self) -> Tensor", + "aten::argmin(Tensor self) -> Tensor", + "aten::det(Tensor self) -> Tensor", + "aten::logdet(Tensor self) -> Tensor", + "aten::max(Tensor self) -> Tensor", + "aten::min(Tensor self) -> Tensor", + "aten::mean(Tensor self) -> Tensor", + "aten::median(Tensor self) -> Tensor", + "aten::norm(Tensor self, Scalar p) -> Tensor", + "aten::std(Tensor self, int unbiased) -> Tensor", + "aten::sum(Tensor self) -> Tensor", + "aten::trace(Tensor self) -> Tensor", + "aten::var(Tensor self, int unbiased) -> Tensor", + "aten::all(Tensor self) -> Tensor", + "aten::any(Tensor self) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->input(0)->type()->cast()) { + return {type->withDim(0)}; + } + return {}; + }}; + + // Requirements: + // dims : 0 + // scalar type : preserved if floating point, otherwise long/int64 + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + static const register_formula_for all_reduce_ops_with_integer_upcast {{ + "aten::sum(Tensor self) -> Tensor", + "aten::prod(Tensor self) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->input(0)->type()->cast()) { + return {at::isFloatingType(type->scalarType()) ? 
type->withDim(0) : type->withDim(0)->toScalarType(at::kLong)}; + } + return {}; + }}; + + static const auto multidim_reduce_with_postprocess = + [](Node * node, size_t num_reduced_dim, bool upcast_integer) -> type_vec_t { + auto maybe_keepdim = node->get(attr::keepdim); + if (!maybe_keepdim) return {}; + if (auto type = node->input(0)->type()->cast()) { + if (upcast_integer && !at::isFloatingType(type->scalarType())) { + type = type->toScalarType(at::kLong); + } + if (*maybe_keepdim) { + return {type}; + } else if (type->dim() > num_reduced_dim) { + return {type->withDim(type->dim() - num_reduced_dim)}; + } + } + return {}; + }; + + // Requirements: + // dims : preserved if keepdim == false, 1 smaller otherwise + // scalar type : preserved for first output, byte/uint8 for second output if exists + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 or 2 + // Additionally: + // - First input should be the only tensor input + // - Has a bool keepdim argument + static const register_formula_for dim_reduce_ops {{ + "aten::argmax(Tensor self, int dim, int keepdim) -> Tensor", + "aten::argmin(Tensor self, int dim, int keepdim) -> Tensor", + "aten::max_values(Tensor self, int dim, int keepdim) -> Tensor", + "aten::min_values(Tensor self, int dim, int keepdim) -> Tensor", + "aten::mean(Tensor self, int dim, int keepdim) -> Tensor", + "aten::norm(Tensor self, Scalar p, int dim, int keepdim) -> Tensor", + "aten::std(Tensor self, int dim, int unbiased, int keepdim) -> Tensor", + "aten::var(Tensor self, int dim, int unbiased, int keepdim) -> Tensor", + "aten::logsumexp(Tensor self, int dim, int keepdim) -> Tensor", + "aten::all(Tensor self, int dim, int keepdim) -> Tensor", + "aten::any(Tensor self, int dim, int keepdim) -> Tensor", + + // Ops returning indices as second output + "aten::kthvalue(Tensor self, int k, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::max(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::min(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::median(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + "aten::mode(Tensor self, int dim, int keepdim) -> (Tensor, Tensor)", + }, [](Node * node) -> type_vec_t { + // NB: Note that while this function is generally meant to be used with ops that + // have a single output, we will fix up its return right below. 
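+    // (For the two-output ops above, the second output holds indices: it shares the
+    // shape of the values but always has scalar type long, which is patched in below.)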
+ auto output_types = multidim_reduce_with_postprocess(node, /*num_reduce_dim=*/1, /*integer_upcast=*/false); + if (!output_types.empty() && node->outputs().size() == 2) { + output_types.push_back(output_types.back()->toScalarType(at::kLong)); + } + return output_types; + }}; + + // Requirements: + // dims : preserved if keepdim == false, 1 smaller otherwise + // scalar type : preserved if floating point, otherwise long/int64 + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - First input should be the only tensor input + // - has a bool keepdim argument + static const register_formula_for dim_reduce_ops_with_integer_upcast {{ + "aten::prod(Tensor self, int dim, int keepdim) -> Tensor", + }, [](Node * node) -> type_vec_t { + return multidim_reduce_with_postprocess(node, /*num_reduce_dim=*/1, /*integer_upcast=*/true); + }}; + + // Requirements: + // dims : preserved if keepdim == false, 1 smaller otherwise + // scalar type : preserved if floating point, otherwise long/int64 + // device : preserved + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - has bool keepdim and int[] dim arguments + static const register_formula_for multidim_reduce_ops_with_integer_upcast {{ + "aten::sum(Tensor self, int[] dim, int keepdim) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto dim = node->get>(attr::dim)) { + // TODO: can dim contain duplicates? + return multidim_reduce_with_postprocess(node, /*num_reduce_dim=*/dim->size(), /*integer_upcast=*/true); + } + return {}; + }}; + + static const auto get_device_int = [](at::optional dev) -> at::optional { + if (!dev) return {}; + if (dev->is_cpu()) { + return {-1}; + } + return dev->has_index() ? at::optional{dev->index()} : at::nullopt; + }; + static const auto factory_with_ndim = [](Node * node, int dim) -> type_vec_t{ + auto maybe_layout = node->get(attr::layout); + if (!maybe_layout || maybe_layout != at::kStrided) return {}; + auto maybe_device = get_device_int(node->get(attr::device)); + if (!maybe_device) return {}; + auto maybe_scalar_type = node->get(attr::dtype); + if (!maybe_scalar_type) return {}; + return {TensorType::create(*maybe_scalar_type, *maybe_device, dim)}; + }; + + // Requirements: + // dims : preserved + // scalar type : equal to value of dtype + // device : equal to value of device + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - has ScalarType dtype, Layeout layout and Device device arguments + static const register_formula_for like_factories_with_options {{ + "aten::empty_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::full_like(Tensor self, Scalar fill_value, *, int dtype, int layout, int[] device) -> Tensor", + "aten::ones_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::rand_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint_like(Tensor self, int high, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint_like(Tensor self, int low, int high, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randn_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + "aten::zeros_like(Tensor self, *, int dtype, int layout, int[] device) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->namedInput(attr::self)->type()->cast()) { + return factory_with_ndim(node, type->dim()); + } + return {}; + }}; + + // Requirements: + // dims : equal to number of elements in size + // scalar type : equal 
to value of dtype + // device : equal to value of device + // tensor inputs : 1 + // tensor outputs : 1 + // Additionally: + // - has int[] size, ScalarType dtype, Layeout layout and Device device arguments + static const register_formula_for size_factories_with_options {{ + "aten::empty(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::full(int[] size, Scalar fill_value, *, int dtype, int layout, int[] device) -> Tensor", + "aten::ones(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::rand(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randn(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::zeros(int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint(int high, int[] size, *, int dtype, int layout, int[] device) -> Tensor", + "aten::randint(int low, int high, int[] size, *, int dtype, int layout, int[] device) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto maybe_size = node->get>(attr::size)) { + return factory_with_ndim(node, maybe_size->size()); + } + return {}; + }}; + + static const auto get_cast_scalar_type = [](Node *node) -> at::ScalarType { + switch (node->kind()) { + case aten::_cast_Byte: return at::kByte; + case aten::_cast_Char: return at::kChar; + case aten::_cast_Double: return at::kDouble; + case aten::_cast_Float: return at::kFloat; + case aten::_cast_Half: return at::kHalf; + case aten::_cast_Int: return at::kInt; + case aten::_cast_Long: return at::kLong; + case aten::_cast_Short: return at::kShort; + default: AT_ASSERTM(false, "unknown node kind in get_cast_scalar_type: ", node->kind().toQualString()); + } + }; + static const register_formula_for cast_ops {{ + "aten::_cast_Byte(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Char(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Double(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Float(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Half(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Int(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Long(Tensor self, int non_blocking) -> Tensor", + "aten::_cast_Short(Tensor self, int non_blocking) -> Tensor", + }, [](Node * node) -> type_vec_t { + if (auto type = node->namedInput(attr::self)->type()->cast()) { + return {type->toScalarType(get_cast_scalar_type(node))}; + } + return {}; + }}; + + // First, try to match one of the registered formulas to their operator sets. + for (auto & entry : shape_formulas) { + if (entry.first.find(node)) { + auto types = entry.second(node); + if (types.empty()) { + return false; + } else { + auto outputs = node->outputs(); + JIT_ASSERT(types.size() == outputs.size()); + for (size_t i = 0; i < types.size(); ++i) { + JIT_ASSERT(outputs[i]->type()->isSubtypeOf(DynamicType::get())); + outputs[i]->setType(types[i]); + } + return true; + } + } + } + + // This section implements shape prop for an assorted set of nodes that only + // need partial information about their input types. 
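+  // For example, aten::dot below only needs the dtype and device of one of its
+  // inputs to conclude that it returns a zero-dimensional tensor of that kind.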
+ const auto input_type = [node](size_t index) { + return node->input(index)->type()->cast(); + }; + if (node->matches("aten::masked_select(Tensor self, Tensor mask) -> Tensor")) { + auto type = input_type(0); + auto mask_type = input_type(1); + if (type && mask_type) { + if (type->dim() == 0 && mask_type->dim() == 0) { + node->output()->setType(type->withDim(0)); + } else { + node->output()->setType(type->withDim(1)); + } + return true; + } + if (auto type = input_type(0)) { + node->output()->setType(type->withDim(1)); + return true; + } + } else if (node->matches("aten::dot(Tensor self, Tensor tensor) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(0)); + return true; + } + } else if (node->matches("aten::mv(Tensor self, Tensor vec) -> Tensor") || + node->matches("aten::addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta, Scalar alpha) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(1)); + return true; + } + } else if (node->matches("aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta, Scalar alpha) -> Tensor") || + node->matches("aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta, Scalar alpha) -> Tensor") || + node->matches("aten::addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta, Scalar alpha) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(2)); + return true; + } + } else if (node->matches("aten::baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta, Scalar alpha) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(3)); + return true; + } + } else if (node->matches("aten::index_select(Tensor self, int dim, Tensor index) -> Tensor")) { + auto type = input_type(0); + auto index_type = input_type(1); + // index_select behaves very weirdly when self.dim() == 0. It allows both 0D and 1D + // indices, and returns a value that has as many dimensions as index. + if (type && index_type) { + if (type->dim() == 0) { + node->output()->setType(type->withDim(index_type->dim())); + } else { + node->output()->setType(type); + } + return true; + } + } else if (node->matches("aten::gather(Tensor self, int dim, Tensor index) -> Tensor")) { + auto type = input_type(0); + auto index_type = input_type(1); + // Gather has this annoying edge case where index always needs to match the number of + // dims of self, **except** when self is 1D and index is 0D in which case we return + // a 0D output. 
+ if (type && index_type) { + if (index_type->dim() == 0) { + node->output()->setType(type->withDim(0)); + } else { + node->output()->setType(type); + } + return true; + } + } else if (node->matches("aten::embedding(Tensor weight, Tensor indices, int padding_idx, int scale_grad_by_freq, int sparse) -> Tensor")) { + auto weight_type = input_type(0); + auto indices_type = input_type(1); + if (weight_type && indices_type) { + node->output()->setType(weight_type->withDim(indices_type->dim() + 1)); + return true; + } + } else if (node->matches("aten::bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor bias) -> Tensor")) { + if (auto type = input_type(0)) { + node->output()->setType(type); + return true; + } + if (auto type = input_type(1)) { + node->output()->setType(type); + return true; + } + } else if (node->matches("aten::dist(Tensor self, Tensor other, Scalar p) -> Tensor")) { + if (auto type = any_tensor_type(node)) { + node->output()->setType(type->withDim(0)); + return true; + } + } + + // The code below implements formulas that need type information for all their + // tensor inputs, and have exactly one output. + std::vector tensor_types; + static const auto reshape_prop = + [](Node * node, Symbol shape_input, const std::vector& tensor_types) -> TensorTypePtr { + if (auto shape = node->get>(shape_input)) { + return tensor_types.at(0)->withDim(shape->size()); + } + return nullptr; + }; const auto getSingleOutputType = [&]() -> TypePtr { - if (node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || - node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || - node->matches("aten::mul(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::pow(Tensor self, Tensor exponent) -> Tensor") || - node->matches("aten::min(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::max(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta, Scalar alpha) -> Tensor")) { - return broadcast(tensor_types); - } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor") || - node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor") || - node->matches("aten::mul(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor") || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor")) { - return tensor_types.at(0); - } else if (node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ne(Tensor self, Tensor other) -> Tensor")) { - return broadcast(tensor_types)->toScalarType(at::kByte); - } else if (node->matches("aten::lt(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::le(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::gt(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::ge(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::eq(Tensor self, Scalar other) -> Tensor") || - 
node->matches("aten::ne(Tensor self, Scalar other) -> Tensor")) { - return tensor_types.at(0)->toScalarType(at::kByte); - } else if (node->matches("aten::neg(Tensor self) -> Tensor") || - node->matches("aten::t(Tensor self) -> Tensor") || - node->matches("aten::sigmoid(Tensor self) -> Tensor") || - node->matches("aten::tanh(Tensor self) -> Tensor") || - node->matches("aten::exp(Tensor self) -> Tensor") || - node->matches("aten::relu(Tensor self) -> Tensor") || - node->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor") || - node->matches("aten::narrow(Tensor self, int dim, int start, int length) -> Tensor") || - node->matches("aten::index_select(Tensor self, int dim, Tensor index) -> Tensor")) { - return tensor_types.at(0); - } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { return tensor_types.at(0)->toScalarType(tensor_types.at(1)->scalarType()); - } else if (node->matches("aten::sum(Tensor self) -> Tensor")) { - // TODO: this depends on the dtype argument. why don't we have access to it in here? - // TODO: integral types are upcast + } else if (node->matches("aten::view_as(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::expand_as(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::reshape_as(Tensor self, Tensor other) -> Tensor")) { + return tensor_types.at(0)->withDim(tensor_types.at(1)->dim()); + } else if (node->matches("aten::view(Tensor self, int[] size) -> Tensor") || + node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor") || + node->matches("aten::as_strided(Tensor self, int[] size, int[] stride) -> Tensor") || + node->matches("aten::as_strided(Tensor self, int[] size, int[] stride, int storage_offset) -> Tensor")) { + return reshape_prop(node, attr::size, tensor_types); + } else if (node->matches("aten::reshape(Tensor self, int[] shape) -> Tensor")) { + return reshape_prop(node, attr::shape, tensor_types); + } else if (node->matches("aten::repeat(Tensor self, int[] repeats) -> Tensor")) { + return reshape_prop(node, attr::repeats, tensor_types); + } else if (node->matches("aten::unsqueeze(Tensor self, int dim) -> Tensor")) { auto & t = tensor_types.at(0); - return TensorType::create(t->scalarType(), t->device(), 0); - } else if (node->matches("aten::sum(Tensor self, int[] dim, int keepdim) -> Tensor", - /*with_const=*/attr::keepdim)) { + return t->withDim(t->dim() + 1); + } else if (node->matches("aten::select(Tensor self, int dim, int index) -> Tensor") || + node->matches("aten::diagonal(Tensor self, int offset, int dim1, int dim2) -> Tensor")) { auto & t = tensor_types.at(0); - bool keepdim = node->get(attr::keepdim).value(); - if (!keepdim) { - if (auto dims = node->get>(attr::dim)) { - // TODO: do we need to account for duplicates in dim here? - return t->withDim(t->dim() - dims->size()); + return t->dim() > 0 ? 
t->withDim(t->dim() - 1) : nullptr; + } else if (node->matches("aten::matmul(Tensor self, Tensor other) -> Tensor")) { + int dim1 = tensor_types.at(0)->dim(); + int dim2 = tensor_types.at(1)->dim(); + if (dim1 == 1 && dim2 == 1) { + // Dot product + return tensor_types.at(0)->withDim(0); + } else if (dim1 == 2 && dim2 == 2) { + // Matrix multiply + return tensor_types.at(0); + } else if (dim1 == 1 && dim2 == 2) { + // Unsqueeze + matrix multiply + squeeze + return tensor_types.at(0); + } else if (dim1 == 2 && dim2 == 1) { + // Matrix vector multiply + return tensor_types.at(1); + } else { + // Batched matrix multiply (possibly with squeeze + unsqueeze if one argument is 1D) + auto type = broadcast(tensor_types); + if (tensor_types.at(0)->dim() == 1 || tensor_types.at(1)->dim() == 1) { + type = type->withDim(type->dim() - 1); } - return nullptr; + return type; + } + } else if (node->matches("aten::nonzero(Tensor self) -> Tensor")) { + return tensor_types.at(0)->toScalarType(at::kLong); + } else if (node->matches("aten::take(Tensor self, Tensor index) -> Tensor")) { + return tensor_types.at(1)->toScalarType(tensor_types.at(0)->scalarType()); + } else if (node->matches("aten::diagflat(Tensor self, int offset) -> Tensor")) { + return tensor_types.at(0)->withDim(2); + } else if (node->matches("aten::diag(Tensor self, int diagonal) -> Tensor")) { + auto & t = tensor_types.at(0); + if (t->dim() == 1) { + return t->withDim(2); + } else if (t->dim() == 2) { + return t->withDim(1); } else { - return t; + return nullptr; } - return nullptr; - } else if (node->matches("aten::unsqueeze(Tensor self, int dim) -> Tensor")) { + } else if (node->matches("aten::unfold(Tensor self, int dimension, int size, int step) -> Tensor")) { auto & t = tensor_types.at(0); - return t->withDim(t->dim() + 1); - } else if (node->matches("aten::view(Tensor self, int[] size) -> Tensor", /*with_const=*/attr::size) || - node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor", /*with_const=*/attr::size)) { - return tensor_types.at(0)->withDim(node->get>(attr::size)->size()); + return t->dim() == 0 ? 
t : t->withDim(t->dim() + 1); + } else if (node->matches("aten::polygamma(int n, Tensor self) -> Tensor")) { + return tensor_types.at(0); } return nullptr; }; + if (auto maybe_tensor_types = gatherTensorTypes(node)) { + tensor_types = std::move(*maybe_tensor_types); + } else { + return false; + } if (node->outputs().size() == 1) { if (auto type = getSingleOutputType()) { node->output()->setType(type); return true; } } - setUnshapedType(node); return false; } diff --git a/torch/csrc/jit/pybind.h b/torch/csrc/jit/pybind.h index 30a03bdd350ca1..f517302ec121f3 100644 --- a/torch/csrc/jit/pybind.h +++ b/torch/csrc/jit/pybind.h @@ -24,7 +24,12 @@ template <> struct type_caster { PYBIND11_TYPE_CASTER(torch::jit::IValue, _("IValue")); bool load(handle src, bool) { - return false; + try { + value = torch::jit::toIValue(src); + return true; + } catch (std::exception& e) { + return false; + } } static handle cast(torch::jit::IValue src, return_value_policy /* policy */, handle /* parent */) { diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 416de46f1e6953..11ec332251b6c5 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -440,6 +440,11 @@ void initPythonIRBindings(PyObject * module_) { .def("__repr__",[](Type & t) { return t.python_str(); }) + .def("str",[](Type & t) { + std::ostringstream s; + s << t; + return s.str(); + }) .def("kind",[](Type& t_) { Type * t = &t_; switch(t->kind()) { @@ -486,7 +491,8 @@ void initPythonIRBindings(PyObject * module_) { }) .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { return self->isSubtypeOf(other); - }); + }) + .def_static("inferFrom", inferTypeFrom); py::class_>(m, "NumberType") .def_static("get", &NumberType::get); diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index f3be38cbb2b005..e812f2e02fdee5 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -270,7 +270,7 @@ struct TORCH_API CompleteTensorType : public TensorType { if (rhs->kind() == TypeKind::DynamicType) return true; if (rhs->kind() == TypeKind::TensorType) - return *dynamic_cast(this) == *rhs; + return *expect() == *rhs; return *this == *rhs; } std::string str() const override { From 120d7694327dafcfca5fa4447f438772c9b1c729 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 05:58:11 -0700 Subject: [PATCH 028/237] Add support for tracing strings (#11506) Summary: This enabled `torch.einsum` both in tracing and in script mode. It's used all over Pyro at the moment, and is needed for any use of the JIT in there. Fixes #11157. 
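A minimal usage sketch of what this enables (the equation string and shapes below are illustrative, not taken from the patch): tracing a function that calls torch.einsum now records the equation instead of failing.

    import torch

    def outer(x, y):
        return torch.einsum('i,j->ij', (x, y))

    # the equation should show up as a string constant in the traced graph
    traced = torch.jit.trace(outer, (torch.randn(3), torch.randn(4)))
    print(traced.graph)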
zdevito fritzo neerajprad Pull Request resolved: https://github.com/pytorch/pytorch/pull/11506 Differential Revision: D9764787 Pulled By: apaszke fbshipit-source-id: 9b5251b9e7c5897034602bd07ff67b425d33326c --- test/expect/TestJit.test_cpp_cuda.expect | 12 ++++++------ test/expect/TestScript.test_string_cu.expect | 4 ++-- test/test_jit.py | 15 +++++++++++++++ tools/jit/gen_jit_dispatch.py | 4 +++- torch/csrc/jit/constants.cpp | 4 ++-- torch/csrc/jit/interned_strings.h | 1 - torch/csrc/jit/ir.cpp | 2 +- torch/csrc/jit/operator.cpp | 1 + torch/csrc/jit/tracer.cpp | 2 +- 9 files changed, 31 insertions(+), 14 deletions(-) diff --git a/test/expect/TestJit.test_cpp_cuda.expect b/test/expect/TestJit.test_cpp_cuda.expect index 54a3c16d459c69..50d2cbe963833c 100644 --- a/test/expect/TestJit.test_cpp_cuda.expect +++ b/test/expect/TestJit.test_cpp_cuda.expect @@ -105,19 +105,19 @@ graph(%0 : Float(2, 3, 4) %3 : Float(2, 3, 4) %4 : Float(2, 3, 4)) { %5 : int = prim::Constant[value=1]() - %6 : Float(2, 3, 4), %7 : Float(2, 3, 4) = prim::GradOf[name=aten::add](%0) + %6 : Float(2, 3, 4), %7 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%0) block0() { %8 : Float(2, 3, 4) = aten::mul(%0, %5) -> (%0, %8) } - %9 : Float(2, 3, 4), %10 : Float(2, 3, 4) = prim::GradOf[name=aten::mul](%6) + %9 : Float(2, 3, 4), %10 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%6) block0() { %11 : Float(2, 3, 4) = aten::mul(%6, %2) %12 : Float(2, 3, 4) = aten::mul(%6, %4) -> (%11, %12) } %13 : Dynamic = prim::AutogradAdd(%1, %9) - %14 : Float(2, 3, 4), %15 : Float(2, 3, 4) = prim::GradOf[name=aten::mul](%13) + %14 : Float(2, 3, 4), %15 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%13) block0() { %16 : Float(2, 3, 4) = aten::mul(%13, %3) %17 : Float(2, 3, 4) = aten::mul(%13, %2) @@ -146,19 +146,19 @@ graph(%0 : Float(2, 3, 4) %2 : Float(2, 3, 4) %3 : Float(2, 3, 4)) { %4 : int = prim::Constant[value=1]() - %5 : Float(2, 3, 4), %6 : Float(2, 3, 4) = prim::GradOf[name=aten::add](%0) + %5 : Float(2, 3, 4), %6 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%0) block0() { %7 : Float(2, 3, 4) = aten::mul(%0, %4) -> (%0, %7) } - %8 : Float(2, 3, 4), %9 : Float(2, 3, 4) = prim::GradOf[name=aten::mul](%5) + %8 : Float(2, 3, 4), %9 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%5) block0() { %10 : Float(2, 3, 4) = aten::mul(%5, %2) %11 : Float(2, 3, 4) = aten::mul(%5, %3) -> (%10, %11) } %12 : Dynamic = prim::AutogradAdd(%1, %8) - %13 : Float(2, 3, 4), %14 : Float(2, 3, 4) = prim::GradOf[name=aten::add](%12) + %13 : Float(2, 3, 4), %14 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%12) block0() { %15 : Float(2, 3, 4) = aten::mul(%12, %4) -> (%12, %15) diff --git a/test/expect/TestScript.test_string_cu.expect b/test/expect/TestScript.test_string_cu.expect index 0fd663ca5cc661..cefcd07bbba130 100644 --- a/test/expect/TestScript.test_string_cu.expect +++ b/test/expect/TestScript.test_string_cu.expect @@ -1,7 +1,7 @@ graph(%a : Dynamic) { %2 : int = prim::Constant[value=2]() - %1 : string = prim::Constant[string=a\n\tb\n]() - %3 : string = prim::Constant[string=aa]() + %1 : string = prim::Constant[value="a\n\tb\n"]() + %3 : string = prim::Constant[value="aa"]() = prim::Print(%a, %1, %2, %3) return (%a); } diff --git a/test/test_jit.py b/test/test_jit.py index ffd2c853481eb9..1b880fe3b0fdec 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -230,6 +230,9 @@ def getExportImportCopy(self, m): os.unlink(f.name) return imported + def assertGraphContains(self, graph, kind): + self.assertTrue(any(n.kind() == kind for n in 
graph.nodes())) + def assertExpectedONNXGraph(self, trace, *args, **kwargs): torch.onnx._optimize_trace(trace, operator_export_type=OperatorExportTypes.ONNX) self.assertExpectedGraph(trace, *args, **kwargs) @@ -1322,6 +1325,18 @@ def foo(a): x = torch.randn(5, 5) self.assertEqual(foo(x), x + x + x) + def test_einsum(self): + def outer(x, y): + return torch.einsum('i,j->ij', (x, y)) + + traced = torch.jit.trace(outer, (torch.randn(4), torch.randn(5))) + script = torch.jit.script(outer) + fns = [traced, script] + x, y = torch.randn(10), torch.randn(2) + for fn in [traced, script]: + self.assertGraphContains(fn.graph, kind='aten::einsum') + self.assertEqual(fn(x, y), outer(x, y)) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "calls .cuda()") @skipIfRocm diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 31ab5ed110f0de..2f24986c39cdc3 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -25,6 +25,7 @@ 'std::array': 'bool[2]', 'std::array': 'bool[3]', 'std::array': 'bool[4]', + 'std::string': 'str', 'Scalar': 'Scalar', 'Tensor': 'Tensor', 'TensorList': 'Tensor[]', @@ -66,6 +67,7 @@ def jit_type_of(arg): 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 'int64_t': '{}.toInt()', + 'std::string': '{}.toString()->string()', 'std::array': 'as_bool_array<2>({}.toIntList()->elements())', 'std::array': 'as_bool_array<3>({}.toIntList()->elements())', 'std::array': 'as_bool_array<4>({}.toIntList()->elements())', @@ -121,7 +123,7 @@ def is_magic_method(api_name): return api_name.startswith('__') and api_name.endswith('__') -blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'std::string', 'void*'} +blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'void*'} default_only_types = {'Generator'} diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index d7876411c687a6..8bf1ab566b31a9 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -33,7 +33,7 @@ Value* insertConstant( })); n->output()->setType(ListType::ofTensors()); } else if(val.isString()) { - n->s_(attr::string, val.toString()->string()); + n->s_(attr::value, val.toString()->string()); n->output()->setType(StringType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); @@ -86,7 +86,7 @@ RegisterOperators reg({ return 0; }; } else if (type == StringType::get()) { - auto s = node->s(attr::string); + auto s = node->s(attr::value); return [s](Stack& stack) { push(stack, s); return 0; diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 6d8f409fa3a2c0..60e086db187472 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -109,7 +109,6 @@ namespace torch { namespace jit { _(attr, transA) \ _(attr, transB) \ _(attr, name) \ - _(attr, string) \ _(attr, a) \ _(attr, b) diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index a65a743ddb24cf..48da3ec9cea913 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -125,7 +125,7 @@ void printAttributes(std::ostream & out, const Node * n, bool ignore_subgraph=fa printPrimList(out,n->is(name)); break; case AttributeKind::s: - out << escapeString(n->s(name)); + out << "\"" << escapeString(n->s(name)) << "\""; break; case AttributeKind::ss: printPrimList(out,n->ss(name)); diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index c9c4ceb0d2d8d3..75e5833535bcfc 
100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -54,6 +54,7 @@ struct SchemaParser { {"Layout", IntType::get() }, {"Device", ListType::ofInts() }, {"Scalar", NumberType::get() }, + {"str", StringType::get() }, {"float", FloatType::get() }, {"int", IntType::get() }, {"bool", IntType::get() }, // TODO: add separate bool type diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 03e31440ef5c95..4f496420956b0b 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -39,8 +39,8 @@ void addInputs(Node *n, const char * name, int64_t value) { detail::g void addInputs(Node *n, const char * name, bool value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, double value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Scalar& value) { detail::genericAddInput(n, value); } +void addInputs(Node *n, const char * name, const std::string& value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Tensor& value) { n->addInput(getValueTrace(value)); } -void addInputs(Node *n, const char * name, const std::string& value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::Generator * value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } From 80fa8e10075f4c8e0de1f209c323bcdb06f03dee Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Tue, 11 Sep 2018 06:50:33 -0700 Subject: [PATCH 029/237] Add .expand() method to distribution classes (#11341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This adds a `.expand` method for distributions that is akin to the `torch.Tensor.expand` method for tensors. It returns a new distribution instance with batch dimensions expanded to the desired `batch_shape`. Since this calls `torch.Tensor.expand` on the distribution's parameters, it does not allocate new memory for the expanded distribution instance's parameters. e.g. ```python >>> d = dist.Normal(torch.zeros(100, 1), torch.ones(100, 1)) >>> d.sample().shape torch.Size([100, 1]) >>> d.expand([100, 10]).sample().shape torch.Size([100, 10]) ``` We have already been using the `.expand` method in Pyro in our [patch](https://github.com/uber/pyro/blob/dev/pyro/distributions/torch.py#L10) of `torch.distributions`. We use this in our models to enable dynamic broadcasting. This has also been requested by a few users on the distributions slack, and we believe will be useful to the larger community. Note that currently, there is no convenient and efficient way to expand distribution instances: - Many distributions use `TransformedDistribution` (or wrap over another distribution instance. e.g. `OneHotCategorical` uses a `Categorical` instance) under the hood, or have lazy parameters. This makes it difficult to collect all the relevant parameters, broadcast them and construct new instances. - In the few cases where this is even possible, the resulting implementation would be inefficient since we will go through a lot of broadcasting and args validation logic in `__init__.py` that can be avoided. The `.expand` method allows for a safe and efficient way to expand distribution instances. 
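As a concrete sketch of the pattern this patch adds, the body of `Normal.expand` boils down to the following (the comments are explanatory additions, not part of the patch):

```python
def expand(self, batch_shape, _instance=None):
    # a bare __new__-based instance of type(self), or the instance a subclass
    # passed in; raises for subclasses with a custom __init__ that did not
    # override .expand themselves
    new = self._get_checked_instance(Normal, _instance)
    batch_shape = torch.Size(batch_shape)
    # expanded views share storage with the original parameters
    new.loc = self.loc.expand(batch_shape)
    new.scale = self.scale.expand(batch_shape)
    # Distribution.__init__ with validate_args=False skips re-validation
    super(Normal, new).__init__(batch_shape, validate_args=False)
    new._validate_args = self._validate_args
    return new
```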
Additionally, this bypasses `__init__.py` (using `__new__` and populating relevant attributes) since we do not need to do any broadcasting or args validation (which was already done when the instance was first created). This can result in significant savings as compared to constructing new instances via `__init__` (that said, the `sample` and `log_prob` methods will probably be the rate determining steps in many applications). e.g. ```python >>> a = dist.Bernoulli(torch.ones([10000, 1]), validate_args=True) >>> %timeit a.expand([10000, 100]) 15.2 µs ± 224 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) >>> %timeit dist.Bernoulli(torch.ones([10000, 100]), validate_args=True) 11.8 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` cc. fritzo, apaszke, vishwakftw, alicanb Pull Request resolved: https://github.com/pytorch/pytorch/pull/11341 Differential Revision: D9728485 Pulled By: soumith fbshipit-source-id: 3b94c23bc6a43ee704389e6287aa83d1e278d52f --- test/test_distributions.py | 60 +++++++++++++++++++ torch/distributions/bernoulli.py | 13 ++++ torch/distributions/beta.py | 8 +++ torch/distributions/binomial.py | 14 +++++ torch/distributions/categorical.py | 15 +++++ torch/distributions/cauchy.py | 9 +++ torch/distributions/chi2.py | 6 ++ torch/distributions/dirichlet.py | 8 +++ torch/distributions/distribution.py | 28 +++++++++ torch/distributions/exponential.py | 8 +++ torch/distributions/fishersnedecor.py | 11 ++++ torch/distributions/gamma.py | 10 +++- torch/distributions/geometric.py | 11 ++++ torch/distributions/gumbel.py | 11 +++- torch/distributions/half_cauchy.py | 12 +++- torch/distributions/half_normal.py | 12 +++- torch/distributions/independent.py | 10 ++++ torch/distributions/laplace.py | 9 +++ torch/distributions/log_normal.py | 12 +++- torch/distributions/logistic_normal.py | 18 +++++- .../lowrank_multivariate_normal.py | 14 +++++ torch/distributions/multinomial.py | 9 +++ torch/distributions/multivariate_normal.py | 19 ++++++ torch/distributions/negative_binomial.py | 14 +++++ torch/distributions/normal.py | 9 +++ torch/distributions/one_hot_categorical.py | 8 +++ torch/distributions/pareto.py | 11 ++-- torch/distributions/poisson.py | 8 +++ torch/distributions/relaxed_bernoulli.py | 27 ++++++++- torch/distributions/relaxed_categorical.py | 24 +++++++- torch/distributions/studentT.py | 11 ++++ torch/distributions/uniform.py | 10 +++- torch/distributions/weibull.py | 18 +++++- 33 files changed, 446 insertions(+), 21 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 3aa414a77515e7..60bcec1605b6e5 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -793,6 +793,47 @@ def test_has_examples(self): self.assertIn(Dist, distributions_with_examples, "Please add {} to the EXAMPLES list in test_distributions.py".format(Dist.__name__)) + def test_distribution_expand(self): + shapes = [torch.Size(), torch.Size((2,)), torch.Size((2, 1))] + for Dist, params in EXAMPLES: + if Dist.__name__ == "TransformedDistribution": + continue + for param in params: + for shape in shapes: + d = Dist(**param) + expanded_shape = shape + d.batch_shape + original_shape = d.batch_shape + d.event_shape + expected_shape = shape + original_shape + expanded = d.expand(batch_shape=expanded_shape) + sample = expanded.sample() + actual_shape = expanded.sample().shape + self.assertEqual(expanded.__class__, d.__class__) + self.assertEqual(d.sample().shape, original_shape) + self.assertEqual(expanded.log_prob(sample), 
d.log_prob(sample)) + self.assertEqual(actual_shape, expected_shape) + + def test_distribution_subclass_expand(self): + expand_by = torch.Size((2,)) + for Dist, params in EXAMPLES: + if Dist.__name__ == "TransformedDistribution": + continue + + class SubClass(Dist): + pass + + for param in params: + d = SubClass(**param) + expanded_shape = expand_by + d.batch_shape + original_shape = d.batch_shape + d.event_shape + expected_shape = expand_by + original_shape + expanded = d.expand(batch_shape=expanded_shape) + sample = expanded.sample() + actual_shape = expanded.sample().shape + self.assertEqual(expanded.__class__, d.__class__) + self.assertEqual(d.sample().shape, original_shape) + self.assertEqual(expanded.log_prob(sample), d.log_prob(sample)) + self.assertEqual(actual_shape, expected_shape) + def test_bernoulli(self): p = torch.tensor([0.7, 0.2, 0.4], requires_grad=True) r = torch.tensor(0.3, requires_grad=True) @@ -2178,6 +2219,25 @@ def test_independent_shape(self): except NotImplementedError: pass + def test_independent_expand(self): + for Dist, params in EXAMPLES: + if Dist.__name__ == "TransformedDistribution": + continue + for param in params: + base_dist = Dist(**param) + for reinterpreted_batch_ndims in range(len(base_dist.batch_shape) + 1): + for s in [torch.Size(), torch.Size((2,)), torch.Size((2, 3))]: + indep_dist = Independent(base_dist, reinterpreted_batch_ndims) + expanded_shape = s + indep_dist.batch_shape + expanded = indep_dist.expand(expanded_shape) + expanded_sample = expanded.sample() + expected_shape = expanded_shape + indep_dist.event_shape + self.assertEqual(expanded_sample.shape, expected_shape) + self.assertEqual(expanded.log_prob(expanded_sample), + indep_dist.log_prob(expanded_sample)) + self.assertEqual(expanded.event_shape, indep_dist.event_shape) + self.assertEqual(expanded.batch_shape, expanded_shape) + def test_cdf_icdf_inverse(self): # Tests the invertibility property on the distributions for Dist, params in EXAMPLES: diff --git a/torch/distributions/bernoulli.py b/torch/distributions/bernoulli.py index 9db9691c4ddb9e..067a6eba9a43c4 100644 --- a/torch/distributions/bernoulli.py +++ b/torch/distributions/bernoulli.py @@ -46,6 +46,19 @@ def __init__(self, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(Bernoulli, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Bernoulli, _instance) + batch_shape = torch.Size(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(Bernoulli, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/beta.py b/torch/distributions/beta.py index f23415d5bf0cbd..35a94e8545190b 100644 --- a/torch/distributions/beta.py +++ b/torch/distributions/beta.py @@ -36,6 +36,14 @@ def __init__(self, concentration1, concentration0, validate_args=None): self._dirichlet = Dirichlet(concentration1_concentration0) super(Beta, self).__init__(self._dirichlet._batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Beta, _instance) + batch_shape = torch.Size(batch_shape) + new._dirichlet = self._dirichlet.expand(batch_shape) + super(Beta, 
new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.concentration1 / (self.concentration1 + self.concentration0) diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index acbb636ce5ee16..ad56856c90aef1 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -51,6 +51,20 @@ def __init__(self, total_count=1, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(Binomial, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Binomial, _instance) + batch_shape = torch.Size(batch_shape) + new.total_count = self.total_count.expand(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(Binomial, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 6dc046dfab42d4..267bed8826e7de 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -53,6 +53,21 @@ def __init__(self, probs=None, logits=None, validate_args=None): batch_shape = self._param.size()[:-1] if self._param.ndimension() > 1 else torch.Size() super(Categorical, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Categorical, _instance) + batch_shape = torch.Size(batch_shape) + param_shape = batch_shape + torch.Size((self._num_events,)) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(param_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(param_shape) + new._param = new.logits + new._num_events = self._num_events + super(Categorical, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index dec9cfafe134fb..da31c78abbb413 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -36,6 +36,15 @@ def __init__(self, loc, scale, validate_args=None): batch_shape = self.loc.size() super(Cauchy, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Cauchy, _instance) + batch_shape = torch.Size(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + super(Cauchy, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.loc.new_tensor(nan).expand(self._extended_shape()) diff --git a/torch/distributions/chi2.py b/torch/distributions/chi2.py index fcb0c5b7837727..fe8c5956fb5b68 100644 --- a/torch/distributions/chi2.py +++ b/torch/distributions/chi2.py @@ -1,3 +1,4 @@ +import torch from torch.distributions import constraints from torch.distributions.gamma import Gamma @@ -21,6 +22,11 @@ class Chi2(Gamma): def __init__(self, df, validate_args=None): super(Chi2, self).__init__(0.5 * df, 0.5, validate_args=validate_args) + def expand(self, 
batch_shape, _instance=None): + new = self._get_checked_instance(Chi2, _instance) + batch_shape = torch.Size(batch_shape) + return super(Chi2, self).expand(batch_shape, new) + @property def df(self): return self.concentration * 2 diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py index ca014e0d9da40d..94d66d60c0848c 100644 --- a/torch/distributions/dirichlet.py +++ b/torch/distributions/dirichlet.py @@ -58,6 +58,14 @@ def __init__(self, concentration, validate_args=None): batch_shape, event_shape = concentration.shape[:-1], concentration.shape[-1:] super(Dirichlet, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Dirichlet, _instance) + batch_shape = torch.Size(batch_shape) + new.concentration = self.concentration.expand(batch_shape + self.event_shape) + super(Dirichlet, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=()): shape = self._extended_shape(sample_shape) concentration = self.concentration.expand(shape) diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index 3d1aeb9cadc9c9..8d357a4e9371a0 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -35,6 +35,27 @@ def __init__(self, batch_shape=torch.Size(), event_shape=torch.Size(), validate_ if not constraint.check(getattr(self, param)).all(): raise ValueError("The parameter {} has invalid values".format(param)) + def expand(self, batch_shape, _instance=None): + """ + Returns a new distribution instance (or populates an existing instance + provided by a derived class) with batch dimensions expanded to + `batch_shape`. This method calls :class:`~torch.Tensor.expand` on + the distribution's parameters. As such, this does not allocate new + memory for the expanded distribution instance. Additionally, + this does not repeat any args checking or parameter broadcasting in + `__init__.py`, when an instance is first created. + + Args: + batch_shape (torch.Size): the desired expanded size. + _instance: new instance provided by subclasses that + need to override `.expand`. + + Returns: + New distribution instance with batch dimensions expanded to + `batch_size`. + """ + raise NotImplementedError + @property def batch_shape(self): """ @@ -228,6 +249,13 @@ def _validate_sample(self, value): if not self.support.check(value).all(): raise ValueError('The value argument must be within the support') + def _get_checked_instance(self, cls, _instance=None): + if _instance is None and type(self).__init__ != cls.__init__: + raise NotImplementedError("Subclass {} of {} that defines a custom __init__ method " + "must also define a custom .expand() method.". 
+ format(self.__class__.__name__, cls.__name__)) + return self.__new__(type(self)) if _instance is None else _instance + def __repr__(self): param_names = [k for k, _ in self.arg_constraints.items() if k in self.__dict__] args_string = ', '.join(['{}: {}'.format(p, self.__dict__[p] diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index 85decc0e239fe5..a2fde7b382593c 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -41,6 +41,14 @@ def __init__(self, rate, validate_args=None): batch_shape = torch.Size() if isinstance(rate, Number) else self.rate.size() super(Exponential, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Exponential, _instance) + batch_shape = torch.Size(batch_shape) + new.rate = self.rate.expand(batch_shape) + super(Exponential, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) return self.rate.new(shape).exponential_() / self.rate diff --git a/torch/distributions/fishersnedecor.py b/torch/distributions/fishersnedecor.py index 202621218eea7f..6fe09a7cff14db 100644 --- a/torch/distributions/fishersnedecor.py +++ b/torch/distributions/fishersnedecor.py @@ -37,6 +37,17 @@ def __init__(self, df1, df2, validate_args=None): batch_shape = self.df1.size() super(FisherSnedecor, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(FisherSnedecor, _instance) + batch_shape = torch.Size(batch_shape) + new.df1 = self.df1.expand(batch_shape) + new.df2 = self.df2.expand(batch_shape) + new._gamma1 = self._gamma1.expand(batch_shape) + new._gamma2 = self._gamma2.expand(batch_shape) + super(FisherSnedecor, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): df2 = self.df2.clone() diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py index 3f96fc031b2345..655ddac5f0c643 100644 --- a/torch/distributions/gamma.py +++ b/torch/distributions/gamma.py @@ -1,7 +1,6 @@ from numbers import Number import torch -from torch.autograd import Function from torch.distributions import constraints from torch.distributions.exp_family import ExponentialFamily from torch.distributions.utils import _finfo, broadcast_all, lazy_property @@ -48,6 +47,15 @@ def __init__(self, concentration, rate, validate_args=None): batch_shape = self.concentration.size() super(Gamma, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Gamma, _instance) + batch_shape = torch.Size(batch_shape) + new.concentration = self.concentration.expand(batch_shape) + new.rate = self.rate.expand(batch_shape) + super(Gamma, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) value = _standard_gamma(self.concentration.expand(shape)) / self.rate.expand(shape) diff --git a/torch/distributions/geometric.py b/torch/distributions/geometric.py index 196a28b29e7d75..a84963664b4cf3 100644 --- a/torch/distributions/geometric.py +++ b/torch/distributions/geometric.py @@ -45,6 +45,17 @@ def __init__(self, probs=None, logits=None, 
validate_args=None): batch_shape = probs_or_logits.size() super(Geometric, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Geometric, _instance) + batch_shape = torch.Size(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + else: + new.logits = self.logits.expand(batch_shape) + super(Geometric, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return 1. / self.probs - 1. diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index e4e96aa9eb4373..f6d10aa4b0efb9 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -31,15 +31,22 @@ def __init__(self, loc, scale, validate_args=None): self.loc, self.scale = broadcast_all(loc, scale) finfo = _finfo(self.loc) if isinstance(loc, Number) and isinstance(scale, Number): - batch_shape = torch.Size() base_dist = Uniform(finfo.tiny, 1 - finfo.eps) else: - batch_shape = self.scale.size() base_dist = Uniform(self.loc.new(self.loc.size()).fill_(finfo.tiny), 1 - finfo.eps) transforms = [ExpTransform().inv, AffineTransform(loc=0, scale=-torch.ones_like(self.scale)), ExpTransform().inv, AffineTransform(loc=loc, scale=-self.scale)] super(Gumbel, self).__init__(base_dist, transforms, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Gumbel, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + transforms = self.transforms + super(Gumbel, new).__init__(base_dist, transforms, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.loc + self.scale * euler_constant diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py index 77a50d3f03c49f..8b81ea8b8c862d 100644 --- a/torch/distributions/half_cauchy.py +++ b/torch/distributions/half_cauchy.py @@ -1,5 +1,6 @@ import math +import torch from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform @@ -28,9 +29,18 @@ class HalfCauchy(TransformedDistribution): has_rsample = True def __init__(self, scale, validate_args=None): - super(HalfCauchy, self).__init__(Cauchy(0, scale), AbsTransform(), + base_dist = Cauchy(0, scale) + super(HalfCauchy, self).__init__(base_dist, AbsTransform(), validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(HalfCauchy, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(HalfCauchy, new).__init__(base_dist, AbsTransform(), validate_args=False) + new._validate_args = self._validate_args + return new + @property def scale(self): return self.base_dist.scale diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py index 059f3837604a63..d18cb3b971cd2c 100644 --- a/torch/distributions/half_normal.py +++ b/torch/distributions/half_normal.py @@ -1,5 +1,6 @@ import math +import torch from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform @@ -28,9 +29,18 @@ class HalfNormal(TransformedDistribution): has_rsample = True def __init__(self, scale, validate_args=None): - super(HalfNormal, self).__init__(Normal(0, scale), AbsTransform(), + base_dist = Normal(0, scale) + 
super(HalfNormal, self).__init__(base_dist, AbsTransform(), validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(HalfNormal, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(HalfNormal, new).__init__(base_dist, AbsTransform(), validate_args=False) + new._validate_args = self._validate_args + return new + @property def scale(self): return self.base_dist.scale diff --git a/torch/distributions/independent.py b/torch/distributions/independent.py index 938e478472a9b7..ed8edb43f11644 100644 --- a/torch/distributions/independent.py +++ b/torch/distributions/independent.py @@ -46,6 +46,16 @@ def __init__(self, base_distribution, reinterpreted_batch_ndims, validate_args=N self.reinterpreted_batch_ndims = reinterpreted_batch_ndims super(Independent, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Independent, _instance) + batch_shape = torch.Size(batch_shape) + new.base_dist = self.base_dist.expand(batch_shape + + self.event_shape[:self.reinterpreted_batch_ndims]) + new.reinterpreted_batch_ndims = self.reinterpreted_batch_ndims + super(Independent, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @property def has_rsample(self): return self.base_dist.has_rsample diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index d3c09faec43051..cc5b079fbdbdcb 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -43,6 +43,15 @@ def __init__(self, loc, scale, validate_args=None): batch_shape = self.loc.size() super(Laplace, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Laplace, _instance) + batch_shape = torch.Size(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + super(Laplace, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) u = self.loc.new(shape).uniform_(_finfo(self.loc).eps - 1, 1) diff --git a/torch/distributions/log_normal.py b/torch/distributions/log_normal.py index 9487dc9b5a9679..274bcd83c47a83 100644 --- a/torch/distributions/log_normal.py +++ b/torch/distributions/log_normal.py @@ -1,3 +1,4 @@ +import torch from torch.distributions import constraints from torch.distributions.transforms import ExpTransform from torch.distributions.normal import Normal @@ -27,7 +28,16 @@ class LogNormal(TransformedDistribution): has_rsample = True def __init__(self, loc, scale, validate_args=None): - super(LogNormal, self).__init__(Normal(loc, scale), ExpTransform(), validate_args=validate_args) + base_dist = Normal(loc, scale) + super(LogNormal, self).__init__(base_dist, ExpTransform(), validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LogNormal, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(LogNormal, new).__init__(base_dist, ExpTransform(), validate_args=False) + new._validate_args = self._validate_args + return new @property def loc(self): diff --git a/torch/distributions/logistic_normal.py b/torch/distributions/logistic_normal.py index 
39b2f3e488f183..909a30d3db8a7c 100644 --- a/torch/distributions/logistic_normal.py +++ b/torch/distributions/logistic_normal.py @@ -32,12 +32,24 @@ class LogisticNormal(TransformedDistribution): has_rsample = True def __init__(self, loc, scale, validate_args=None): - super(LogisticNormal, self).__init__( - Normal(loc, scale), StickBreakingTransform(), - validate_args=validate_args) + base_dist = Normal(loc, scale) + super(LogisticNormal, self).__init__(base_dist, + StickBreakingTransform(), + validate_args=validate_args) # Adjust event shape since StickBreakingTransform adds 1 dimension self._event_shape = torch.Size([s + 1 for s in self._event_shape]) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LogisticNormal, _instance) + batch_shape = torch.Size(batch_shape) + base_dist = self.base_dist.expand(batch_shape + self.base_dist.batch_shape[-1:]) + super(LogisticNormal, new).__init__(base_dist, + StickBreakingTransform(), + validate_args=False) + new._event_shape = self._event_shape + new._validate_args = self._validate_args + return new + @property def loc(self): return self.base_dist.loc diff --git a/torch/distributions/lowrank_multivariate_normal.py b/torch/distributions/lowrank_multivariate_normal.py index 4e3e25ae947fad..5e6239a69d0d37 100644 --- a/torch/distributions/lowrank_multivariate_normal.py +++ b/torch/distributions/lowrank_multivariate_normal.py @@ -116,6 +116,20 @@ def __init__(self, loc, cov_factor, cov_diag, validate_args=None): super(LowRankMultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LowRankMultivariateNormal, _instance) + batch_shape = torch.Size(batch_shape) + loc_shape = batch_shape + self.event_shape + new.loc = self.loc.expand(loc_shape) + new.cov_diag = self.cov_diag.expand(loc_shape) + new.cov_factor = self.cov_factor.expand(loc_shape + self.cov_factor.shape[-1:]) + new._capacitance_tril = self._capacitance_tril.expand(batch_shape + self._capacitance_tril.shape[-2:]) + super(LowRankMultivariateNormal, new).__init__(batch_shape, + self.event_shape, + validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): return self.loc diff --git a/torch/distributions/multinomial.py b/torch/distributions/multinomial.py index dd1e20d373469b..d02266c0d8db50 100644 --- a/torch/distributions/multinomial.py +++ b/torch/distributions/multinomial.py @@ -58,6 +58,15 @@ def __init__(self, total_count=1, probs=None, logits=None, validate_args=None): event_shape = self._categorical.param_shape[-1:] super(Multinomial, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Multinomial, _instance) + batch_shape = torch.Size(batch_shape) + new.total_count = self.total_count + new._categorical = self._categorical.expand(batch_shape) + super(Multinomial, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 51de7816cafaff..855c727dc9b951 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -146,6 +146,25 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, 
scale_tri batch_shape, event_shape = self.loc.shape[:-1], self.loc.shape[-1:] super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(MultivariateNormal, _instance) + batch_shape = torch.Size(batch_shape) + loc_shape = batch_shape + self.event_shape + cov_shape = batch_shape + self.event_shape + self.event_shape + new.loc = self.loc.expand(loc_shape) + new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril.expand(cov_shape) + if 'covariance_matrix' in self.__dict__: + new.covariance_matrix = self.covariance_matrix.expand(cov_shape) + if 'scale_tril' in self.__dict__: + new.scale_tril = self.scale_tril.expand(cov_shape) + if 'precision_matrix' in self.__dict__: + new.precision_matrix = self.precision_matrix.expand(cov_shape) + super(MultivariateNormal, new).__init__(batch_shape, + self.event_shape, + validate_args=False) + new._validate_args = self._validate_args + return new + @lazy_property def scale_tril(self): return self._unbroadcasted_scale_tril.expand( diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py index de0b748f6a6fdf..01a009b9869d0a 100644 --- a/torch/distributions/negative_binomial.py +++ b/torch/distributions/negative_binomial.py @@ -38,6 +38,20 @@ def __init__(self, total_count, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(NegativeBinomial, _instance) + batch_shape = torch.Size(batch_shape) + new.total_count = self.total_count.expand(batch_shape) + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(NegativeBinomial, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 0f1375e6640f53..98a5318d7b9829 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -48,6 +48,15 @@ def __init__(self, loc, scale, validate_args=None): batch_shape = self.loc.size() super(Normal, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Normal, _instance) + batch_shape = torch.Size(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + super(Normal, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) with torch.no_grad(): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index fbfec01ab7159a..c28949b079a115 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -38,6 +38,14 @@ def __init__(self, probs=None, logits=None, validate_args=None): event_shape = self._categorical.param_shape[-1:] super(OneHotCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(OneHotCategorical, 
_instance) + batch_shape = torch.Size(batch_shape) + new._categorical = self._categorical.expand(batch_shape) + super(OneHotCategorical, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) diff --git a/torch/distributions/pareto.py b/torch/distributions/pareto.py index 82fff3c34b02cb..6fa499642065de 100644 --- a/torch/distributions/pareto.py +++ b/torch/distributions/pareto.py @@ -1,7 +1,3 @@ -from numbers import Number - -import math - import torch from torch.distributions import constraints from torch.distributions.exponential import Exponential @@ -32,6 +28,13 @@ def __init__(self, scale, alpha, validate_args=None): transforms = [ExpTransform(), AffineTransform(loc=0, scale=self.scale)] super(Pareto, self).__init__(base_dist, transforms, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Pareto, _instance) + base_dist = self.base_dist.expand(batch_shape) + super(Pareto, new).__init__(base_dist, self.transforms, validate_args=False) + new._validate_args = self._validate_args + return new + @property def mean(self): # mean is inf for alpha <= 1 diff --git a/torch/distributions/poisson.py b/torch/distributions/poisson.py index 6be54070a55f2c..954ed6e0d32064 100644 --- a/torch/distributions/poisson.py +++ b/torch/distributions/poisson.py @@ -43,6 +43,14 @@ def __init__(self, rate, validate_args=None): batch_shape = self.rate.size() super(Poisson, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Poisson, _instance) + batch_shape = torch.Size(batch_shape) + new.rate = self.rate.expand(batch_shape) + super(Poisson, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) with torch.no_grad(): diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py index 6b6c540ec48c00..643fddc2c96b68 100644 --- a/torch/distributions/relaxed_bernoulli.py +++ b/torch/distributions/relaxed_bernoulli.py @@ -46,6 +46,20 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): batch_shape = self._param.size() super(LogitRelaxedBernoulli, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LogitRelaxedBernoulli, _instance) + batch_shape = torch.Size(batch_shape) + new.temperature = self.temperature + if 'probs' in self.__dict__: + new.probs = self.probs.expand(batch_shape) + new._param = new.probs + else: + new.logits = self.logits.expand(batch_shape) + new._param = new.logits + super(LogitRelaxedBernoulli, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._param.new(*args, **kwargs) @@ -99,8 +113,17 @@ class RelaxedBernoulli(TransformedDistribution): has_rsample = True def __init__(self, temperature, probs=None, logits=None, validate_args=None): - super(RelaxedBernoulli, self).__init__(LogitRelaxedBernoulli(temperature, probs, logits), - SigmoidTransform(), validate_args=validate_args) + base_dist = LogitRelaxedBernoulli(temperature, probs, logits) + super(RelaxedBernoulli, self).__init__(base_dist, + SigmoidTransform(), + 
validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(RelaxedBernoulli, _instance) + base_dist = self.base_dist.expand(batch_shape) + super(RelaxedBernoulli, new).__init__(base_dist, SigmoidTransform(), validate_args=False) + new._validate_args = self._validate_args + return new @property def temperature(self): diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py index 0c35defaf35bd7..a43d631bec9b2b 100644 --- a/torch/distributions/relaxed_categorical.py +++ b/torch/distributions/relaxed_categorical.py @@ -41,6 +41,15 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): event_shape = self._categorical.param_shape[-1:] super(ExpRelaxedCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(ExpRelaxedCategorical, _instance) + batch_shape = torch.Size(batch_shape) + new.temperature = self.temperature + new._categorical = self._categorical.expand(batch_shape) + super(ExpRelaxedCategorical, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) @@ -100,8 +109,19 @@ class RelaxedOneHotCategorical(TransformedDistribution): has_rsample = True def __init__(self, temperature, probs=None, logits=None, validate_args=None): - super(RelaxedOneHotCategorical, self).__init__(ExpRelaxedCategorical(temperature, probs, logits), - ExpTransform(), validate_args=validate_args) + base_dist = ExpRelaxedCategorical(temperature, probs, logits) + super(RelaxedOneHotCategorical, self).__init__(base_dist, + ExpTransform(), + validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(RelaxedOneHotCategorical, _instance) + base_dist = self.base_dist.expand(batch_shape) + super(RelaxedOneHotCategorical, new).__init__(base_dist, + ExpTransform(), + validate_args=False) + new._validate_args = self._validate_args + return new @property def temperature(self): diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index c7738afa9cd63b..cefa4684298261 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -45,6 +45,17 @@ def __init__(self, df, loc=0., scale=1., validate_args=None): batch_shape = torch.Size() if isinstance(df, Number) else self.df.size() super(StudentT, self).__init__(batch_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(StudentT, _instance) + batch_shape = torch.Size(batch_shape) + new.df = self.df.expand(batch_shape) + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + new._chi2 = self._chi2.expand(batch_shape) + super(StudentT, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + def rsample(self, sample_shape=torch.Size()): # NOTE: This does not agree with scipy implementation as much as other distributions. # (see https://github.com/fritzo/notebooks/blob/master/debug-student-t.ipynb). 
Using DoubleTensor diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py index fa39f800e97df7..ce9cf496656346 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -1,4 +1,3 @@ -import math from numbers import Number import torch @@ -50,6 +49,15 @@ def __init__(self, low, high, validate_args=None): if self._validate_args and not torch.lt(self.low, self.high).all(): raise ValueError("Uniform is not defined when low>= high") + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Uniform, _instance) + batch_shape = torch.Size(batch_shape) + new.low = self.low.expand(batch_shape) + new.high = self.high.expand(batch_shape) + super(Uniform, new).__init__(batch_shape, validate_args=False) + new._validate_args = self._validate_args + return new + @constraints.dependent_property def support(self): return constraints.interval(self.low, self.high) diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py index 4e196174f84e98..0636a74d113f82 100644 --- a/torch/distributions/weibull.py +++ b/torch/distributions/weibull.py @@ -1,5 +1,3 @@ -from numbers import Number -import math import torch from torch.distributions import constraints from torch.distributions.exponential import Exponential @@ -32,7 +30,21 @@ def __init__(self, scale, concentration, validate_args=None): base_dist = Exponential(self.scale.new(self.scale.size()).fill_(1.0)) transforms = [PowerTransform(exponent=self.concentration_reciprocal), AffineTransform(loc=0, scale=self.scale)] - super(Weibull, self).__init__(base_dist, transforms, validate_args=validate_args) + super(Weibull, self).__init__(base_dist, + transforms, + validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Weibull, _instance) + batch_shape = torch.Size(batch_shape) + new.scale = self.scale.expand(batch_shape) + new.concentration = self.concentration.expand(batch_shape) + base_dist = self.base_dist.expand(batch_shape) + super(Weibull, new).__init__(base_dist, + self.transforms, + validate_args=False) + new._validate_args = self._validate_args + return new @property def mean(self): From 86ab92b0a9b5c2b76bf41aad00f9fb9e21d62eda Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Tue, 11 Sep 2018 07:23:23 -0700 Subject: [PATCH 030/237] Move TensorImpl / UndefinedTensor(Impl) to core (#11441) Summary: Moves TensorImpl to core. Renames UndefinedTensor to UndefinedTensorImpl and moves to core. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11441 Differential Revision: D9736620 Pulled By: gchanan fbshipit-source-id: 0322ae3b903e338de253b35a0d74a9d3e219204b --- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/TensorImpl.h | 204 +----------------- aten/src/ATen/Utils.h | 2 +- aten/src/ATen/WrapDimUtils.h | 2 +- aten/src/ATen/WrapDimUtilsMulti.h | 2 +- aten/src/ATen/copy_wrapper.py | 2 +- aten/src/ATen/{ => core}/TensorImpl.cpp | 2 +- aten/src/ATen/core/TensorImpl.h | 204 ++++++++++++++++++ .../UndefinedTensorImpl.cpp} | 20 +- .../UndefinedTensorImpl.h} | 10 +- aten/src/ATen/function_wrapper.py | 22 +- aten/src/ATen/gen.py | 2 +- aten/src/ATen/templates/SparseTypeDerived.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 12 +- aten/src/ATen/templates/TypeDefault.cpp | 4 +- aten/src/ATen/templates/TypeDerived.cpp | 4 +- aten/src/ATen/test/undefined_tensor_test.cpp | 4 +- aten/src/TH/THHalf.h | 2 +- aten/src/TH/THTensor.cpp | 2 +- aten/src/TH/generic/THTensor.cpp | 8 +- aten/src/TH/generic/THTensor.h | 2 +- aten/src/THC/generic/THCTensor.cpp | 6 +- .../src/THC/generic/THCTensorMathPointwise.cu | 4 +- .../complex_registration_extension.cpp | 4 +- torch/csrc/jit/ivalue.h | 6 +- 25 files changed, 269 insertions(+), 265 deletions(-) rename aten/src/ATen/{ => core}/TensorImpl.cpp (98%) create mode 100644 aten/src/ATen/core/TensorImpl.h rename aten/src/ATen/{UndefinedTensor.cpp => core/UndefinedTensorImpl.cpp} (55%) rename aten/src/ATen/{UndefinedTensor.h => core/UndefinedTensorImpl.h} (72%) diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 835b45c2a541ee..77ad805ac99e6f 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/Tensor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/Error.h" namespace at { diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index e0a649a49b6ccd..34ece0fc0d03fb 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -1,204 +1,2 @@ #pragma once - -#include -#include - -#include "ATen/core/Storage.h" -#include "ATen/core/optional.h" -#include "ATen/core/TensorTypeId.h" -#include "ATen/core/TensorTypeIdRegistration.h" -#include "ATen/core/LegacyTypeDispatch.h" -#include "ATen/core/Backend.h" - -struct THTensor; - -namespace at { -class Scalar; -struct Type; -struct Storage; -struct Tensor; -} // namespace at - -namespace at { -struct AT_API TensorImpl : public c10::intrusive_ptr_target { - TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); - - virtual void release_resources() override; - - Type & type() const { - // NB: It's valid to use getTypeRaw here, because the TensorImpl - // could not have been created without initializing the Type first. - // TODO: This is not actually true via the Caffe2 codepath! Make - // it so. 
- return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); - } - - TensorTypeId type_id() const { return type_id_; } - virtual IntList sizes() const; - virtual IntList strides() const; - virtual int64_t dim() const; - virtual const Storage& storage() const; - friend struct Type; - - virtual int64_t numel() const { -#ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); -#endif - return numel_; - } - - virtual bool is_contiguous() const { -#ifdef DEBUG - AT_ASSERT(compute_contiguous() == is_contiguous_); -#endif - return is_contiguous_; - } - - // this is called by the generated wrapper code when there are conditions - // when this output tensor should be zero dimensional. e.g. when all inputs - // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. - // we also prevent this from getting marked as a zero dim tensor if it is not - // the right shape afterall. - virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); - - // True if a tensor was auto-wrapped from a C++ or Python number. - // Wrapped numbers do not participate in the result type computation for - // mixed-type operations if there are any Tensors that are not wrapped - // numbers. Otherwise, they behave like their non-wrapped equivalents. - // See [Result type computation] in TensorIterator.h. - bool is_wrapped_number() const { - return is_wrapped_number_; - } - void set_wrapped_number(bool value) { - AT_ASSERT(dim() == 0); - is_wrapped_number_ = value; - } - - // ~~~~~ Autograd API ~~~~~ - // Some methods below are defined in TensorImpl.cpp because Tensor is an - // incomplete type. - - virtual void set_requires_grad(bool requires_grad) { - AT_ERROR("set_requires_grad is not implemented for Tensor"); - } - virtual bool requires_grad() const { - AT_ERROR("requires_grad is not implemented for Tensor"); - } - - virtual Tensor& grad(); - virtual const Tensor& grad() const; - - // TODO: make these protected - // Note: storage->size() may be greater than the recorded size - // of a tensor - at::Storage storage_; - - template - inline T * data() const { - return storage_.data() + storage_offset_; - } - - template - inline T * unsafe_data() const { - return storage_.unsafe_data() + storage_offset_; - } - - inline at::ScalarType scalar_type() const { - return scalar_type_; - } - - virtual int64_t storage_offset() const { - return storage_offset_; - } - - // represents that numel() == 0. 
- inline bool is_empty() const { - return numel() == 0; - } - - virtual void resize_dim(int64_t ndim) { - // NB: This is *truly* a resize; calling code (e.g., squeeze) - // assumes that old values are preserved - sizes_.resize(ndim); - strides_.resize(ndim); - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_storage_offset(int64_t storage_offset) { - storage_offset_ = storage_offset; - refresh_numel(); - refresh_contiguous(); - } - - // WARNING: This function does not check if the requested - // sizes/strides are in bounds for the storage that is allocated; - // this is the responsibility of the caller - void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { - AT_CHECK( - new_size.size() == new_stride.size(), - "dimensionality of sizes (", - new_size.size(), - ") must match dimensionality of strides (", - new_stride.size(), - ")"); - sizes_ = new_size.vec(); - strides_ = new_stride.vec(); - refresh_numel(); - refresh_contiguous(); - } - - virtual int64_t size(int64_t d) const; - virtual int64_t stride(int64_t d) const; - - bool is_variable() const { return is_variable_; }; - - private: - int64_t storage_offset_; - std::vector sizes_; - std::vector strides_; - - bool is_contiguous_; - int64_t numel_; - - int64_t compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - n *= s; - } - return n; - } - bool compute_contiguous() const; - - protected: - void refresh_numel() { - numel_ = compute_numel(); - } - void refresh_contiguous() { - is_contiguous_ = compute_contiguous(); - } - TensorTypeId type_id_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; - bool is_variable_ = false; - bool is_wrapped_number_ = false; - - private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); -}; -} // namespace at +#include diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index fff88e39aba053..cea2f83d3e9180 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -2,7 +2,7 @@ #include "ATen/core/ATenGeneral.h" #include "ATen/StorageImpl.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "ATen/Formatting.h" diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 8e9db589c5267a..467a5664f6e05b 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/core/WrapDimMinimal.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index f3d3a81a365c26..4d3df92fe0bc57 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -1,6 +1,6 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/WrapDimUtils.h" #include #include diff --git a/aten/src/ATen/copy_wrapper.py b/aten/src/ATen/copy_wrapper.py index e4651f8c846070..a746f5543901ae 100644 --- a/aten/src/ATen/copy_wrapper.py +++ b/aten/src/ATen/copy_wrapper.py @@ -238,7 +238,7 @@ def create(all_types, backend): top_env['copy_includes'].append( '#include "ATen/{}.h"'.format(the_type['Type'])) 
top_env['copy_includes'].append( - '#include "ATen/TensorImpl.h"') + '#include "ATen/core/TensorImpl.h"') # Code generation for the_type in all_types: diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp similarity index 98% rename from aten/src/ATen/TensorImpl.cpp rename to aten/src/ATen/core/TensorImpl.cpp index f4ecaf0b6253fa..d5e5781b038ca5 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h new file mode 100644 index 00000000000000..e0a649a49b6ccd --- /dev/null +++ b/aten/src/ATen/core/TensorImpl.h @@ -0,0 +1,204 @@ +#pragma once + +#include +#include + +#include "ATen/core/Storage.h" +#include "ATen/core/optional.h" +#include "ATen/core/TensorTypeId.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/LegacyTypeDispatch.h" +#include "ATen/core/Backend.h" + +struct THTensor; + +namespace at { +class Scalar; +struct Type; +struct Storage; +struct Tensor; +} // namespace at + +namespace at { +struct AT_API TensorImpl : public c10::intrusive_ptr_target { + TensorImpl() = delete; + TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + + virtual void release_resources() override; + + Type & type() const { + // NB: It's valid to use getTypeRaw here, because the TensorImpl + // could not have been created without initializing the Type first. + // TODO: This is not actually true via the Caffe2 codepath! Make + // it so. + return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); + } + + TensorTypeId type_id() const { return type_id_; } + virtual IntList sizes() const; + virtual IntList strides() const; + virtual int64_t dim() const; + virtual const Storage& storage() const; + friend struct Type; + + virtual int64_t numel() const { +#ifdef DEBUG + AT_ASSERT(compute_numel() == numel_); +#endif + return numel_; + } + + virtual bool is_contiguous() const { +#ifdef DEBUG + AT_ASSERT(compute_contiguous() == is_contiguous_); +#endif + return is_contiguous_; + } + + // this is called by the generated wrapper code when there are conditions + // when this output tensor should be zero dimensional. e.g. when all inputs + // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. + // we also prevent this from getting marked as a zero dim tensor if it is not + // the right shape afterall. + virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); + + // True if a tensor was auto-wrapped from a C++ or Python number. + // Wrapped numbers do not participate in the result type computation for + // mixed-type operations if there are any Tensors that are not wrapped + // numbers. Otherwise, they behave like their non-wrapped equivalents. + // See [Result type computation] in TensorIterator.h. + bool is_wrapped_number() const { + return is_wrapped_number_; + } + void set_wrapped_number(bool value) { + AT_ASSERT(dim() == 0); + is_wrapped_number_ = value; + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. 
+ + virtual void set_requires_grad(bool requires_grad) { + AT_ERROR("set_requires_grad is not implemented for Tensor"); + } + virtual bool requires_grad() const { + AT_ERROR("requires_grad is not implemented for Tensor"); + } + + virtual Tensor& grad(); + virtual const Tensor& grad() const; + + // TODO: make these protected + // Note: storage->size() may be greater than the recorded size + // of a tensor + at::Storage storage_; + + template + inline T * data() const { + return storage_.data() + storage_offset_; + } + + template + inline T * unsafe_data() const { + return storage_.unsafe_data() + storage_offset_; + } + + inline at::ScalarType scalar_type() const { + return scalar_type_; + } + + virtual int64_t storage_offset() const { + return storage_offset_; + } + + // represents that numel() == 0. + inline bool is_empty() const { + return numel() == 0; + } + + virtual void resize_dim(int64_t ndim) { + // NB: This is *truly* a resize; calling code (e.g., squeeze) + // assumes that old values are preserved + sizes_.resize(ndim); + strides_.resize(ndim); + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_size(int64_t dim, int64_t new_size) { + sizes_[dim] = new_size; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_stride(int64_t dim, int64_t new_stride) { + strides_[dim] = new_stride; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_storage_offset(int64_t storage_offset) { + storage_offset_ = storage_offset; + refresh_numel(); + refresh_contiguous(); + } + + // WARNING: This function does not check if the requested + // sizes/strides are in bounds for the storage that is allocated; + // this is the responsibility of the caller + void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { + AT_CHECK( + new_size.size() == new_stride.size(), + "dimensionality of sizes (", + new_size.size(), + ") must match dimensionality of strides (", + new_stride.size(), + ")"); + sizes_ = new_size.vec(); + strides_ = new_stride.vec(); + refresh_numel(); + refresh_contiguous(); + } + + virtual int64_t size(int64_t d) const; + virtual int64_t stride(int64_t d) const; + + bool is_variable() const { return is_variable_; }; + + private: + int64_t storage_offset_; + std::vector sizes_; + std::vector strides_; + + bool is_contiguous_; + int64_t numel_; + + int64_t compute_numel() const { + int64_t n = 1; + for (auto s : sizes()) { + n *= s; + } + return n; + } + bool compute_contiguous() const; + + protected: + void refresh_numel() { + numel_ = compute_numel(); + } + void refresh_contiguous() { + is_contiguous_ = compute_contiguous(); + } + TensorTypeId type_id_; + // INVARIANT: When storage is non-null, this scalar type must + // agree with the scalar type in storage + ScalarType scalar_type_; + bool is_variable_ = false; + bool is_wrapped_number_ = false; + + private: + TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); +}; +} // namespace at diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/core/UndefinedTensorImpl.cpp similarity index 55% rename from aten/src/ATen/UndefinedTensor.cpp rename to aten/src/ATen/core/UndefinedTensorImpl.cpp index 956c70b9f178af..d270e379a3415e 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/core/UndefinedTensorImpl.cpp @@ -1,40 +1,40 @@ -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/core/Error.h" namespace at { // should this use the globalContext? Can it get a context passed in somehow? 
-UndefinedTensor::UndefinedTensor() +UndefinedTensorImpl::UndefinedTensorImpl() : TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { } -IntList UndefinedTensor::sizes() const { +IntList UndefinedTensorImpl::sizes() const { AT_ERROR("sizes() called on undefined Tensor"); } -int64_t UndefinedTensor::size(int64_t d) const { +int64_t UndefinedTensorImpl::size(int64_t d) const { AT_ERROR("size(dim) called on an undefined Tensor"); } -int64_t UndefinedTensor::stride(int64_t d) const { +int64_t UndefinedTensorImpl::stride(int64_t d) const { AT_ERROR("stride(dim) called on an undefined Tensor"); } -int64_t UndefinedTensor::dim() const { +int64_t UndefinedTensorImpl::dim() const { AT_ERROR("dim() called on undefined Tensor"); } -const Storage& UndefinedTensor::storage() const { +const Storage& UndefinedTensorImpl::storage() const { AT_ERROR("storage() called on undefined Tensor"); } -int64_t UndefinedTensor::storage_offset() const { +int64_t UndefinedTensorImpl::storage_offset() const { AT_ERROR("storage_offset() called on an undefined Tensor"); } -IntList UndefinedTensor::strides() const { +IntList UndefinedTensorImpl::strides() const { AT_ERROR("strides() called on undefined Tensor"); } -UndefinedTensor UndefinedTensor::_singleton; +UndefinedTensorImpl UndefinedTensorImpl::_singleton; } diff --git a/aten/src/ATen/UndefinedTensor.h b/aten/src/ATen/core/UndefinedTensorImpl.h similarity index 72% rename from aten/src/ATen/UndefinedTensor.h rename to aten/src/ATen/core/UndefinedTensorImpl.h index 86faf028802c1e..6c734950d90cad 100644 --- a/aten/src/ATen/UndefinedTensor.h +++ b/aten/src/ATen/core/UndefinedTensorImpl.h @@ -1,13 +1,13 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { -struct AT_API UndefinedTensor final : public TensorImpl { +struct AT_API UndefinedTensorImpl final : public TensorImpl { public: // Without this, we get: - // error: identifier "at::UndefinedTensor::_singleton" is undefined in device code + // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code // (ostensibly because the constexpr tricks MSVC into trying to compile this // function for device as well). #ifdef _WIN32 @@ -25,8 +25,8 @@ struct AT_API UndefinedTensor final : public TensorImpl { const Storage& storage() const override; int64_t storage_offset() const override; private: - UndefinedTensor(); - static UndefinedTensor _singleton; + UndefinedTensorImpl(); + static UndefinedTensorImpl _singleton; public: friend struct UndefinedType; }; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 85b99469cdee15..b830aa3e5fe65e 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -187,7 +187,7 @@ def TypedDict(name, attrs, total=True): # type: ignore }""") BUFFER_DEFINITION = CodeTemplate("""\ -auto ${name}_ = c10::make_intrusive( +auto ${name}_ = c10::make_intrusive( ${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false).release(); auto ${name} = Tensor(${name}_, false);""") @@ -330,17 +330,17 @@ def __init__(self, reason): CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') ALLOC_NOARGS_WRAP = { - 'THTensor*': 'c10::make_intrusive' + 'THTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THBoolTensor*': 'c10::make_intrusive' + 'THBoolTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', - 'THIndexTensor*': 'c10::make_intrusive' + 'THIndexTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', - 'THIntegerTensor*': 'c10::make_intrusive' + 'THIntegerTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', - 'THDenseTensor*': 'c10::make_intrusive' + 'THDenseTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THDenseIndexTensor*': 'c10::make_intrusive' + 'THDenseIndexTensor*': 'c10::make_intrusive' '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' } @@ -1295,11 +1295,12 @@ def allocate_arg(env, arg, output_count): tensor_arg = '{}_'.format(name) if arg.get('mask', False): allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) - tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensor::singleton() : (TensorImpl*){}_' + tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' .format(name, name)) + intrusive_ptr_type = 'c10::intrusive_ptr' return [ 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor(c10::intrusive_ptr::reclaim({}));'.format(name, tensor_arg), + 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), ] def resize_arg(arg): @@ -1509,7 +1510,8 @@ def emit_body(env, option): env, arguments=[call]) return_tensor = ( "return Tensor(" + - "c10::intrusive_ptr::reclaim((${wrapped_tensor})${maybe_scalar}));") + "c10::intrusive_ptr::reclaim(" + + "(${wrapped_tensor})${maybe_scalar}));") body.append(CodeTemplate(return_tensor).substitute( env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) # return the same underlying Tensor type for both real and accreal; this ensures diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 3f962961f55812..025faffe5d2fe8 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -236,7 +236,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['DenseBackend'] = backend env['storage_tensor_headers'] = [] if density != 'Sparse': - env['storage_tensor_headers'] = ['#include "ATen/TensorImpl.h"'] + env['storage_tensor_headers'] = ['#include "ATen/core/TensorImpl.h"'] # used for generating switch logic for external functions tag = density_tag + backend + scalar_name diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 064da2bc2186e0..4bc8265a258f70 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -11,7 +11,7 @@ #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index e54c6e4b62fc40..b80e354b60b347 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -9,9 +9,9 @@ #include "ATen/core/SparseTensorRef.h" #include 
"ATen/core/Storage.h" #include "ATen/core/TensorAccessor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/optional.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/core/Error.h" namespace at { @@ -41,7 +41,7 @@ namespace at { // special care must be taken to handle this. struct AT_API Tensor { Tensor(){}; - Tensor(c10::intrusive_ptr tensor_impl) + Tensor(c10::intrusive_ptr tensor_impl) : tensor_impl_(std::move(tensor_impl)) { if (tensor_impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); @@ -61,7 +61,7 @@ struct AT_API Tensor { TensorImpl * unsafeReleaseTensorImpl() { return tensor_impl_.release(); } - const c10::intrusive_ptr& getIntrusivePtr() const { + const c10::intrusive_ptr& getIntrusivePtr() const { return tensor_impl_; } @@ -259,7 +259,7 @@ struct AT_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { @@ -287,6 +287,6 @@ struct AT_API WeakTensor { } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 4e9c454977d588..530aa09fddb734 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -110,8 +110,8 @@ Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) co return Storage(scalarType(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { - auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); - if (retain && tensor_impl.get() != UndefinedTensor::singleton()) { + auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); + if (retain && tensor_impl.get() != UndefinedTensorImpl::singleton()) { c10::raw::intrusive_ptr::incref(tensor_impl.get()); } return Tensor(std::move(tensor_impl)); diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 5b26400ea52f12..bc2e12acb14f29 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -10,11 +10,11 @@ $th_headers $storage_tensor_headers #include "ATen/${Generator}.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index 1b605e7271c6c7..e47772a7e8a80b 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -2,7 +2,7 @@ #include "catch.hpp" #include "ATen/ATen.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "test_seed.h" @@ -48,5 +48,5 @@ TEST_CASE( "undefined tensor test", "[]" ) { Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); REQUIRE(!to_move.defined()); - REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensor::singleton()); + REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); } diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index fb68639ec44752..68a884f22bd0ad 100644 --- 
a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -4,7 +4,7 @@ #include #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 0c731779b95685..5f861457c58ef4 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -8,7 +8,7 @@ #include -// NB: This is NOT valid on UndefinedTensor +// NB: This is NOT valid on UndefinedTensorImpl void THTensor_free(THTensor *self) { if (!self) return; diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 3f373ee2119c26..5eac1e319347d1 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -54,13 +54,13 @@ scalar_t *THTensor_(data)(const THTensor *self) { /* Empty init */ THTensor *THTensor_(new)(void) { - return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); THTensor_(setStorageNd)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), @@ -75,7 +75,7 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -547,7 +547,7 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) } } -// NB: It is INVALID to call this on an UndefinedTensor +// NB: It is INVALID to call this on an UndefinedTensorImpl void THTensor_(retain)(THTensor *self) { c10::raw::intrusive_ptr::incref(self); diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h index d5316919d4e31e..27807ea57a7dd0 100644 --- a/aten/src/TH/generic/THTensor.h +++ b/aten/src/TH/generic/THTensor.h @@ -5,7 +5,7 @@ /* a la lua? dim, storageoffset, ... et les methodes ? 
*/ #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index db2b44511c2329..483a1fbec593e7 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -63,13 +63,13 @@ scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self) /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); THCTensor_(setStorageNd)(state, self, THTensor_getStoragePtr(tensor), @@ -85,7 +85,7 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 2d69e60a40fed3..f5e1443b0eb5e7 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -229,9 +229,9 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, #endif namespace { -c10::intrusive_ptr retainTensorImpl(THCTensor* self) { +c10::intrusive_ptr retainTensorImpl(THCTensor* self) { c10::raw::intrusive_ptr::incref(self); - return c10::intrusive_ptr::reclaim(self); + return c10::intrusive_ptr::reclaim(self); } } diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index 4f7cd29cc5a180..acfaccced2be09 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -10,7 +10,7 @@ #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" #include "ATen/TensorImpl.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" @@ -53,7 +53,7 @@ struct CPUComplexFloatType : public at::CPUTypeDefault { numel, getCPUAllocator(), /* resizable */ true)}; - Tensor t{c10::make_intrusive( + Tensor t{c10::make_intrusive( std::move(s), at::CPUTensorId(), /* is_variable */ false)}; diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 3602e7deb8eb89..d32a24b54917a8 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -118,14 +118,14 @@ struct TORCH_API IValue final { bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); + at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); clearToNone(); return t; } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != 
at::UndefinedTensor::singleton())); - auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); + JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != at::UndefinedTensorImpl::singleton())); + auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); if (is_intrusive_ptr) { c10::raw::intrusive_ptr::incref(tensor_impl.get()); } From de460c7ad312a1f30d6329406b9b2fb53c530a0a Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Tue, 11 Sep 2018 08:39:20 -0700 Subject: [PATCH 031/237] Improvements on conv/pool/fold/stft/ParamDict docs (#11106) Summary: Also fixes some incorrect formula rendering. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11106 Differential Revision: D9752433 Pulled By: SsnL fbshipit-source-id: 535fc8498638e8b645757fc7535d8771992b7d21 --- aten/src/ATen/native/native_functions.yaml | 2 +- docs/source/conf.py | 2 +- torch/_tensor_docs.py | 9 +- torch/_torch_docs.py | 191 ++++++++++++--------- torch/functional.py | 18 +- torch/nn/functional.py | 95 +++++----- torch/nn/modules/container.py | 8 +- torch/nn/modules/conv.py | 75 ++++---- torch/nn/modules/fold.py | 28 ++- torch/nn/modules/instancenorm.py | 27 +++ torch/nn/modules/linear.py | 20 ++- torch/nn/modules/module.py | 2 +- torch/nn/modules/pooling.py | 157 ++++++++--------- torch/nn/parallel/data_parallel.py | 7 +- torch/nn/utils/rnn.py | 9 +- 15 files changed, 360 insertions(+), 290 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2b71982be8cf6a..44fea84527acc2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -374,7 +374,7 @@ - func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor -- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad=0) -> Tensor - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) diff --git a/docs/source/conf.py b/docs/source/conf.py index cbcdf707f2fdb4..ca578c40e3df5a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -229,7 +229,7 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { 'python': ('https://docs.python.org/', None), - 'numpy': ('http://docs.scipy.org/doc/numpy/', None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), } # -- A patch that prevents Sphinx from cross-referencing ivar tags ------- diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index f7ace5e0edcac2..c3039583b04fb7 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1412,14 +1412,7 @@ def callable(a, b) -> number r""" narrow(dimension, start, length) -> Tensor -Returns a new tensor that is a narrowed version of :attr:`self` tensor. The -dimension :attr:`dim` is narrowed from :attr:`start` to :attr:`start + length`. The -returned tensor and :attr:`self` tensor share the same underlying storage. - -Args: - dimension (int): the dimension along which to narrow - start (int): the starting dimension - length (int): the distance to the ending dimension +See :func:`torch.narrow` Example:: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index ea6016f778f711..f66a4e1c17d5bc 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -117,7 +117,7 @@ def parse_kwargs(desc): and returns a new resulting tensor. .. 
math:: - out = input + value + \text{out} = \text{input} + \text{value} If :attr:`input` is of type FloatTensor or DoubleTensor, :attr:`value` must be a real number, otherwise it should be an integer. @@ -147,7 +147,7 @@ def parse_kwargs(desc): :ref:`broadcastable `. .. math:: - out = input + value \times other + \text{out} = \text{input} + \text{value} \times \text{other} If :attr:`other` is of type FloatTensor or DoubleTensor, :attr:`value` must be a real number, otherwise it should be an integer. @@ -197,7 +197,7 @@ def parse_kwargs(desc): and :attr:`out` will be a :math:`(n \times p)` tensor. .. math:: - out = \beta\ mat + \alpha\ (\sum_{i=0}^{b} batch1_i \mathbin{@} batch2_i) + out = \beta\ \text{mat} + \alpha\ (\sum_{i=0}^{b} \text{batch1}_i \mathbin{@} \text{batch2}_i) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers. @@ -229,7 +229,7 @@ def parse_kwargs(desc): multiply the result by the scalar :attr:`value` and add it to :attr:`tensor`. .. math:: - out_i = tensor_i + value \times \frac{tensor1_i}{tensor2_i} + \text{out}_i = \text{tensor}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be :ref:`broadcastable `. @@ -239,7 +239,7 @@ def parse_kwargs(desc): Args: tensor (Tensor): the tensor to be added - value (Number, optional): multiplier for :math:`tensor1 ./ tensor2` + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` tensor1 (Tensor): the numerator tensor tensor2 (Tensor): the denominator tensor out (Tensor, optional): the output tensor @@ -264,7 +264,7 @@ def parse_kwargs(desc): and add it to :attr:`tensor`. .. math:: - out_i = tensor_i + value \times tensor1_i \times tensor2_i + \text{out}_i = \text{tensor}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be :ref:`broadcastable `. @@ -306,7 +306,7 @@ def parse_kwargs(desc): :attr:`mat1` and :attr`mat2` and the added matrix :attr:`mat` respectively. .. math:: - out = \beta\ mat + \alpha\ (mat1_i \mathbin{@} mat2_i) + \text{out} = \beta\ \text{mat} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers. @@ -346,7 +346,7 @@ def parse_kwargs(desc): :attr:`mat` and :attr:`vec` and the added tensor :attr:`tensor` respectively. .. math:: - out = \beta\ tensor + \alpha\ (mat \mathbin{@} vec) + \text{out} = \beta\ \text{tensor} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers @@ -380,7 +380,7 @@ def parse_kwargs(desc): :attr:`mat` respectively. .. 
math:: - out = \beta\ mat + \alpha\ (vec1 \otimes vec2) + \text{out} = \beta\ \text{mat} + \alpha\ (\text{vec1} \otimes \text{vec2}) If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector of size `m`, then :attr:`mat` must be @@ -394,7 +394,7 @@ def parse_kwargs(desc): Args: beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) mat (Tensor): matrix to be added - alpha (Number, optional): multiplier for :math:`vec1 \otimes vec2` (:math:`\alpha`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) vec1 (Tensor): the first vector of the outer product vec2 (Tensor): the second vector of the outer product out (Tensor, optional): the output tensor @@ -427,7 +427,7 @@ def parse_kwargs(desc): other (Tensor): second tensor to compare atol (float, optional): absolute tolerance. Default: 1e-08 rtol (float, optional): relative tolerance. Default: 1e-05 - equal_nan (float, optional): if ``True``, then two ``NaN``s will be compared as ``True``. Default: ``False`` + equal_nan (float, optional): if ``True``, then two ``NaN`` s will be compared as equal. Default: ``False`` Example:: @@ -560,7 +560,7 @@ def parse_kwargs(desc): same as the scaling factors used in :meth:`torch.addbmm`. .. math:: - out_i = \beta\ mat_i + \alpha\ (batch1_i \mathbin{@} batch2_i) + \text{out}_i = \beta\ \text{mat}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers. @@ -568,7 +568,7 @@ def parse_kwargs(desc): Args: beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) mat (Tensor): the tensor to be added - alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) batch1 (Tensor): the first batch of matrices to be multiplied batch2 (Tensor): the second batch of matrices to be multiplied out (Tensor, optional): the output tensor @@ -609,7 +609,7 @@ def parse_kwargs(desc): Example:: - >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] >>> a tensor([[ 0.1737, 0.0950, 0.3609], [ 0.7148, 0.0289, 0.2676], @@ -642,8 +642,8 @@ def parse_kwargs(desc): tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least :attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size :attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``, -:math:`out[n] += weights[i]` if :attr:`weights` is specified else -:math:`out[n] += 1`. +``out[n] += weights[i]`` if :attr:`weights` is specified else +``out[n] += 1``. Arguments: input (Tensor): 1-d int tensor @@ -651,9 +651,9 @@ def parse_kwargs(desc): Should be of same size as input tensor. minlength (int): optional, minimum number of bins. Should be non-negative. -Shape: - output (Tensor): ``Size([max(input) + 1])`` if :attr:`input` is non-empty, else - ``Size(0)`` +Returns: + output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if + :attr:`input` is non-empty, else ``Size(0)`` Example:: @@ -685,7 +685,7 @@ def parse_kwargs(desc): :math:`(b \times n \times p)` tensor. .. math:: - out_i = batch1_i \mathbin{@} batch2_i + \text{out}_i = \text{batch1}_i \mathbin{@} \text{batch2}_i .. note:: This function does not :ref:`broadcast `. 
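For instance (shapes only; inputs are random, and the commented-out call is assumed to raise a size-mismatch error):

    >>> batch1 = torch.randn(10, 3, 4)
    >>> batch2 = torch.randn(10, 4, 5)
    >>> torch.bmm(batch1, batch2).size()
    torch.Size([10, 3, 5])
    >>> # torch.bmm(batch1, torch.randn(1, 4, 5))  # batch sizes must match exactly; bmm does not broadcast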
For broadcasting matrix products, see :func:`torch.matmul`. @@ -1259,7 +1259,7 @@ def parse_kwargs(desc): and returns a new resulting tensor. .. math:: - out_i = \frac{input_i}{value} + \text{out}_i = \frac{\text{input}_i}{\text{value}} If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, :attr:`value` should be a real number, otherwise it should be an integer @@ -1285,7 +1285,7 @@ def parse_kwargs(desc): :ref:`broadcastable `. .. math:: - out_i = \frac{input_i}{other_i} + \text{out}_i = \frac{\text{input}_i}{\text{other}_i} Args: input (Tensor): the numerator tensor @@ -1341,16 +1341,17 @@ def parse_kwargs(desc): (Tensor, Tensor): A tuple containing - **e** (*Tensor*): Shape :math:`(n \times 2)`. Each row is an eigenvalue of ``a``, - where the first element is the real part and the second element is the imaginary part. - The eigenvalues are not necessarily ordered. + where the first element is the real part and the second element is the imaginary part. + The eigenvalues are not necessarily ordered. - **v** (*Tensor*): If ``eigenvectors=False``, it's an empty tensor. - Otherwise, this tensor of shape :math:`(n \times n)` can be used to compute normalized (unit length) - eigenvectors of corresponding eigenvalues ``e`` as follows. - If the corresponding e[j] is a real number, column v[:, j] is the eigenvector corresponding to - eigenvalue e[j]. - If the corresponding e[j] and e[j + 1] eigenvalues form a complex conjugate pair, then the true eigenvectors - can be computed as - :math:`eigenvector[j] = v[:, j] + i * v[:, j + 1], eigenvector[j + 1] = v[:, j] - i * v[:, j + 1]`. + Otherwise, this tensor of shape :math:`(n \times n)` can be used to compute normalized (unit length) + eigenvectors of corresponding eigenvalues ``e`` as follows. + If the corresponding e[j] is a real number, column v[:, j] is the eigenvector corresponding to + eigenvalue e[j]. + If the corresponding e[j] and e[j + 1] eigenvalues form a complex conjugate pair, then the true eigenvectors + can be computed as + :math:`\text{eigenvector}[j] = v[:, j] + i \times v[:, j + 1]`, + :math:`\text{eigenvector}[j + 1] = v[:, j] - i \times v[:, j + 1]`. """) add_docstr(torch.eq, @@ -1645,7 +1646,7 @@ def parse_kwargs(desc): If :attr:`input` is an n-dimensional tensor with size :math:`(x_0, x_1..., x_{i-1}, x_i, x_{i+1}, ..., x_{n-1})` -and :attr:`dim` :math:`= i`, then :attr:`index` must be an :math:`n`-dimensional tensor with +and ``dim = i``, then :attr:`index` must be an :math:`n`-dimensional tensor with size :math:`(x_0, x_1, ..., x_{i-1}, y, x_{i+1}, ..., x_{n-1})` where :math:`y \geq 1` and :attr:`out` will have the same size as :attr:`index`. @@ -1667,7 +1668,7 @@ def parse_kwargs(desc): r""" ge(input, other, out=None) -> Tensor -Computes :math:`input \geq other` element-wise. +Computes :math:`\text{input} \geq \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -1708,7 +1709,7 @@ def parse_kwargs(desc): .. math:: \begin{array}{ll} - \min_X & \|X\|_2 & \mbox{subject to} & AX = B. + \min_X & \|X\|_2 & \text{subject to} & AX = B. \end{array} Returned tensor :math:`X` has shape :math:`(\max(m, n) \times k)`. The first :math:`n` @@ -1821,7 +1822,7 @@ def parse_kwargs(desc): .. note:: - The `out` keyword only supports 2D matrix inputs, that is, + The :attr:`out` keyword only supports 2D matrix inputs, that is, `B, A` must be 2D matrices. .. 
note:: @@ -1832,10 +1833,10 @@ def parse_kwargs(desc): `A.contiguous().transpose(-1, -2).strides()` respectively. Args: - B (Tensor): input matrix of size :math:`(*, m, k)` , where `*` - is zero or more batch dimensions. + B (Tensor): input matrix of size :math:`(*, m, k)` , where :math:`*` + is zero or more batch dimensions. A (Tensor): input square matrix of size :math:`(*, m, m)`, where - `*` is zero or more batch dimensions. + :math:`*` is zero or more batch dimensions. out ((Tensor, Tensor), optional): optional output tuple. Example:: @@ -1893,7 +1894,7 @@ def parse_kwargs(desc): r""" gt(input, other, out=None) -> Tensor -Computes :math:`input > other` element-wise. +Computes :math:`\text{input} > \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -2054,7 +2055,7 @@ def parse_kwargs(desc): r""" le(input, other, out=None) -> Tensor -Computes :math:`input \leq other` element-wise. +Computes :math:`\text{input} \leq \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -2082,7 +2083,7 @@ def parse_kwargs(desc): on a scalar :attr:`weight` and returns the resulting :attr:`out` tensor. .. math:: - out_i = start_i + weight \times (end_i - start_i) + \text{out}_i = \text{start}_i + \text{weight} \times (\text{end}_i - \text{start}_i) The shapes of :attr:`start` and :attr:`end` must be :ref:`broadcastable `. @@ -2299,7 +2300,7 @@ def parse_kwargs(desc): r""" lt(input, other, out=None) -> Tensor -Computes :math:`input < other` element-wise. +Computes :math:`\text{input} < \text{other}` element-wise. The second argument can be a number or a tensor whose shape is :ref:`broadcastable ` with the first argument. @@ -2471,7 +2472,7 @@ def parse_kwargs(desc): but they must be :ref:`broadcastable `. .. math:: - out_i = \max(tensor_i, other_i) + \text{out}_i = \max(\text{tensor}_i, \text{other}_i) .. note:: When the shapes do not match, the shape of the returned output tensor follows the :ref:`broadcasting rules `. @@ -2647,7 +2648,7 @@ def parse_kwargs(desc): but they must be :ref:`broadcastable `. .. math:: - out_i = \min(tensor_i, other_i) + \text{out}_i = \min(\text{tensor}_i, \text{other}_i) .. note:: When the shapes do not match, the shape of the returned output tensor follows the :ref:`broadcasting rules `. @@ -2803,7 +2804,7 @@ def parse_kwargs(desc): :attr:`value` and returns a new resulting tensor. .. math:: - out_i = value \times input_i + \text{out}_i = \text{value} \times \text{input}_i If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, :attr:`value` should be a real number, otherwise it should be an integer @@ -2830,7 +2831,7 @@ def parse_kwargs(desc): :ref:`broadcastable `. .. math:: - out_i = input_i \times other_i + \text{out}_i = \text{input}_i \times \text{other}_i Args: input (Tensor): the first multiplicand tensor @@ -2874,7 +2875,7 @@ def parse_kwargs(desc): If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`. If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape -:math:`(m \times num\_samples)`. +:math:`(m \times \text{num\_samples})`. If replacement is ``True``, samples are drawn with replacement. @@ -2953,6 +2954,32 @@ def parse_kwargs(desc): [1.0311, 0.3901, 0.5049]]) """) +add_docstr(torch.narrow, + r""" +narrow(input, dimension, start, length) -> Tensor + +Returns a new tensor that is a narrowed version of :attr:`input` tensor. 
The +dimension :attr:`dim` is input from :attr:`start` to :attr:`start + length`. The +returned tensor and :attr:`self` tensor share the same underlying storage. + +Args: + input (Tensor): the tensor to narrow + dimension (int): the dimension along which to narrow + start (int): the starting dimension + length (int): the distance to the ending dimension + +Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) +""") + add_docstr(torch.ne, r""" ne(input, other, out=None) -> Tensor @@ -2984,7 +3011,7 @@ def parse_kwargs(desc): Returns a new tensor with the negative of the elements of :attr:`input`. .. math:: - out = -1 \times input + \text{out} = -1 \times \text{input} Args: input (Tensor): the input tensor @@ -3313,13 +3340,13 @@ def parse_kwargs(desc): Cholesky factor :attr:`u`: returns matrix `inv` If :attr:`upper` is ``True`` or not provided, :attr:`u` is upper -triangular such that: +triangular such that the returned tensor is .. math:: inv = (u^T u)^{-1} If :attr:`upper` is ``False``, :attr:`u` is lower triangular -such that: +such that the tensor value is .. math:: inv = (uu^{T})^{-1} @@ -3412,12 +3439,12 @@ def parse_kwargs(desc): When :attr:`exponent` is a scalar value, the operation applied is: .. math:: - out_i = x_i ^ {exponent} + \text{out}_i = x_i ^ \text{exponent} When :attr:`exponent` is a tensor, the operation applied is: .. math:: - out_i = x_i ^ {exponent_i} + \text{out}_i = x_i ^ {\text{exponent}_i} When :attr:`exponent` is a tensor, the shapes of :attr:`input` and :attr:`exponent` must be :ref:`broadcastable `. @@ -3623,7 +3650,7 @@ def parse_kwargs(desc): >>> torch.rand(2, 3) tensor([[ 0.8237, 0.5781, 0.6879], [ 0.3816, 0.7249, 0.0998]]) -""") +""".format(**factory_common_args)) add_docstr(torch.rand_like, r""" @@ -3824,13 +3851,13 @@ def parse_kwargs(desc): r""" range(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor -Returns a 1-D tensor of size :math:`\left\lfloor \frac{{end - start}}{{step}} \right\rfloor + 1` +Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is the gap between two values in the tensor. .. math:: - \text{{out}}_{{i+1}} = \text{{out}}_i + step. - + \text{out}_{i+1} = \text{out}_i + \text{step}. +""" + r""" .. warning:: This function is deprecated in favor of :func:`torch.arange`. @@ -3856,7 +3883,7 @@ def parse_kwargs(desc): r""" arange(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor -Returns a 1-D tensor of size :math:`\left\lfloor \frac{{end - start}}{{step}} \right\rfloor` +Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor` with values from the interval ``[start, end)`` taken with common difference :attr:`step` beginning from `start`. @@ -3865,8 +3892,8 @@ def parse_kwargs(desc): in such cases. .. math:: - \text{{out}}_{{i+1}} = \text{{out}}_{{i}} + \text{{step}} - + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} +""" + r""" Args: start (Number): the starting value for the set of points. Default: ``0``. 
end (Number): the ending value for the set of points @@ -4293,8 +4320,8 @@ def parse_kwargs(desc): When :attr:`dim` is given, a squeeze operation is done only in the given dimension. If `input` is of shape: :math:`(A \times 1 \times B)`, -`squeeze(input, 0)` leaves the tensor unchanged, but :func:`squeeze(input, 1)` will -squeeze the tensor to the shape :math:`(A \times B)`. +``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` +will squeeze the tensor to the shape :math:`(A \times B)`. .. note:: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. @@ -4504,7 +4531,7 @@ def parse_kwargs(desc): :math:`m` dimensional vector. This function calculates all eigenvalues (and vectors) of :attr:`input` -such that :math:`input = V diag(e) V^T`. +such that :math:`\text{input} = V \text{diag}(e) V^T`. The boolean argument :attr:`eigenvectors` defines computation of eigenvectors or eigenvalues only. @@ -4530,10 +4557,10 @@ def parse_kwargs(desc): (Tensor, Tensor): A tuple containing - **e** (*Tensor*): Shape :math:`(m)`. Each element is an eigenvalue of ``input``, - The eigenvalues are in ascending order. + The eigenvalues are in ascending order. - **V** (*Tensor*): Shape :math:`(m \times m)`. - If ``eigenvectors=False``, it's a tensor filled with zeros. - Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. + If ``eigenvectors=False``, it's a tensor filled with zeros. + Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. Examples:: @@ -4561,7 +4588,7 @@ def parse_kwargs(desc): Expects :attr:`input` to be a matrix (2-D tensor) and transposes dimensions 0 and 1. -Can be seen as a short-hand function for :meth:`transpose(input, 0, 1)` +Can be seen as a short-hand function for ``transpose(input, 0, 1)``. Args: input (Tensor): the input tensor @@ -4907,28 +4934,28 @@ def parse_kwargs(desc): r""" trtrs(b, A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor) -Solves a system of equations with a triangular coefficient matrix `A` -and multiple right-hand sides `b`. +Solves a system of equations with a triangular coefficient matrix :math:`A` +and multiple right-hand sides :attr:`b`. -In particular, solves :math:`AX = b` and assumes `A` is upper-triangular +In particular, solves :math:`AX = b` and assumes :math:`A` is upper-triangular with the default keyword arguments. Args: A (Tensor): the input triangular coefficient matrix - b (Tensor): multiple right-hand sides. Each column of `b` is a + b (Tensor): multiple right-hand sides. Each column of :math:`b` is a right-hand side for the system of equations. upper (bool, optional): whether to solve the upper-triangular system of equations (default) or the lower-triangular system of equations. Default: True. - transpose (bool, optional): whether `A` should be transposed before + transpose (bool, optional): whether :math:`A` should be transposed before being sent into the solver. Default: False. - unitriangular (bool, optional): whether `A` is unit triangular. - If True, the diagonal elements of `A` are assumed to be - 1 and not referenced from `A`. Default: False. + unitriangular (bool, optional): whether :math:`A` is unit triangular. + If True, the diagonal elements of :math:`A` are assumed to be + 1 and not referenced from :math:`A`. Default: False. 
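As a minimal sketch (random but deliberately well-conditioned data), solving an upper-triangular system and checking the residual:

    >>> A = torch.randn(3, 3).triu()
    >>> A += 5 * torch.eye(3)          # keep the triangular system well-conditioned
    >>> b = torch.randn(3, 2)          # two right-hand sides
    >>> X, M = torch.trtrs(b, A)       # solves A @ X = b with A upper-triangular
    >>> torch.allclose(A.mm(X), b)
    True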
Returns: - A tuple (X, M) where `M` is a clone of `A` and `X` is the solution to - `AX = b` (or whatever variant of the system of equations, depending on - the keyword arguments.) + A tuple :math:`(X, M)` where :math:`M` is a clone of :math:`A` and :math:`X` + is the solution to :math:`AX = b` (or whatever variant of the system of + equations, depending on the keyword arguments.) Shape: - A: :math:`(N, N)` @@ -5968,17 +5995,17 @@ def parse_kwargs(desc): meshgrid(seq) -> seq Take a sequence of :math:`N` tensors, each of which can be either scalar or 1-dimensional -vector, and create :math:`N` N-dimensional grids, where the :math:`i`th grid is defined by -expanding the :math:`i`th input over dimensions defined by other inputs. +vector, and create :math:`N` N-dimensional grids, where the :math:`i` th grid is defined by +expanding the :math:`i` th input over dimensions defined by other inputs. Arguments: seq (sequence of Tensors): sequence of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically. Returns: - seq (sequence of Tensors): If the input has :math:`k` tensors of size - :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also has :math:`k` tensors, - where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. + seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, + then the output would also has :math:`k` tensors, where all tensors are + of size :math:`(N_1, N_2, \ldots , N_k)`. Example:: diff --git a/torch/functional.py b/torch/functional.py index d84597f3fff437..e31903db5a7129 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -50,7 +50,7 @@ def split(tensor, split_size_or_sections, dim=0): If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if - the tensor size along the given dimension :attr:`dim= is not divisible by + the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split @@ -287,8 +287,8 @@ def stft(input, n_fft, hop_length=None, win_length=None, window=None, .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length}}% - window[k]\ input[m \times hop_length + k]\ % - e^{- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}}, + \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % + \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When @@ -345,7 +345,7 @@ def stft(input, n_fft, hop_length=None, win_length=None, window=None, win_length (int): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. - Default: ``None`` (treated as window of all :math:`1`s) + Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` @@ -452,7 +452,7 @@ def unique(input, sorted=False, return_inverse=False, dim=None): def argmax(input, dim=None, keepdim=False): - """Returns the indices of the maximum values of a tensor across a dimension. + r"""Returns the indices of the maximum values of a tensor across a dimension. 
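For example, with a fixed input:

    >>> a = torch.tensor([[1., 5., 2.],
    ...                   [7., 0., 3.]])
    >>> torch.argmax(a, dim=1).tolist()   # per-row index of the maximum
    [1, 0]
    >>> torch.argmax(a).item()            # with no dim, index into the flattened tensor
    3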
This is the second value returned by :meth:`torch.max`. See its documentation for the exact semantics of this method. @@ -483,7 +483,7 @@ def argmax(input, dim=None, keepdim=False): def argmin(input, dim=None, keepdim=False): - """Returns the indices of the minimum values of a tensor across a dimension. + r"""Returns the indices of the minimum values of a tensor across a dimension. This is the second value returned by :meth:`torch.min`. See its documentation for the exact semantics of this method. @@ -514,7 +514,7 @@ def argmin(input, dim=None, keepdim=False): def tensordot(a, b, dims=2): - """Returns a contraction of a and b over multiple dimensions. + r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalizes the matrix product. @@ -531,7 +531,7 @@ def tensordot(a, b, dims=2): .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} - = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} * b_{k_0,...,k_{d-1}, i_d,...,i_n}. + = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes @@ -569,7 +569,7 @@ def tensordot(a, b, dims=2): def argsort(input, dim=None, descending=False): - """Returns the indices that sort a tensor along a given dimension in ascending + r"""Returns the indices that sort a tensor along a given dimension in ascending order by value. This is the second value returned by :meth:`torch.sort`. See its documentation diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 47b7375bd1f456..d9889d631f7fb8 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -68,16 +68,16 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv1d` for details and output shape. Args: - input: input tensor of shape :math:`minibatch \times in\_channels \times iW` - weight: filters of shape :math:`out\_channels \times \frac{in\_channels}{groups} \times kW` - bias: optional bias of shape (:math:`out\_channels`). Default: ``None`` + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` + weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: ``None`` stride: the stride of the convolving kernel. Can be a single number or a one-element tuple `(sW,)`. Default: 1 padding: implicit zero paddings on both sides of the input. Can be a single number or a one-element tuple `(padW,)`. Default: 0 dilation: the spacing between kernel elements. Can be a single number or a one-element tuple `(dW,)`. Default: 1 - groups: split input into groups, :math:`in\_channels` should be divisible by + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 Examples:: @@ -96,16 +96,16 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv2d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iH \times iW`) - weight: filters of shape (:math:`out\_channels \times \frac{in\_channels}{groups} \times kH \times kW`) - bias: optional bias tensor of shape (:math:`out\_channels`). 
Default: ``None`` + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` + weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kH \times kW)` + bias: optional bias tensor of shape :math:`(\text{out\_channels})`. Default: ``None`` stride: the stride of the convolving kernel. Can be a single number or a tuple `(sH, sW)`. Default: 1 padding: implicit zero paddings on both sides of the input. Can be a single number or a tuple `(padH, padW)`. Default: 0 dilation: the spacing between kernel elements. Can be a single number or a tuple `(dH, dW)`. Default: 1 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 Examples:: @@ -114,7 +114,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> filters = torch.randn(8,4,3,3) >>> inputs = torch.randn(1,4,5,5) >>> F.conv2d(inputs, filters, padding=1) -""") +""") # noqa: E501 conv3d = _add_docstr(torch.conv3d, r""" conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor @@ -125,16 +125,16 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv3d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iT \times iH \times iW`) - weight: filters of shape (:math:`out\_channels \times \frac{in\_channels}{groups} \times kT \times kH \times kW`) - bias: optional bias tensor of shape (:math:`out\_channels`). Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` + weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kT \times kH \times kW)` + bias: optional bias tensor of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple `(sT, sH, sW)`. Default: 1 padding: implicit zero paddings on both sides of the input. Can be a single number or a tuple `(padT, padH, padW)`. Default: 0 dilation: the spacing between kernel elements. Can be a single number or a tuple `(dT, dH, dW)`. Default: 1 - groups: split input into groups, :math:`in\_channels` should be divisible by + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 Examples:: @@ -142,7 +142,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> filters = torch.randn(33, 16, 3, 3, 3) >>> inputs = torch.randn(20, 16, 50, 10, 20) >>> F.conv3d(inputs, filters) -""") +""") # noqa: E501 conv_transpose1d = _add_docstr(torch.conv_transpose1d, r""" conv_transpose1d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor @@ -153,9 +153,9 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose1d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iW`) - weight: filters of shape (:math:`in\_channels \times \frac{out\_channels}{groups} \times kW`) - bias: optional bias of shape (:math:`out\_channels`). 
Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` + weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple ``(sW,)``. Default: 1 padding: ``kernel_size - 1 - padding`` zero-padding will be added to both @@ -163,7 +163,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): ``(padW,)``. Default: 0 output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple ``(out_padW)``. Default: 0 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 dilation: the spacing between kernel elements. Can be a single number or a tuple ``(dW,)``. Default: 1 @@ -184,9 +184,9 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose2d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iH \times iW`) - weight: filters of shape (:math:`in\_channels \times \frac{out\_channels}{groups} \times kH \times kW`) - bias: optional bias of shape (:math:`out\_channels`). Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` + weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kH \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple ``(sH, sW)``. Default: 1 padding: ``kernel_size - 1 - padding`` zero-padding will be added to both @@ -195,7 +195,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple ``(out_padH, out_padW)``. Default: 0 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 dilation: the spacing between kernel elements. Can be a single number or a tuple ``(dH, dW)``. Default: 1 @@ -206,7 +206,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> inputs = torch.randn(1, 4, 5, 5) >>> weights = torch.randn(4, 8, 3, 3) >>> F.conv_transpose2d(inputs, weights, padding=1) -""") +""") # noqa: E501 conv_transpose3d = _add_docstr(torch.conv_transpose3d, r""" conv_transpose3d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor @@ -217,9 +217,9 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose3d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iT \times iH \times iW`) - weight: filters of shape (:math:`in\_channels \times \frac{out\_channels}{groups} \times kT \times kH \times kW`) - bias: optional bias of shape (:math:`out\_channels`). 
Default: None + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` + weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kT \times kH \times kW)` + bias: optional bias of shape :math:`(\text{out\_channels})`. Default: None stride: the stride of the convolving kernel. Can be a single number or a tuple ``(sT, sH, sW)``. Default: 1 padding: ``kernel_size - 1 - padding`` zero-padding will be added to both @@ -228,7 +228,7 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple ``(out_padT, out_padH, out_padW)``. Default: 0 - groups: split input into groups, :math:`in\_channels` should be divisible by the + groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the number of groups. Default: 1 dilation: the spacing between kernel elements. Can be a single number or a tuple `(dT, dH, dW)`. Default: 1 @@ -238,20 +238,18 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): >>> inputs = torch.randn(20, 16, 50, 10, 20) >>> weights = torch.randn(16, 33, 3, 3, 3) >>> F.conv_transpose3d(inputs, weights) -""") - +""") # noqa: E501 -def conv_tbc(input, weight, bias, pad=0): - r"""Applies a 1-dimensional sequence convolution over an input sequence. - Input and output dimensions are (Time, Batch, Channels) - hence TBC. +conv_tbc = _add_docstr(torch.conv_tbc, r""" +Applies a 1-dimensional sequence convolution over an input sequence. +Input and output dimensions are (Time, Batch, Channels) - hence TBC. - Args: - input: input tensor of shape (:math:`\text{sequence length} \times batch \times in\_channels`) - weight: filter of shape (:math:`\text{kernel width} \times in\_channels \times out\_channels`) - bias: bias of shape (:math:`out\_channels`) - pad: number of timesteps to pad - """ - return torch.conv_tbc(input, weight, bias, pad) +Args: + input: input tensor of shape :math:`(\text{sequence length} \times batch \times \text{in\_channels})` + weight: filter of shape (:math:`\text{kernel width} \times \text{in\_channels} \times \text{out\_channels}`) + bias: bias of shape (:math:`\text{out\_channels}`) + pad: number of timesteps to pad. Default: 0 +""") # Pooling @@ -264,9 +262,9 @@ def conv_tbc(input, weight, bias, pad=0): See :class:`~torch.nn.AvgPool1d` for details and output shape. Args: - input: input tensor of shape (:math:`minibatch \times in\_channels \times iW`) + input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` kernel_size: the size of the window. Can be a single number or a - tuple `(kW,)` + tuple :math:`(kW,)` stride: the stride of the window. Can be a single number or a tuple `(sW,)`. Default: :attr:`kernel_size` padding: implicit zero paddings on both sides of the input. Can be a @@ -276,11 +274,12 @@ def conv_tbc(input, weight, bias, pad=0): count_include_pad: when True, will include the zero-padding in the averaging calculation. Default: ``True`` -Example:: +Examples:: >>> # pool of square window of size=3, stride=2 >>> input = torch.tensor([[[1,2,3,4,5,6,7]]]) >>> F.avg_pool1d(input, kernel_size=3, stride=2) tensor([[[ 2., 4., 6.]]]) + """) @@ -294,9 +293,9 @@ def conv_tbc(input, weight, bias, pad=0): See :class:`~torch.nn.AvgPool2d` for details and output shape. 
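(Illustrative note, not part of the diff: the average-pooling behaviour documented here can be sanity-checked with the functional API; the output spatial size follows the floor formula given in the AvgPool2d module docs, floor((H_in + 2*padding - kernel_size)/stride) + 1.)

```
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
# H_out = floor((H_in + 2 * padding - kernel_size) / stride) + 1
print(F.avg_pool2d(x, kernel_size=3, stride=2).shape)             # torch.Size([1, 3, 3, 3])
print(F.avg_pool2d(x, kernel_size=3, stride=2, padding=1).shape)  # torch.Size([1, 3, 4, 4])
```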
Args: - input: input tensor (:math:`minibatch \times in\_channels \times iH \times iW`) + input: input tensor :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` kernel_size: size of the pooling region. Can be a single number or a - tuple (:math:`kH \times kW`) + tuple :math:`(kH \times kW)` stride: stride of the pooling operation. Can be a single number or a tuple `(sH, sW)`. Default: :attr:`kernel_size` padding: implicit zero paddings on both sides of the input. Can be a @@ -317,9 +316,9 @@ def conv_tbc(input, weight, bias, pad=0): See :class:`~torch.nn.AvgPool3d` for details and output shape. Args: - input: input tensor (:math:`minibatch \times in\_channels \times iT \times iH \times iW`) + input: input tensor :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` kernel_size: size of the pooling region. Can be a single number or a - tuple (:math:`kT \times kH \times kW`) + tuple :math:`(kT \times kH \times kW)` stride: stride of the pooling operation. Can be a single number or a tuple `(sT, sH, sW)`. Default: :attr:`kernel_size` padding: implicit zero paddings on both sides of the input. Can be a @@ -345,13 +344,13 @@ def fractional_max_pool2d(input, kernel_size, output_size=None, Args: kernel_size: the size of the window to take a max over. Can be a single number :math:`k` (for a square kernel of :math:`k \times k`) - or a tuple (:math:`kH \times kW`) + or a tuple :math:`(kH \times kW)` output_size: the target output size of the image of the form :math:`oH \times oW`. Can be a tuple `(oH, oW)` or a single number :math:`oH` for a square image :math:`oH \times oH` output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. This has to be a number or tuple in the range (0, 1) return_indices: if ``True``, will return the indices along with the outputs. - Useful to pass to `max_unpool2d`. + Useful to pass to :func:`~torch.nn.functional.max_unpool2d`. Examples:: >>> input = torch.randn(20, 16, 50, 32) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 454151afed8201..ef79b1ffb6a289 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -403,8 +403,8 @@ class ParameterDict(Module): Arguments: parameters (iterable, optional): a mapping (dictionary) of - (string : :class:`~torch.nn.Parameter``) or an iterable of key,value pairs - of type (string, :class:`~torch.nn.Parameter``) + (string : :class:`~torch.nn.Parameter`) or an iterable of key,value pairs + of type (string, :class:`~torch.nn.Parameter`) Example:: @@ -480,8 +480,8 @@ def update(self, parameters): Arguments: parameters (iterable): a mapping (dictionary) of - (string : :class:`~torch.nn.Parameter``) or an iterable of - key/value pairs of type (string, :class:`~torch.nn.Parameter``) + (string : :class:`~torch.nn.Parameter`) or an iterable of + key/value pairs of type (string, :class:`~torch.nn.Parameter`) """ if not isinstance(parameters, Iterable): raise TypeError("ParametersDict.update should be called with an " diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 03cf297e3c0aa6..7cbf5fbf604314 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -101,24 +101,24 @@ class Conv1d(_ConvNd): * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters, of size - :math:`\left\lfloor\frac{\text{out\_channels}}{\text{in\_channels}}\right\rfloor` + :math:`\left\lfloor\frac{C_\text{out}}{C_\text{in}}\right\rfloor` .. 
note:: - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid - `cross-correlation`_, and not a full `cross-correlation`_. - It is up to the user to add proper padding. + Depending of the size of your kernel, several (of the last) + columns of the input might be lost, because it is a valid + `cross-correlation`_, and not a full `cross-correlation`_. + It is up to the user to add proper padding. .. note:: - The configuration when `groups == in_channels` and `out_channels == K * in_channels` - where `K` is a positive integer is termed in literature as depthwise convolution. + When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also termed in + literature as depthwise convolution. - In other words, for an input of size :math:`(N, C_{in}, L_{in})`, if you want a - depthwise convolution with a depthwise multiplier `K`, - then you use the constructor arguments - :math:`(\text{in\_channels}=C_{in}, \text{out\_channels}=C_{in} * K, ..., \text{groups}=C_{in})` + In other words, for an input of size :math:`(N, C_{in}, L_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments + :math:`(C_\text{in}=C_{in}, C_\text{out}=C_{in} \times K, ..., \text{groups}=C_{in})`. Args: in_channels (int): Number of channels in the input image @@ -145,11 +145,11 @@ class Conv1d(_ConvNd): weight (Tensor): the learnable weights of the module of shape (out_channels, in_channels, kernel_size). The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` Examples:: @@ -218,7 +218,7 @@ class Conv2d(_ConvNd): concatenated. * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters, of size: - :math:`\left\lfloor\frac{\text{out\_channels}}{\text{in\_channels}}\right\rfloor`. + :math:`\left\lfloor\frac{C_\text{out}}{C_\text{in}}\right\rfloor`. The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`dilation` can either be: @@ -235,13 +235,13 @@ class Conv2d(_ConvNd): .. note:: - The configuration when `groups == in_channels` and `out_channels == K * in_channels` - where `K` is a positive integer is termed in literature as depthwise convolution. + When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also termed in + literature as depthwise convolution. - In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, if you want a - depthwise convolution with a depthwise multiplier `K`, - then you use the constructor arguments - :math:`(in\_channels=C_{in}, out\_channels=C_{in} * K, ..., groups=C_{in})` + In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments + :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. 
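(Illustrative sketch, not part of the diff: the depthwise-convolution construction described in the note above, written out with the constructor arguments it names.)

```
import torch
from torch import nn

# groups == in_channels and out_channels == K * in_channels gives a depthwise convolution
C_in, K = 4, 3
conv = nn.Conv2d(in_channels=C_in, out_channels=C_in * K, kernel_size=3,
                 padding=1, groups=C_in)
x = torch.randn(1, C_in, 8, 8)
print(conv(x).shape)      # torch.Size([1, 12, 8, 8])
print(conv.weight.shape)  # torch.Size([12, 1, 3, 3]); each filter sees a single input channel
```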
Args: in_channels (int): Number of channels in the input image @@ -261,6 +261,7 @@ class Conv2d(_ConvNd): H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor @@ -269,11 +270,11 @@ class Conv2d(_ConvNd): (out_channels, in_channels, kernel_size[0], kernel_size[1]). The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` Examples:: @@ -357,13 +358,13 @@ class Conv3d(_ConvNd): .. note:: - The configuration when `groups == in_channels` and `out_channels == K * in_channels` - where `K` is a positive integer is termed in literature as depthwise convolution. + When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also termed in + literature as depthwise convolution. - In other words, for an input of size :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`, if you want a - depthwise convolution with a depthwise multiplier `K`, - then you use the constructor arguments - :math:`(in\_channels=C_{in}, out\_channels=C_{in} * K, ..., groups=C_{in})` + In other words, for an input of size :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments + :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. Args: in_channels (int): Number of channels in the input image @@ -383,9 +384,11 @@ class Conv3d(_ConvNd): D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor + .. math:: H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor + .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor @@ -394,11 +397,11 @@ class Conv3d(_ConvNd): (out_channels, in_channels, kernel_size[0], kernel_size[1], kernel_size[2]) The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels). 
If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` Examples:: @@ -550,11 +553,11 @@ class ConvTranspose1d(_ConvTransposeMixin, _ConvNd): (in_channels, out_channels, kernel_size[0], kernel_size[1]). The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` bias (Tensor): the learnable bias of the module of shape (out_channels). If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \text{kernel\_size}}` + :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}` """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, @@ -663,11 +666,11 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): (in_channels, out_channels, kernel_size[0], kernel_size[1]) The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels) If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}` Examples:: @@ -806,11 +809,11 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): (in_channels, out_channels, kernel_size[0], kernel_size[1], kernel_size[2]) The values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` bias (Tensor): the learnable bias of the module of shape (out_channels) If :attr:`bias` is ``True``, then the values of these weights are sampled from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where - :math:`k = \frac{1}{\text{in\_channels} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` + :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}` Examples:: diff --git a/torch/nn/modules/fold.py b/torch/nn/modules/fold.py index 4465b9b10f0e7e..d0035820927e32 100644 --- a/torch/nn/modules/fold.py +++ b/torch/nn/modules/fold.py @@ -15,11 +15,12 @@ class Fold(Module): :math:`L` is the total number of blocks. (This is exacly the same specification as the output shape of :class:`~torch.nn.Unfold`.) This operation combines these local blocks into the large :attr:`output` tensor - of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`. - Similar to :class:`~torch.nn.Unfold`, the arguments must satisfy + of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)` + by summing the overlapping values. Similar to :class:`~torch.nn.Unfold`, the + arguments must satisfy .. 
math:: - L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] \ + L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] % - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor, where :math:`d` is over all spatial dimensions. @@ -42,7 +43,8 @@ class Fold(Module): It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. Args: - output_size (int or tuple): the shape of the spatial dimensions [2:] of the output + output_size (int or tuple): the shape of the spatial dimensions of the + output (i.e., ``input.sizes()[2:]``) kernel_size (int or tuple): the size of the sliding blocks stride (int or tuple): the stride of the sliding blocks in the input spatial dimensions. Default: 1 @@ -59,6 +61,13 @@ class Fold(Module): * For the case of two output spatial dimensions this operation is sometimes called ``col2im``. + .. note:: + :class:`~torch.nn.Fold` calculates each combined value in the resulting + large tensor by summing all values from all containing blocks. + :class:`~torch.nn.Unfold` extracts the values in the local blocks by + copying from the large tensor. So, if the blocks overlap, they are not + inverses of each other. + .. warning:: Currently, only 4-D output tensors (batched image-like tensors) are supported. @@ -113,10 +122,10 @@ class Unfold(Module): the total number of such blocks: .. math:: - L = \prod_d \left\lfloor\frac{\text{input\_spatial\_size}[d] + 2 \times \text{padding}[d] \ + L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] % - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor, - where :math:`\text{input\_spatial\_size}` is formed by the spatial dimensions + where :math:`\text{spatial\_size}` is formed by the spatial dimensions of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial dimensions. @@ -152,6 +161,13 @@ class Unfold(Module): * For the case of two input spatial dimensions this operation is sometimes called ``im2col``. + .. note:: + :class:`~torch.nn.Fold` calculates each combined value in the resulting + large tensor by summing all values from all containing blocks. + :class:`~torch.nn.Unfold` extracts the values in the local blocks by + copying from the large tensor. So, if the blocks overlap, they are not + inverses of each other. + .. warning:: Currently, only 4-D input tensors (batched image-like tensors) are supported. diff --git a/torch/nn/modules/instancenorm.py b/torch/nn/modules/instancenorm.py index f79bbc61dd4e71..24932a678ec568 100644 --- a/torch/nn/modules/instancenorm.py +++ b/torch/nn/modules/instancenorm.py @@ -78,6 +78,15 @@ class InstanceNorm1d(_InstanceNorm): where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value. + .. note:: + :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm1d` is applied + on each channel of channeled data like multidimensional time series, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm1d` usually don't apply affine + transform. 
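(Illustrative sketch, not part of the diff: the InstanceNorm-vs-LayerNorm note above in code; shapes and defaults follow the public torch.nn API.)

```
import torch
from torch import nn

x = torch.randn(2, 3, 5)        # (N, C, L) channeled time-series data

inorm = nn.InstanceNorm1d(3)    # normalizes each (sample, channel) pair over L; affine=False by default
lnorm = nn.LayerNorm([3, 5])    # normalizes each sample over (C, L); elementwise affine by default

print(inorm(x).shape, lnorm(x).shape)            # torch.Size([2, 3, 5]) torch.Size([2, 3, 5])
print(inorm.weight is None, lnorm.weight.shape)  # True torch.Size([3, 5])
```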
+ Args: num_features: :math:`C` from an expected input of size :math:`(N, C, L)` or :math:`L` from input of size :math:`(N, L)` @@ -143,6 +152,15 @@ class InstanceNorm2d(_InstanceNorm): where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value. + .. note:: + :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm2d` is applied + on each channel of channeled data like RGB images, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm2d` usually don't apply affine + transform. + Args: num_features: :math:`C` from an expected input of size :math:`(N, C, H, W)` @@ -208,6 +226,15 @@ class InstanceNorm3d(_InstanceNorm): where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value. + .. note:: + :class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but + have some subtle differences. :class:`InstanceNorm3d` is applied + on each channel of channeled data like 3D models with RGB color, but + :class:`LayerNorm` is usually applied on entire sample and often in NLP + tasks. Additionaly, :class:`LayerNorm` applies elementwise affine + transform, while :class:`InstanceNorm3d` usually don't apply affine + transform. + Args: num_features: :math:`C` from an expected input of size :math:`(N, C, D, H, W)` diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py index 573c6b20cbf035..3cd4661e5a6de2 100644 --- a/torch/nn/modules/linear.py +++ b/torch/nn/modules/linear.py @@ -17,17 +17,17 @@ class Linear(Module): Default: ``True`` Shape: - - Input: :math:`(N, *, in\_features)` where :math:`*` means any number of + - Input: :math:`(N, *, \text{in\_features})` where :math:`*` means any number of additional dimensions - - Output: :math:`(N, *, out\_features)` where all but the last dimension + - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension are the same shape as the input. Attributes: weight: the learnable weights of the module of shape - `(out_features x in_features)`. The values are initialized from - :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k = \frac{1}{\text{in\_features}}` - bias: the learnable bias of the module of shape :math:`(out_features)`. + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. If :attr:`bias` is ``True``, the values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where :math:`k = \frac{1}{\text{in\_features}}` @@ -38,6 +38,7 @@ class Linear(Module): >>> input = torch.randn(128, 20) >>> output = m(input) >>> print(output.size()) + torch.Size([128, 30]) """ def __init__(self, in_features, out_features, bias=True): @@ -87,12 +88,12 @@ class Bilinear(Module): Attributes: weight: the learnable weights of the module of shape - `(out_features x in1_features x in2_features)`. The values are initialized from - :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`(\text{out\_features} x \text{in1\_features} x \text{in2\_features})`. 
+ The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k = \frac{1}{\text{in1\_features}}` - bias: the learnable bias of the module of shape `(out_features)` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})` If :attr:`bias` is ``True``, the values are initialized from - :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where :math:`k = \frac{1}{\text{in1\_features}}` Examples:: @@ -102,6 +103,7 @@ class Bilinear(Module): >>> input2 = torch.randn(128, 30) >>> output = m(input1, input2) >>> print(output.size()) + torch.Size([128, 40]) """ def __init__(self, in1_features, in2_features, out_features, bias=True): diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index a85c220e66f521..06be26b8cdc146 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -42,7 +42,7 @@ def forward(self, x): return F.relu(self.conv2(x)) Submodules assigned in this way will be registered, and will have their - parameters converted too when you call `.cuda()`, etc. + parameters converted too when you call :meth:`to`, etc. """ dump_patches = False diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index 7ca95d21230cad..9243f14791b659 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -30,8 +30,8 @@ class MaxPool1d(_MaxPoolNd): and output :math:`(N, C, L_{out})` can be precisely described as: .. math:: - out(N_i, C_j, k) = \max_{m=0, \ldots, kernel\_size-1} - input(N_i, C_j, stride * k + m) + out(N_i, C_j, k) = \max_{m=0, \ldots, \text{kernel\_size} - 1} + input(N_i, C_j, stride \times k + m) If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. @@ -43,16 +43,16 @@ class MaxPool1d(_MaxPoolNd): padding: implicit zero padding to be added on both sides dilation: a parameter that controls the stride of elements in the window return_indices: if ``True``, will return the max indices along with the outputs. - Useful when Unpooling later + Useful for :class:`torch.nn.MaxUnpool1d` later ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where + - Output: :math:`(N, C, L_{out})`, where .. math:: - L_{out} = \left\lfloor \frac{L_{in} + 2 * \text{padding} - \text{dilation} - * (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor + L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor Examples:: @@ -84,9 +84,11 @@ class MaxPool2d(_MaxPoolNd): can be precisely described as: .. math:: - - out(N_i, C_j, h, w) = \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} - \text{input}(N_i, C_j, \text{stride[0]} * h + m, \text{stride[1]} * w + n) + \begin{aligned} + out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, + \text{stride[1]} \times w + n) + \end{aligned} If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. 
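(Illustrative check, not part of the diff: the max-pooling output-size formula reformatted in this file, evaluated for a concrete input.)

```
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 10, 10)
out = F.max_pool2d(x, kernel_size=3, stride=2, padding=1, dilation=1)
# H_out = floor((10 + 2*1 - 1*(3 - 1) - 1) / 2) + 1 = 5
print(out.shape)  # torch.Size([1, 3, 5, 5])
```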
@@ -104,20 +106,20 @@ class MaxPool2d(_MaxPoolNd): padding: implicit zero padding to be added on both sides dilation: a parameter that controls the stride of elements in the window return_indices: if ``True``, will return the max indices along with the outputs. - Useful when Unpooling later + Useful for :class:`torch.nn.MaxUnpool2d` later ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding[0]} - \text{dilation[0]} - * (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor + \times (\text{kernel\_size[0]} - 1) - 1}{\text{stride[0]}} + 1\right\rfloor .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding[1]} - \text{dilation[1]} - * (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor + \times (\text{kernel\_size[1]} - 1) - 1}{\text{stride[1]}} + 1\right\rfloor Examples:: @@ -147,14 +149,11 @@ class MaxPool3d(_MaxPoolNd): can be precisely described as: .. math:: - out(N_i, C_j, d, h, w) = - \begin{gathered} - \max_{k=0, \ldots, kD-1} - \max_{m=0, \ldots, kH-1} - \max_{n=0, \ldots, kW-1} \\ - \text{input}(N_i, C_j, \text{stride[0]} * - k + d, \text{stride[1]} * h + m, \text{stride[2]} * w + n) - \end{gathered} + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \max_{k=0, \ldots, kD-1} \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1} \\ + & \text{input}(N_i, C_j, \text{stride[0]} \times k + d, + \text{stride[1]} \times h + m, \text{stride[2]} \times w + n) + \end{aligned} If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. :attr:`dilation` controls the spacing between the kernel points. @@ -172,23 +171,23 @@ class MaxPool3d(_MaxPoolNd): padding: implicit zero padding to be added on all three sides dilation: a parameter that controls the stride of elements in the window return_indices: if ``True``, will return the max indices along with the outputs. - Useful when Unpooling later + Useful for :class:`torch.nn.MaxUnpool3d` later ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where .. math:: - D_{out} = \left\lfloor\frac{D_{in} + 2 * \text{padding}[0] - \text{dilation}[0] * + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[1] - \text{dilation}[1] * + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[2] - \text{dilation}[2] * + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{dilation}[2] \times (\text{kernel\_size}[2] - 1) - 1}{\text{stride}[2]} + 1\right\rfloor Examples:: @@ -202,7 +201,7 @@ class MaxPool3d(_MaxPoolNd): .. 
_link: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ + """ # noqa: E501 def forward(self, input): return F.max_pool3d(input, self.kernel_size, self.stride, @@ -227,29 +226,29 @@ class MaxUnpool1d(_MaxUnpoolNd): including the indices of the maximal values and computes a partial inverse in which all non-maximal values are set to zero. - .. note:: `MaxPool1d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. + .. note:: :class:`MaxPool1d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. + as an additional argument :attr:`output_size` in the forward call. See the Inputs and Example below. Args: kernel_size (int or tuple): Size of the max pooling window. stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. + It is set to :attr:`kernel_size` by default. padding (int or tuple): Padding that was added to the input Inputs: - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool1d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size + - `indices`: the indices given out by :class:`~torch.nn.MaxPool1d` + - `output_size` (optional): the targeted output size Shape: - Input: :math:`(N, C, H_{in})` - - Output: :math:`(N, C, H_{out})` where + - Output: :math:`(N, C, H_{out})`, where .. math:: - H_{out} = (H_{in} - 1) * \text{stride}[0] - 2 * \text{padding}[0] + \text{kernel\_size}[0] + H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] or as given by :attr:`output_size` in the call operator @@ -292,32 +291,32 @@ class MaxUnpool2d(_MaxUnpoolNd): including the indices of the maximal values and computes a partial inverse in which all non-maximal values are set to zero. - .. note:: `MaxPool2d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. + .. note:: :class:`MaxPool2d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. + as an additional argument :attr:`output_size` in the forward call. See the Inputs and Example below. Args: kernel_size (int or tuple): Size of the max pooling window. stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. + It is set to :attr:`kernel_size` by default. padding (int or tuple): Padding that was added to the input Inputs: - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool2d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size + - `indices`: the indices given out by :class:`~torch.nn.MaxPool2d` + - `output_size` (optional): the targeted output size Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: - H_{out} = (H_{in} - 1) * \text{stride[0]} - 2 * \text{padding[0]} + \text{kernel\_size[0]} + H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} .. 
math:: - W_{out} = (W_{in} - 1) * \text{stride[1]} - 2 * \text{padding[1]} + \text{kernel\_size[1]} + W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} or as given by :attr:`output_size` in the call operator @@ -364,35 +363,35 @@ class MaxUnpool3d(_MaxUnpoolNd): including the indices of the maximal values and computes a partial inverse in which all non-maximal values are set to zero. - .. note:: `MaxPool3d` can map several input sizes to the same output sizes. - Hence, the inversion process can get ambiguous. + .. note:: :class:`MaxPool3d` can map several input sizes to the same output + sizes. Hence, the inversion process can get ambiguous. To accommodate this, you can provide the needed output size - as an additional argument `output_size` in the forward call. + as an additional argument :attr:`output_size` in the forward call. See the Inputs section below. Args: kernel_size (int or tuple): Size of the max pooling window. stride (int or tuple): Stride of the max pooling window. - It is set to ``kernel_size`` by default. + It is set to :attr:`kernel_size` by default. padding (int or tuple): Padding that was added to the input Inputs: - `input`: the input Tensor to invert - - `indices`: the indices given out by `MaxPool3d` - - `output_size` (optional) : a `torch.Size` that specifies the targeted output size + - `indices`: the indices given out by :class:`~torch.nn.MaxPool3d` + - `output_size` (optional): the targeted output size Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where .. math:: - D_{out} = (D_{in} - 1) * \text{stride[0]} - 2 * \text{padding[0]} + \text{kernel\_size[0]} + D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} .. math:: - H_{out} = (H_{in} - 1) * \text{stride[1]} - 2 * \text{padding[1]} + \text{kernel\_size[1]} + H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} .. math:: - W_{out} = (W_{in} - 1) * \text{stride[2]} - 2 * \text{padding[2]} + \text{kernel\_size[2]} + W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]} or as given by :attr:`output_size` in the call operator @@ -436,8 +435,8 @@ class AvgPool1d(_AvgPoolNd): .. math:: - \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k} - \text{input}(N_i, C_j, \text{stride} * l + m) + \text{out}(N_i, C_j, l) = \frac{1}{k} \sum_{m=0}^{k} + \text{input}(N_i, C_j, \text{stride} \times l + m) If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. @@ -454,11 +453,11 @@ class AvgPool1d(_AvgPoolNd): Shape: - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where + - Output: :math:`(N, C, L_{out})`, where .. math:: L_{out} = \left\lfloor \frac{L_{in} + - 2 * \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + 2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor Examples:: @@ -494,7 +493,7 @@ class AvgPool2d(_AvgPoolNd): .. math:: out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} - input(N_i, C_j, stride[0] * h + m, stride[1] * w + n) + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides for :attr:`padding` number of points. 
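(Illustrative round trip, not part of the diff: the MaxUnpool output-size formulas reformatted above invert the corresponding MaxPool sizes when the pooling indices are passed along.)

```
import torch
from torch import nn

pool = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
unpool = nn.MaxUnpool2d(kernel_size=2, stride=2)

x = torch.arange(16.).reshape(1, 1, 4, 4)
out, indices = pool(x)      # out: torch.Size([1, 1, 2, 2])
y = unpool(out, indices)    # H_out = (2 - 1) * 2 - 2 * 0 + 2 = 4
print(y.shape)              # torch.Size([1, 1, 4, 4])
```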
@@ -514,14 +513,14 @@ class AvgPool2d(_AvgPoolNd): Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[0] - + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[1] - + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor Examples:: @@ -557,11 +556,12 @@ class AvgPool3d(_AvgPoolNd): can be precisely described as: .. math:: - - \text{out}(N_i, C_j, d, h, w) = \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} - \frac{\text{input}(N_i, C_j, \text{stride}[0] * d + k, \text{stride}[1] * h + m, - \text{stride}[2] * w + n)} - {kD * kH * kW} + \begin{aligned} + \text{out}(N_i, C_j, d, h, w) ={} & \sum_{k=0}^{kD-1} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} \\ + & \frac{\text{input}(N_i, C_j, \text{stride}[0] \times d + k, + \text{stride}[1] \times h + m, \text{stride}[2] \times w + n)} + {kD \times kH \times kW} + \end{aligned} If :attr:`padding` is non-zero, then the input is implicitly zero-padded on all three sides for :attr:`padding` number of points. @@ -581,18 +581,18 @@ class AvgPool3d(_AvgPoolNd): Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where .. math:: - D_{out} = \left\lfloor\frac{D_{in} + 2 * \text{padding}[0] - + D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[1] - + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[1] - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor .. math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[2] - + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[2] - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor Examples:: @@ -718,11 +718,11 @@ class LPPool1d(_LPPoolNd): Shape: - Input: :math:`(N, C, L_{in})` - - Output: :math:`(N, C, L_{out})` where + - Output: :math:`(N, C, L_{out})`, where .. math:: L_{out} = \left\lfloor\frac{L_{in} + - 2 * \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + 2 \times \text{padding} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor Examples:: >>> # power-2 pool of window of length 3, with stride 2. @@ -746,7 +746,7 @@ class LPPool2d(_LPPoolNd): f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} - At p = :math:`\infty`, one gets Max Pooling - - At p = 1, one gets Sum Pooling (which is proportional to Average Pooling) + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) The parameters :attr:`kernel_size`, :attr:`stride` can either be: @@ -764,14 +764,14 @@ class LPPool2d(_LPPoolNd): Shape: - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Output: :math:`(N, C, H_{out}, W_{out})`, where .. math:: - H_{out} = \left\lfloor\frac{H_{in} + 2 * \text{padding}[0] - \text{dilation}[0] * + H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] \times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor .. 
math:: - W_{out} = \left\lfloor\frac{W_{in} + 2 * \text{padding}[1] - \text{dilation}[1] * + W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{dilation}[1] \times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor Examples:: @@ -800,6 +800,9 @@ def __init__(self, output_size, return_indices=False): def extra_repr(self): return 'output_size={}'.format(self.output_size) +# FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and +# output shapes are, and how the operation computes output. + class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd): r"""Applies a 1D adaptive max pooling over an input signal composed of several input planes. diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py index 3fe381a59b5108..94b60bda836d12 100644 --- a/torch/nn/parallel/data_parallel.py +++ b/torch/nn/parallel/data_parallel.py @@ -36,9 +36,10 @@ class DataParallel(Module): This container parallelizes the application of the given module by splitting the input across the specified devices by chunking in the batch - dimension. In the forward pass, the module is replicated on each device, - and each replica handles a portion of the input. During the backwards - pass, gradients from each replica are summed into the original module. + dimension (other objects will be copied once per device). In the forward + pass, the module is replicated on each device, and each replica handles a + portion of the input. During the backwards pass, gradients from each replica + are summed into the original module. The batch size should be larger than the number of GPUs used. diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py index b61fdabd3e828d..c5ad556e510a89 100644 --- a/torch/nn/utils/rnn.py +++ b/torch/nn/utils/rnn.py @@ -215,10 +215,9 @@ def pad_sequence(sequences, batch_first=False, padding_value=0): torch.Size([25, 3, 300]) Note: - This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` where `T` is the - length of the longest sequence. - Function assumes trailing dimensions and type of all the Tensors - in sequences are same. + This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` + where `T` is the length of the longest sequence. This function assumes + trailing dimensions and type of all the Tensors in sequences are same. Arguments: sequences (list[Tensor]): list of variable length sequences. @@ -227,7 +226,7 @@ def pad_sequence(sequences, batch_first=False, padding_value=0): padding_value (float, optional): value for padded elements. Default: 0. Returns: - Tensor of size ``T x B x *`` if batch_first is False + Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. Tensor of size ``B x T x *`` otherwise """ From 31850163acaa8fff1fbb1d1d48d36fdfb97acae8 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 11 Sep 2018 08:51:02 -0700 Subject: [PATCH 032/237] Remove separate ATen build target (#11488) Summary: ATen has had a separate build target in the past, but with our move to a root-level CMakeLists.txt file this makes less sense and is harder to maintain. Also, as we blend code between Caffe2 and ATen this will become even less maintainable. Talked to ezyang about this, but also cc zdevito, Yangqing, and soumith. If this is too difficult, I will revert, but want to see if we can simplify for now. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11488 Differential Revision: D9770266 Pulled By: orionr fbshipit-source-id: c7ba52a1676d84e2d052dad4c042b666f49451cd --- .jenkins/pytorch/build.sh | 6 ---- aten/CMakeLists.txt | 68 +++++++----------------------------- aten/src/ATen/CMakeLists.txt | 67 ----------------------------------- tools/test_aten_install.sh | 11 ------ 4 files changed, 13 insertions(+), 139 deletions(-) delete mode 100755 tools/test_aten_install.sh diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f108c37cdd6ed1..d9e58d7e8b1604 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -102,12 +102,6 @@ fi # Add the test binaries so that they won't be git clean'ed away git add -f build/bin -# Testing ATen install -if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then - echo "Testing ATen install" - time tools/test_aten_install.sh -fi - # Test C FFI plugins # cffi install doesn't work for Python 3.7 if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index ee025265a982e7..0f0019d57b11e9 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -1,19 +1,5 @@ -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(ATen CXX C) - include(CMakeDependentOption) - option(USE_CUDA "Use CUDA" ON) - option(USE_ROCM "Use ROCm" OFF) - option(USE_CUDNN "Use cuDNN" ON) - option(USE_MKLDNN "Use MKLDNN" ON) - cmake_dependent_option( - USE_CUDNN "Use cuDNN" ON - "USE_CUDA" OFF) - option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) -else() - if (BUILD_ATEN_MOBILE) - return() - endif() +if (BUILD_ATEN_MOBILE) + return() endif() # Find modules @@ -42,32 +28,6 @@ SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory") SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory") -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Build variables set within the cmake tree - include(../cmake/BuildVariables.cmake) - set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") - - # ---[ Misc checks to cope with various compiler modes - include(../cmake/MiscCheck.cmake) - - # External projects - include(ExternalProject) - - # ---[ Utils - # TODO: merge the following 3 files into cmake/public/utils.cmake. 
- include(../cmake/Utils.cmake) - include(../cmake/public/utils.cmake) - - # ---[ Dependencies - include(../cmake/Dependencies.cmake) - list(APPEND ATen_CPU_INCLUDE ${Caffe2_CPU_INCLUDE}) - list(APPEND ATen_CUDA_INCLUDE ${Caffe2_GPU_INCLUDE}) - list(APPEND ATen_CPU_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS}) - list(APPEND ATen_PUBLIC_CUDA_DEPENDENCY_LIBS - ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) -endif() - if(USE_CUDA) list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS}) endif() @@ -132,16 +92,14 @@ list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) add_subdirectory(src/ATen) -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Pass source, includes, and libs to parent - set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) - set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) - set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) - set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) - set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) - set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) - set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) - set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) -endif() +# Pass source, includes, and libs to parent +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) +set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) +set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index d65211e2269d98..994756fa18c995 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,11 +1,6 @@ cmake_minimum_required(VERSION 3.0 FATAL_ERROR) SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Generate and install header and cpp files - include(../../../cmake/Codegen.cmake) -endif() - IF(NOT MSVC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers") @@ -352,34 +347,6 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") endif() endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Eventually replace this use of LOCATION with use of - # $, but generators only work in some cases - cmake_policy(SET CMP0026 OLD) - get_target_property(ATEN_CPU_OUTPUT_NAME ATen_cpu LOCATION) - get_filename_component(ATEN_CPU_OUTPUT_NAME ${ATEN_CPU_OUTPUT_NAME} NAME) - set(ATEN_LIBRARIES - "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CPU_OUTPUT_NAME}") - if(USE_CUDA OR USE_ROCM) - get_target_property(ATEN_CUDA_OUTPUT_NAME ATen_cuda LOCATION) - get_filename_component(ATEN_CUDA_OUTPUT_NAME ${ATEN_CUDA_OUTPUT_NAME} NAME) - list(APPEND ATEN_LIBRARIES - "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CUDA_OUTPUT_NAME}") - endif() - - install(TARGETS ATen_cpu - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION 
"${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - - if(USE_CUDA OR USE_ROCM) - install(TARGETS ATen_cuda - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - endif() -endif() - SET(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") CONFIGURE_FILE(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" @@ -404,40 +371,6 @@ else() add_subdirectory(test) endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if (BUILD_TEST) - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} ATen_cpu) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - endif() - endif() - - # Make sure these don't get built by parent - set(ATen_CPU_TEST_SRCS) - set(ATen_CUDA_TEST_SRCS) -endif() - # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) diff --git a/tools/test_aten_install.sh b/tools/test_aten_install.sh deleted file mode 100755 index d2d57235505778..00000000000000 --- a/tools/test_aten_install.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh -set -xe -rm -rf aten_build -rm -rf aten_install -mkdir aten_build aten_install -cd aten_build -cmake ../aten -DUSE_CUDA=OFF -DCMAKE_INSTALL_PREFIX=../aten_install -NUM_JOBS="$(getconf _NPROCESSORS_ONLN)" -make -j"$NUM_JOBS" install -cd .. -aten/tools/test_install.sh $(pwd)/aten_install $(pwd)/aten From 4e8d9a4a5838e99082c6866032a39efd78d341ac Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 11 Sep 2018 08:51:19 -0700 Subject: [PATCH 033/237] Introducing python setup.py rebuild develop (#11487) Summary: This speeds up incremental builds by doing the following changes: - Uses `rsync` instead of `cp` (when `rsync` is found) which is a bit smarter in doing "maybe copy" - Introduces a `rebuild` mode which does not rerun `cmake` in `build_pytorch_libs.sh`. *Note: `rebuild` should only be used if you dont add / remove files to the build, as `cmake` is not rerun* Current no-op rebuild speedup: - 1m 15s -> 20s There are some lingering bugs. No-op rebuilds rerun `cmake` for two rebuilds (likely that cmake logic is dependent on the install folder, hence kicking off rebuild). 
So what you see ``` python setup.py rebuild develop # first time - ~5 mins python setup.py rebuild develop # second time - ~3 mins python setup.py rebuild develop # third time - ~2 mins python setup.py rebuild develop # fourth time - ~20 seconds python setup.py rebuild develop # fifth time - ~20 seconds ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11487 Differential Revision: D9769087 Pulled By: soumith fbshipit-source-id: 20fbecde33af6426149c13767e8734fb3be783c5 --- setup.py | 38 ++++++- tools/build_pytorch_libs.sh | 206 ++++++++++++++++++++---------------- 2 files changed, 148 insertions(+), 96 deletions(-) diff --git a/setup.py b/setup.py index 7edd89cbd7fcb6..10ef480c73ceeb 100644 --- a/setup.py +++ b/setup.py @@ -187,6 +187,7 @@ def hotpatch_var(var, prefix='USE_'): BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK') +RERUN_CMAKE = True NUM_JOBS = multiprocessing.cpu_count() max_jobs = os.getenv("MAX_JOBS") @@ -382,6 +383,8 @@ def build_libs(libs): build_libs_cmd += ['--use-mkldnn'] if USE_GLOO_IBVERBS: build_libs_cmd += ['--use-gloo-ibverbs'] + if not RERUN_CMAKE: + build_libs_cmd += ['--dont-rerun-cmake'] my_env["BUILD_TORCH"] = "ON" my_env["BUILD_PYTHON"] = "ON" @@ -464,6 +467,7 @@ def check_file(f): build_dep_cmds = {} +rebuild_dep_cmds = {} for lib in dep_libs: # wrap in function to capture lib @@ -475,6 +479,16 @@ def run(self): build_dep.lib = lib build_dep_cmds['build_' + lib.lower()] = build_dep + class rebuild_dep(build_deps): + description = 'Rebuild {} external library'.format(lib) + + def run(self): + global RERUN_CMAKE + RERUN_CMAKE = False + build_libs([self.lib]) + rebuild_dep.lib = lib + rebuild_dep_cmds['rebuild_' + lib.lower()] = rebuild_dep + class build_module(PytorchCommand): def run(self): @@ -514,8 +528,15 @@ def load(filename): all_commands = [entry for f in ninja_files + cmake_files for entry in load(f)] - with open('compile_commands.json', 'w') as f: - json.dump(all_commands, f, indent=2) + + new_contents = json.dumps(all_commands, indent=2) + contents = '' + if os.path.exists('compile_commands.json'): + with open('compile_commands.json', 'r') as f: + contents = f.read() + if contents != new_contents: + with open('compile_commands.json', 'w') as f: + f.write(new_contents) if not USE_NINJA: print("WARNING: 'develop' is not building C++ code incrementally") print("because ninja is not installed. 
Run this to enable it:") @@ -676,6 +697,17 @@ class build(distutils.command.build.build): ] + distutils.command.build.build.sub_commands +class rebuild(distutils.command.build.build): + sub_commands = [ + ('build_deps', lambda self: True), + ] + distutils.command.build.build.sub_commands + + def run(self): + global RERUN_CMAKE + RERUN_CMAKE = False + distutils.command.build.build.run(self) + + class install(setuptools.command.install.install): def run(self): @@ -1116,11 +1148,13 @@ def make_relative_rpath(path): 'build_deps': build_deps, 'build_module': build_module, 'rebuild_libtorch': rebuild_libtorch, + 'rebuild': rebuild, 'develop': develop, 'install': install, 'clean': clean, } cmdclass.update(build_dep_cmds) +cmdclass.update(rebuild_dep_cmds) entry_points = { 'console_scripts': [ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index d09dc756d82d8e..3134bcafa6f968 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -10,6 +10,11 @@ set -ex +SYNC_COMMAND="cp" +if [ -x "$(command -v rsync)" ]; then + SYNC_COMMAND="time rsync -lptgoD" +fi + # Options for building only a subset of the libraries USE_CUDA=0 USE_ROCM=0 @@ -17,8 +22,12 @@ USE_NNPACK=0 USE_MKLDNN=0 USE_GLOO_IBVERBS=0 CAFFE2_STATIC_LINK_CUDA=0 +RERUN_CMAKE=1 while [[ $# -gt 0 ]]; do case "$1" in + --dont-rerun-cmake) + RERUN_CMAKE=0 + ;; --use-cuda) USE_CUDA=1 ;; @@ -147,46 +156,49 @@ function build() { THCS | THCUNN ) BUILD_C_FLAGS=$C_FLAGS;; *) BUILD_C_FLAGS=$C_FLAGS" -fexceptions";; esac - # TODO: The *_LIBRARIES cmake variables should eventually be - # deprecated because we are using .cmake files to handle finding - # installed libraries instead - ${CMAKE_VERSION} ../../$1 -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ - ${CMAKE_GENERATOR} \ - -DTorch_FOUND="1" \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DCMAKE_C_FLAGS="$BUILD_C_FLAGS $USER_CFLAGS" \ - -DCMAKE_CXX_FLAGS="$BUILD_C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ - -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ - -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ - -DCMAKE_INSTALL_LIBDIR="$INSTALL_DIR/lib" \ - -DCUDA_NVCC_FLAGS="$CUDA_NVCC_FLAGS" \ - -DCUDA_DEVICE_DEBUG=$CUDA_DEVICE_DEBUG \ - -DCMAKE_PREFIX_PATH="$INSTALL_DIR" \ - -Dcwrap_files="$CWRAP_FILES" \ - -DTH_INCLUDE_PATH="$INSTALL_DIR/include" \ - -DTH_LIB_PATH="$INSTALL_DIR/lib" \ - -DTH_LIBRARIES="$INSTALL_DIR/lib/libTH$LD_POSTFIX" \ - -DCAFFE2_LIBRARIES="$INSTALL_DIR/lib/libcaffe2$LD_POSTFIX" \ - -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ - -DTHNN_LIBRARIES="$INSTALL_DIR/lib/libTHNN$LD_POSTFIX" \ - -DTHCUNN_LIBRARIES="$INSTALL_DIR/lib/libTHCUNN$LD_POSTFIX" \ - -DTHS_LIBRARIES="$INSTALL_DIR/lib/libTHS$LD_POSTFIX" \ - -DTHC_LIBRARIES="$INSTALL_DIR/lib/libTHC$LD_POSTFIX" \ - -DTHCS_LIBRARIES="$INSTALL_DIR/lib/libTHCS$LD_POSTFIX" \ - -DTH_SO_VERSION=1 \ - -DTHC_SO_VERSION=1 \ - -DTHNN_SO_VERSION=1 \ - -DTHCUNN_SO_VERSION=1 \ - -DTHD_SO_VERSION=1 \ - -DUSE_CUDA=$USE_CUDA \ - -DBUILD_EXAMPLES=OFF \ - -DBUILD_TEST=$BUILD_TEST \ - -DNO_NNPACK=$((1-$USE_NNPACK)) \ - -DNCCL_EXTERNAL=1 \ - -DCMAKE_DEBUG_POSTFIX="" \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - ${@:2} \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ${CMAKE_ARGS[@]} + if [[ $RERUN_CMAKE -eq 1 ]] || [ ! 
-f CMakeCache.txt ]; then + # TODO: The *_LIBRARIES cmake variables should eventually be + # deprecated because we are using .cmake files to handle finding + # installed libraries instead + ${CMAKE_VERSION} ../../$1 -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ + ${CMAKE_GENERATOR} \ + -DCMAKE_INSTALL_MESSAGE="LAZY" \ + -DTorch_FOUND="1" \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DCMAKE_C_FLAGS="$BUILD_C_FLAGS $USER_CFLAGS" \ + -DCMAKE_CXX_FLAGS="$BUILD_C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ + -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ + -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ + -DCMAKE_INSTALL_LIBDIR="$INSTALL_DIR/lib" \ + -DCUDA_NVCC_FLAGS="$CUDA_NVCC_FLAGS" \ + -DCUDA_DEVICE_DEBUG=$CUDA_DEVICE_DEBUG \ + -DCMAKE_PREFIX_PATH="$INSTALL_DIR" \ + -Dcwrap_files="$CWRAP_FILES" \ + -DTH_INCLUDE_PATH="$INSTALL_DIR/include" \ + -DTH_LIB_PATH="$INSTALL_DIR/lib" \ + -DTH_LIBRARIES="$INSTALL_DIR/lib/libTH$LD_POSTFIX" \ + -DCAFFE2_LIBRARIES="$INSTALL_DIR/lib/libcaffe2$LD_POSTFIX" \ + -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ + -DTHNN_LIBRARIES="$INSTALL_DIR/lib/libTHNN$LD_POSTFIX" \ + -DTHCUNN_LIBRARIES="$INSTALL_DIR/lib/libTHCUNN$LD_POSTFIX" \ + -DTHS_LIBRARIES="$INSTALL_DIR/lib/libTHS$LD_POSTFIX" \ + -DTHC_LIBRARIES="$INSTALL_DIR/lib/libTHC$LD_POSTFIX" \ + -DTHCS_LIBRARIES="$INSTALL_DIR/lib/libTHCS$LD_POSTFIX" \ + -DTH_SO_VERSION=1 \ + -DTHC_SO_VERSION=1 \ + -DTHNN_SO_VERSION=1 \ + -DTHCUNN_SO_VERSION=1 \ + -DTHD_SO_VERSION=1 \ + -DUSE_CUDA=$USE_CUDA \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_TEST=$BUILD_TEST \ + -DNO_NNPACK=$((1-$USE_NNPACK)) \ + -DNCCL_EXTERNAL=1 \ + -DCMAKE_DEBUG_POSTFIX="" \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + ${@:2} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ${CMAKE_ARGS[@]} + fi ${CMAKE_INSTALL} -j"$MAX_JOBS" popd @@ -211,18 +223,21 @@ function path_remove { function build_nccl() { mkdir -p build/nccl pushd build/nccl - ${CMAKE_VERSION} ../../nccl -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ - ${CMAKE_GENERATOR} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DCMAKE_C_FLAGS="$C_FLAGS $USER_CFLAGS" \ - -DCMAKE_CXX_FLAGS="$C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ - -DCMAKE_SHARED_LINKER_FLAGS="$USER_LDFLAGS" \ - -DCMAKE_UTILS_PATH="$BASE_DIR/cmake/public/utils.cmake" \ - -DNUM_JOBS="$MAX_JOBS" + if [[ $RERUN_CMAKE -eq 1 ]] || [ ! -f CMakeCache.txt ]; then + ${CMAKE_VERSION} ../../nccl -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/Modules_CUDA_fix" \ + ${CMAKE_GENERATOR} \ + -DCMAKE_INSTALL_MESSAGE="LAZY" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DCMAKE_C_FLAGS="$C_FLAGS $USER_CFLAGS" \ + -DCMAKE_CXX_FLAGS="$C_FLAGS $CPP_FLAGS $USER_CFLAGS" \ + -DCMAKE_SHARED_LINKER_FLAGS="$USER_LDFLAGS" \ + -DCMAKE_UTILS_PATH="$BASE_DIR/cmake/public/utils.cmake" \ + -DNUM_JOBS="$MAX_JOBS" + fi ${CMAKE_INSTALL} -j"$MAX_JOBS" mkdir -p ${INSTALL_DIR}/lib - cp "lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so.1" + $SYNC_COMMAND "lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so.1" if [ ! 
-f "${INSTALL_DIR}/lib/libnccl.so" ]; then ln -s "${INSTALL_DIR}/lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so" fi @@ -252,50 +267,53 @@ function build_caffe2() { EXTRA_CAFFE2_CMAKE_FLAGS+=("-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH") fi - ${CMAKE_VERSION} $BASE_DIR \ - ${CMAKE_GENERATOR} \ - -DPYTHON_EXECUTABLE=$PYTORCH_PYTHON \ - -DBUILDING_WITH_TORCH_LIBS=ON \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DBUILD_TORCH=$BUILD_TORCH \ - -DBUILD_PYTHON=$BUILD_PYTHON \ - -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ - -DBUILD_BINARY=$BUILD_BINARY \ - -DBUILD_TEST=$BUILD_TEST \ - -DINSTALL_TEST=$INSTALL_TEST \ - -DBUILD_CAFFE2_OPS=$BUILD_CAFFE2_OPS \ - -DONNX_NAMESPACE=$ONNX_NAMESPACE \ - -DUSE_CUDA=$USE_CUDA \ - -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ - -DUSE_ROCM=$USE_ROCM \ - -DUSE_NNPACK=$USE_NNPACK \ - -DUSE_LEVELDB=$USE_LEVELDB \ - -DUSE_LMDB=$USE_LMDB \ - -DUSE_OPENCV=$USE_OPENCV \ - -DUSE_GLOG=OFF \ - -DUSE_GFLAGS=OFF \ - -DUSE_SYSTEM_EIGEN_INSTALL=OFF \ - -DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \ - -DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \ - -DCUDNN_LIBRARY=$CUDNN_LIBRARY \ - -DUSE_MKLDNN=$USE_MKLDNN \ - -DMKLDNN_INCLUDE_DIR=$MKLDNN_INCLUDE_DIR \ - -DMKLDNN_LIB_DIR=$MKLDNN_LIB_DIR \ - -DMKLDNN_LIBRARY=$MKLDNN_LIBRARY \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ - -DCMAKE_C_FLAGS="$USER_CFLAGS" \ - -DCMAKE_CXX_FLAGS="$USER_CFLAGS" \ - -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ - -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" ${EXTRA_CAFFE2_CMAKE_FLAGS[@]} + if [[ $RERUN_CMAKE -eq 1 ]] || [ ! -f CMakeCache.txt ]; then + ${CMAKE_VERSION} $BASE_DIR \ + ${CMAKE_GENERATOR} \ + -DCMAKE_INSTALL_MESSAGE="LAZY" \ + -DPYTHON_EXECUTABLE=$PYTORCH_PYTHON \ + -DBUILDING_WITH_TORCH_LIBS=ON \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DBUILD_TORCH=$BUILD_TORCH \ + -DBUILD_PYTHON=$BUILD_PYTHON \ + -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ + -DBUILD_BINARY=$BUILD_BINARY \ + -DBUILD_TEST=$BUILD_TEST \ + -DINSTALL_TEST=$INSTALL_TEST \ + -DBUILD_CAFFE2_OPS=$BUILD_CAFFE2_OPS \ + -DONNX_NAMESPACE=$ONNX_NAMESPACE \ + -DUSE_CUDA=$USE_CUDA \ + -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ + -DUSE_ROCM=$USE_ROCM \ + -DUSE_NNPACK=$USE_NNPACK \ + -DUSE_LEVELDB=$USE_LEVELDB \ + -DUSE_LMDB=$USE_LMDB \ + -DUSE_OPENCV=$USE_OPENCV \ + -DUSE_GLOG=OFF \ + -DUSE_GFLAGS=OFF \ + -DUSE_SYSTEM_EIGEN_INSTALL=OFF \ + -DCUDNN_INCLUDE_DIR=$CUDNN_INCLUDE_DIR \ + -DCUDNN_LIB_DIR=$CUDNN_LIB_DIR \ + -DCUDNN_LIBRARY=$CUDNN_LIBRARY \ + -DUSE_MKLDNN=$USE_MKLDNN \ + -DMKLDNN_INCLUDE_DIR=$MKLDNN_INCLUDE_DIR \ + -DMKLDNN_LIB_DIR=$MKLDNN_LIB_DIR \ + -DMKLDNN_LIBRARY=$MKLDNN_LIBRARY \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ + -DCMAKE_C_FLAGS="$USER_CFLAGS" \ + -DCMAKE_CXX_FLAGS="$USER_CFLAGS" \ + -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" \ + -DCMAKE_SHARED_LINKER_FLAGS="$LDFLAGS $USER_LDFLAGS" ${EXTRA_CAFFE2_CMAKE_FLAGS[@]} # STOP!!! Are you trying to add a C or CXX flag? Add it # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. + fi # This is needed by the aten tests built with caffe2 if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! 
-f "lib/libnccl.so.1" ]; then - # cp root/torch/lib/tmp_install/libnccl root/build/lib/libnccl - cp "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl + $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" fi ${CMAKE_INSTALL} -j"$MAX_JOBS" @@ -352,15 +370,15 @@ pushd $TORCH_LIB_DIR # binaries to torch/lib rm -rf "$INSTALL_DIR/lib/cmake" rm -rf "$INSTALL_DIR/lib/python" -cp -r "$INSTALL_DIR/lib"/* . +$SYNC_COMMAND -r "$INSTALL_DIR/lib"/* . if [ -d "$INSTALL_DIR/lib64/" ]; then - cp -r "$INSTALL_DIR/lib64"/* . + $SYNC_COMMAND -r "$INSTALL_DIR/lib64"/* . fi -cp ../../aten/src/THNN/generic/THNN.h . -cp ../../aten/src/THCUNN/generic/THCUNN.h . -cp -r "$INSTALL_DIR/include" . +$SYNC_COMMAND ../../aten/src/THNN/generic/THNN.h . +$SYNC_COMMAND ../../aten/src/THCUNN/generic/THCUNN.h . +$SYNC_COMMAND -r "$INSTALL_DIR/include" . if [ -d "$INSTALL_DIR/bin/" ]; then - cp -r "$INSTALL_DIR/bin/"/* . + $SYNC_COMMAND -r "$INSTALL_DIR/bin/"/* . fi popd From deac304b6b4e515dd606738c01ec441e9baa1987 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 11 Sep 2018 09:26:27 -0700 Subject: [PATCH 034/237] Bugfix for basic slicing Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11428 Differential Revision: D9753999 Pulled By: jamesr66a fbshipit-source-id: cfc4163a5a06b41beb808a4e24650d71f5d91f4f --- test/test_jit.py | 13 +++++++++++++ torch/csrc/jit/script/compiler.cpp | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index 1b880fe3b0fdec..23b76d815d4f07 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1700,6 +1700,19 @@ def random_foo(x): x = torch.rand(5, 6) self.assertEqual(random_foo(x), random_foo_traced(x)) + def test_trace_slice_expr_complete_type(self): + def random_foo(x): + return x + 1.0 + + random_foo_traced = torch.jit.trace(random_foo, (torch.rand(3, 4),)) + + @torch.jit.script + def random_bar(x): + return random_foo_traced(x)[0:1] + + x = torch.rand(3, 4) + self.assertEqual(random_bar(x), (x + 1)[0:1]) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 8aae072a02a1fb..988ac6e1139fb0 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1727,7 +1727,7 @@ struct to_ir { auto slice_exp = SliceExpr(subscript.subscript_exprs()[0]); auto * sliceable = emitExpr(subscript.value()); at::optional maybe_dim; - if (sliceable->type()->kind() == TypeKind::DynamicType) { + if (sliceable->type()->isSubtypeOf(DynamicType::get())) { // If the sliceable object is a tensor, specify a default dimension maybe_dim = 0; } From d32b41003aa5d0ec10aa72be126ad9c136b50111 Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 11 Sep 2018 10:03:36 -0700 Subject: [PATCH 035/237] Copy protos on install same as develop (#11517) Summary: This is a potential fix for https://github.com/pytorch/pytorch/issues/11453 and https://github.com/pytorch/pytorch/issues/11074 worked through with pjh5 . Turns out we had some protos copy code that was in the .sh file that was removed. Better to have it in setup.py, though, same as for develop. 
cc ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11517 Differential Revision: D9771911 Pulled By: orionr fbshipit-source-id: 76975d8f71f38d951eaaed0b50dd3ec36dd177a9 --- setup.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 10ef480c73ceeb..f99a2bf945c162 100644 --- a/setup.py +++ b/setup.py @@ -408,6 +408,17 @@ def build_libs(libs): sys.exit(1) +# Copy Caffe2's Python proto files (generated during the build with the +# protobuf python compiler) from the build folder to the root folder +# cp root/build/caffe2/proto/proto.py root/caffe2/proto/proto.py +def copy_protos(): + for src in glob.glob( + os.path.join(caffe2_build_dir, 'caffe2', 'proto', '*.py')): + dst = os.path.join( + cwd, os.path.relpath(src, caffe2_build_dir)) + shutil.copyfile(src, dst) + + # Build all dependent libraries class build_deps(PytorchCommand): def run(self): @@ -509,15 +520,7 @@ def run(self): self.run_command('create_version_file') setuptools.command.develop.develop.run(self) self.create_compile_commands() - - # Copy Caffe2's Python proto files (generated during the build with the - # protobuf python compiler) from the build folder to the root folder - # cp root/build/caffe2/proto/proto.py root/caffe2/proto/proto.py - for src in glob.glob( - os.path.join(caffe2_build_dir, 'caffe2', 'proto', '*.py')): - dst = os.path.join( - cwd, os.path.relpath(src, caffe2_build_dir)) - self.copy_file(src, dst) + copy_protos() def create_compile_commands(self): def load(filename): @@ -713,6 +716,7 @@ class install(setuptools.command.install.install): def run(self): if not self.skip_build: self.run_command('build_deps') + copy_protos() setuptools.command.install.install.run(self) From 01c7542f43a0c27215bfadf3cde43797f700982d Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 10:29:54 -0700 Subject: [PATCH 036/237] Use -isystem for system includes in C++ extensions (#11459) Summary: I noticed warnings from within pybind11 being shown when building C++ extensions. This can be avoided by including non-user-supplied headers with `-isystem` instead of `-I` I hope this works on Windows. soumith ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11459 Differential Revision: D9764444 Pulled By: goldsborough fbshipit-source-id: b288572106078f347f0342f158f9e2b63a58c235 --- torch/utils/cpp_extension.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 43edbd2f8bd055..22940aaa5f7ea5 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -817,15 +817,21 @@ def _write_ninja_file(path, # Turn into absolute paths so we can emit them into the ninja build # file wherever it is. sources = [os.path.abspath(file) for file in sources] - includes = [os.path.abspath(file) for file in extra_include_paths] + user_includes = [os.path.abspath(file) for file in extra_include_paths] # include_paths() gives us the location of torch/torch.h - includes += include_paths(with_cuda) + system_includes = include_paths(with_cuda) # sysconfig.get_paths()['include'] gives us the location of Python.h - includes.append(sysconfig.get_paths()['include']) + system_includes.append(sysconfig.get_paths()['include']) + + # Windoze does not understand `-isystem`. 
+ if sys.platform == 'win32': + user_includes += system_includes + system_includes.clear() common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] - common_cflags += ['-I{}'.format(include) for include in includes] + common_cflags += ['-I{}'.format(include) for include in user_includes] + common_cflags += ['-isystem {}'.format(include) for include in system_includes] if is_binary_build(): common_cflags += ['-D_GLIBCXX_USE_CXX11_ABI=0'] From f80f15866b28ca148c797f5998f61bffb6e4b364 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 11 Sep 2018 10:37:39 -0700 Subject: [PATCH 037/237] Get rid of manual dispatch on Type. (#11486) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11486 I discovered these by narrowing the interface on Type, and then fixing call sites outside of core plumbing code which depended on these methods being provided. Reviewed By: cpuhrsch Differential Revision: D9757935 fbshipit-source-id: 3abda0c98919a448a326a757671d438964f6909f --- aten/src/ATen/native/Embedding.cpp | 9 ++++----- aten/src/ATen/native/LinearAlgebra.cpp | 6 +++--- aten/src/ATen/native/TensorShape.cpp | 2 +- aten/src/ATen/native/cudnn/RNN.cpp | 6 +++--- aten/src/ATen/native/sparse/SparseTensorMath.cpp | 4 ++-- torch/csrc/utils/tensor_flatten.cpp | 3 +-- 6 files changed, 14 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 67c0877f9fa072..99fa4c701d4bbf 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -67,18 +67,17 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; auto& dense_type = grad.type(); - auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? 
Backend::SparseCUDA : Backend::SparseCPU); // check if all our grad come from padding_idx if (grad.numel() == 0) { - return sparse_type._sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), - dense_type.tensor({0, num_features}), - weight_size); + return at::_sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), + dense_type.tensor({0, num_features}), + weight_size); } auto index = indices.reshape({1, -1}); auto values = grad.reshape({-1, num_features}); - return sparse_type._sparse_coo_tensor_unsafe(index, values, weight_size); + return at::_sparse_coo_tensor_unsafe(index, values, weight_size); } Tensor embedding_dense_backward_cpu( diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2371d82efc6cfb..d596a7cb99a265 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -169,14 +169,14 @@ Tensor mm(const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { return mat2.type().addmm(at::zeros({}, mat2.type()), self, mat2, 0, 1); } - return self.type()._mm(self, mat2); + return at::_mm(self, mat2); } Tensor& mm_out(Tensor& result, const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { - return mat2.type().addmm_out(result, at::zeros({}, mat2.type()), self, mat2, 0, 1); + return at::addmm_out(result, at::zeros({}, mat2.options()), self, mat2, 0, 1); } - return self.type()._mm_out(result, self, mat2); + return at::_mm_out(result, self, mat2); } Tensor mv(const Tensor& self, const Tensor& vec) { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index cf6a9ece5c0d9d..822bea31fee942 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -206,7 +206,7 @@ Tensor repeat(const Tensor& self, IntList repeats) { Tensor xtensor = self.expand(padded_size); Tensor result = self.type().tensor(target_size); - Tensor urtensor = result.type().alias(result); + Tensor urtensor = at::alias(result); for (int64_t i = 0; i < xtensor.dim(); ++i) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 8fc896afe23a12..e3b6aaadf0f0b6 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1059,9 +1059,9 @@ std::tuple> _cudnn_rnn_backward( std::array output_mask ) { - auto grad_output = grad_output_r.defined() ? grad_output_r : output.type().zeros_like(output); - auto grad_hy = grad_hy_r.defined() ? grad_hy_r : hx.type().zeros_like(hx); - auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : cx.type().zeros_like(cx)) : grad_cy_r; + auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output); + auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx); + auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? 
grad_cy_r : at::zeros_like(cx)) : grad_cy_r; Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 2e37ad41a3b96e..afd8001734a9a8 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -784,7 +784,7 @@ Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, // sparse, dense -> sparse Tensor smm(const Tensor& self, const Tensor& mat2) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, result, self, mat2, 0.0, 1.0); + at::sspaddmm_out(result, result, self, mat2, 0.0, 1.0); return result; } @@ -792,7 +792,7 @@ Tensor smm(const Tensor& self, const Tensor& mat2) { Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, self, mat1, mat2, beta, alpha); + at::sspaddmm_out(result, self, mat1, mat2, beta, alpha); return result; } diff --git a/torch/csrc/utils/tensor_flatten.cpp b/torch/csrc/utils/tensor_flatten.cpp index e1bbd48316d6c6..29414368c7bd50 100644 --- a/torch/csrc/utils/tensor_flatten.cpp +++ b/torch/csrc/utils/tensor_flatten.cpp @@ -85,9 +85,8 @@ std::vector unflatten_sparse_tensors( std::vector outputs; outputs.reserve(tensors.size()); - auto & type = tensors[0].type(); for (size_t i = 0, num_tensors = tensors.size(); i < num_tensors; ++i) - outputs.emplace_back(type._sparse_coo_tensor_unsafe(indices[i], values[i], tensors[i].sizes())); + outputs.emplace_back(at::_sparse_coo_tensor_unsafe(indices[i], values[i], tensors[i].sizes())); return outputs; } From 727a4453aa742db4dc59079fe10d3b7a67e62086 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Tue, 11 Sep 2018 10:44:00 -0700 Subject: [PATCH 038/237] New Serialization Proto Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11166 Reviewed By: mingzhe09088 Differential Revision: D9623522 Pulled By: houseroad fbshipit-source-id: f21153034a398de7959404321d8534234cd58a40 --- caffe2/proto/torch.proto | 550 +++++++++++++++++++++++++++++++++++++ caffe2/utils/proto_wrap.cc | 15 + cmake/ProtoBuf.cmake | 2 +- 3 files changed, 566 insertions(+), 1 deletion(-) create mode 100644 caffe2/proto/torch.proto diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto new file mode 100644 index 00000000000000..9e626d8d845260 --- /dev/null +++ b/caffe2/proto/torch.proto @@ -0,0 +1,550 @@ +syntax = "proto2"; + +import "caffe2/proto/caffe2.proto"; + +package torch; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. 
While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of +// protobuf that is compatible with both protobuf v2 and v3. This means that we +// do not use any protobuf features that are only available in one of the two +// versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_NEWEST_ONNX = 0x0000000000000003; + + // PYTORCH IR VERSION + IR_VERSION_NEWEST = 0x0000000000000103; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. 
For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accomodate proto3 implementations. + optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. + optional string doc_string = 6; + + // Additional annotations, attributes are defined in Schema + // To be added as annotations: + // string engine + // string list control_input + // int64 is_gradient_op + // string debug_info + repeated AttributeProto annotations = 8; + + // Besides the node type, PyTorhc also serialize ATen function signature + optional caffe2.DeviceOption device_option = 51; + optional string aten_function = 52; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +// +// Model ==> Caffe2 MetaNetDef +// ==> PyTorch Module +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. 
+ // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + // The main graph, in single graph case, it is ONNX compatible. + optional GraphProto graph = 7; + + // The remaining nets in MetaNetDef. + // Submodules and methods in PyTorch. + repeated GraphProto methods = 15; + + // Named metadata values; keys should be distinct. + // Many meta data in MetaNetDef and preditor are piggy backed here. + // 1) project + // 2) model_class + // 3) internal_version + // 4) predictor_type + // 5) predictor_id + // 6) execute_plan + // 7) applicationSpecificInfo (another string map, need to verify it has no duplicate.) + // 8) engine + // 9) publish time + repeated StringStringEntryProto metadata_props = 14; + + // Model name + optional string name = 16; + + // Model name + repeated AttributeProto annotations = 17; + + // Mapping from list name to blob name list, must be string list type. + // Equivalent to blobs in MetaNetDef. + repeated AttributeProto blob_lists = 51; + + // Mapping from plan name to serialized plan, must be string list type. + // Equivalent to plans in MetaNetDef. + repeated AttributeProto plans = 52; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. +// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +// Graph ==> NetDef in Caffe2 +// ==> Submodule/Method in PyTorch +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // also appears in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. 
+ repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // Additional annotations. + repeated AttributeProto annotations = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // Advanced types + FLOAT16 = 10; + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + // Future extensions go here. + + // Special data type, real type information is stored in ValueInfoProto. + // If data_type is SPECIAL, raw_data should be used. + SPECIAL = 51; + } + + // The shape of the tensor. + repeated int64 dims = 1; + repeated int64 strides = 14; + + // The data type of the tensor. + optional DataType data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + optional int64 chuck_num = 51; + optional int64 chuck_id = 52; + } + // Used as offset in the external shared data. + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, INT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. 
+ optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // For double + // Complex64 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; + + // External data by file name + optional string external_data = 13; + + // If two tensors represent the same weights/content, use alias. + // Must exist a TensorProto named alias in the initializer list. + // To avoid the duplicate tensor in attribute, such as value in Constant node. + // This is useful, if everything is stored just in the proto. + optional string alias = 16; + + // Additional annotations. + repeated AttributeProto annotations = 17; + + // Device info + optional caffe2.DeviceOption device_option = 51; + + // For PyTorch serialized tensor. + optional int64 require_gradient = 52; + optional int64 is_buffer = 53; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + // To represent a scalar, using no dim to represent 0-d tensor. + repeated Dimension dim = 1; + + repeated Dimension stride = 51; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST be present for this version of the IR. 
+ optional TensorProto.DataType elem_type = 1; + optional TensorShapeProto shape = 2; + } + + // Sequence type: List, Tuple + message Sequence { + // elem_type and elem_type_list cannot appear together. + // If all the element types are the same, we use elem_type, + // otherwise, we specify the type of each element in elem_type_list. + optional TypeProto elem_type = 1; + repeated TypeProto elem_type_list = 51; + enum SequenceType { + UNDEFINED = 0; + LIST = 1; + TUPLE = 2; + } + optional SequenceType sequence_type = 52; + } + + // Map, (not necessary at this moment) + message Map { + optional TensorProto.DataType key_type = 1; + optional TypeProto value_type = 2; + } + + // Special type of blobs, based on the type_name, we can choose the right + // serializer and deserialzier. + message SpecialBlob { + optional string type_name = 1; + } + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + Sequence sequence_type = 4; + Map map_type = 5; + SpecialBlob special_type = 51; + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. + optional int64 version = 2; +} diff --git a/caffe2/utils/proto_wrap.cc b/caffe2/utils/proto_wrap.cc index b573968d9095ed..eb06524cae8417 100644 --- a/caffe2/utils/proto_wrap.cc +++ b/caffe2/utils/proto_wrap.cc @@ -29,3 +29,18 @@ void ShutdownProtobufLibrary() { } } // namespace caffe2 + +namespace torch { + +// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function +// used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { + return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); +} + +void ShutdownProtobufLibrary() { + ::google::protobuf::ShutdownProtobufLibrary(); +} + +} // namespace torch diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 9adc2a2be8347a..ed12b3b90e5480 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -179,7 +179,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. 
- COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) From d09041bd81f4264b82f3cb6c1b05e486b234efcc Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Tue, 11 Sep 2018 10:59:03 -0700 Subject: [PATCH 039/237] Add an option to statically link cuda (#10596) Summary: Need to link CUDA statically for benchmarking purpose. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10596 Reviewed By: llyfacebook Differential Revision: D9370738 Pulled By: sf-wind fbshipit-source-id: 4464d62473e95fe8db65b0bd3b301f262bf269bf --- scripts/build_windows.bat | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/build_windows.bat b/scripts/build_windows.bat index ebb76ccd5a3e30..281d20a8d6926a 100644 --- a/scripts/build_windows.bat +++ b/scripts/build_windows.bat @@ -10,6 +10,10 @@ setlocal SET ORIGINAL_DIR=%cd% SET CAFFE2_ROOT=%~dp0%.. +if NOT DEFINED CAFFE2_STATIC_LINK_CUDA ( + set CAFFE2_STATIC_LINK_CUDA=OFF +) + if NOT DEFINED CMAKE_BUILD_TYPE ( set CMAKE_BUILD_TYPE=Release ) @@ -57,6 +61,7 @@ cmake .. ^ -G%CMAKE_GENERATOR% ^ -DBUILD_TEST=OFF ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ + -DCAFFE2_STATIC_LINK_CUDA=%CAFFE2_STATIC_LINK_CUDA% ^ -DUSE_CUDA=%USE_CUDA% ^ -DTORCH_CUDA_ARCH_LIST=5.0 ^ -DUSE_NNPACK=OFF ^ From a566bc2f1140f9a51343656f8b1b47a6b72a5fc9 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 11 Sep 2018 11:20:07 -0700 Subject: [PATCH 040/237] Disable all CircleCI jobs (#11523) Summary: Disable all CircleCI jobs until we are ready to move forward with them. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11523 Differential Revision: D9774462 Pulled By: yf225 fbshipit-source-id: c5724e71eb68bac4df958b4f7bcc380050668b3c --- .circleci/config.yml | 144 +++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bcd2a5527b9835..2390d6ac5c0a87 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -845,36 +845,36 @@ workflows: version: 2 build: jobs: - - pytorch_linux_trusty_py2_7_9_build_test - - pytorch_linux_trusty_py2_7_build_test - - pytorch_linux_trusty_py3_5_build_test - - pytorch_linux_trusty_py3_6_gcc4_8_build_test - - pytorch_linux_trusty_py3_6_gcc5_4_build_test - - pytorch_linux_trusty_py3_6_gcc7_build_test - - pytorch_linux_trusty_pynightly_build_test - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_py3_clang5_asan_test: - requires: - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: - requires: - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_trusty_py2_7_9_build_test + # - pytorch_linux_trusty_py2_7_build_test + # - pytorch_linux_trusty_py3_5_build_test + # - pytorch_linux_trusty_py3_6_gcc4_8_build_test + # - pytorch_linux_trusty_py3_6_gcc5_4_build_test + # - pytorch_linux_trusty_py3_6_gcc7_build_test + # - pytorch_linux_trusty_pynightly_build_test + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_py3_clang5_asan_test: + # requires: + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: + # requires: + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_py3_test: @@ -882,48 +882,48 @@ workflows: # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: - requires: - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - 
caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_test: - requires: - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_test: - requires: - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_test: - requires: - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_test: - requires: - - caffe2_conda2_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_9_ubuntu14_04_build - - caffe2_py2_clang3_8_ubuntu16_04_build - - caffe2_py2_clang3_9_ubuntu16_04_build - - caffe2_py2_gcc6_ubuntu16_04_build - - caffe2_py2_gcc7_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_android_ubuntu16_04_build - - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_centos7_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_test: + # requires: + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_test: + # requires: + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_test: + # requires: + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_test: + # requires: + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_9_ubuntu14_04_build + # - caffe2_py2_clang3_8_ubuntu16_04_build + # - caffe2_py2_clang3_9_ubuntu16_04_build + # - caffe2_py2_gcc6_ubuntu16_04_build + # - caffe2_py2_gcc7_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_android_ubuntu16_04_build + # - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_centos7_build # - caffe2_py2_ios_macos10_13_build # - caffe2_py2_system_macos10_13_build From 781737f84c5f6e067e8e02648c8bfc3a2df2dbaa Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 12:07:01 -0700 Subject: [PATCH 041/237] Remove time prefix from rsync (#11525) Summary: This fails with zsh saying "time: command not found". 
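Most likely the root cause is that `time` is a shell keyword in bash/zsh, so it is only recognized when it appears literally at the start of a command. When it comes out of a variable expansion such as `$SYNC_COMMAND`, the shell instead looks up an external `time` binary, and zsh errors out if none is installed. A sketch of the failure mode (whether it actually fails depends on whether something like /usr/bin/time exists on the machine):

```sh
SYNC_COMMAND="time rsync -lptgoD"
$SYNC_COMMAND src/ dst/       # keyword not recognized after expansion -> "time: command not found"
time rsync -lptgoD src/ dst/  # fine: `time` is in keyword position here
```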
cc soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11525 Differential Revision: D9772522 Pulled By: apaszke fbshipit-source-id: b80d108fa6b174d68ada08a9fdbf7260ee37e08f --- tools/build_pytorch_libs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 3134bcafa6f968..579cde711eae43 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -12,7 +12,7 @@ set -ex SYNC_COMMAND="cp" if [ -x "$(command -v rsync)" ]; then - SYNC_COMMAND="time rsync -lptgoD" + SYNC_COMMAND="rsync -lptgoD" fi # Options for building only a subset of the libraries From fbc17321fdd96ce4dcc275ff61264d04f2fbd6e2 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 11 Sep 2018 12:24:52 -0700 Subject: [PATCH 042/237] Update pybind11 to fix Python 3.7 support for script (#11473) Summary: Fixes https://github.com/pytorch/pytorch/issues/11419 In particular pulling in https://github.com/pybind/pybind11/pull/1454 as well as pending bugfix in https://github.com/pybind/pybind11/pull/1517 (documenting in comment) Pull Request resolved: https://github.com/pytorch/pytorch/pull/11473 Differential Revision: D9776003 Pulled By: jamesr66a fbshipit-source-id: a225dcfb66c06bcae98fd2508d9e690c24be551a --- third_party/pybind11 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/pybind11 b/third_party/pybind11 index add56ccdcac23a..5c8746ff135abb 160000 --- a/third_party/pybind11 +++ b/third_party/pybind11 @@ -1 +1 @@ -Subproject commit add56ccdcac23a6c522a2c1174a866e293c61dab +Subproject commit 5c8746ff135abb390bf95944be593e895a586a50 From 5952acc041db96b99dc0d463a28d74da9d3b28e5 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 11 Sep 2018 12:29:47 -0700 Subject: [PATCH 043/237] Add "merge to master" step before build in CircleCI (#11443) Summary: This PR adds the "merge to master" step before the build step in CircleCI, so that all PR commits are built against master instead of against the PR's branch. Note that all PRs still need to rebase to master to pick up this new config, so it won't apply to old PR branches retroactively. To check in CI: make sure it's performing the git merge to master appropriately in "Merge Onto Master" step. 
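For anyone who wants to reproduce the new step locally, it boils down to roughly the following (using the same CircleCI variables the config uses; details such as fetch depth are omitted):

```sh
git fetch origin +refs/heads/master:refs/remotes/origin/master
git checkout -f "$CIRCLE_SHA1"             # the PR commit under test
git merge --no-edit --no-ff origin/master  # build and test the merge result
```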
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11443 Differential Revision: D9775628 Pulled By: yf225 fbshipit-source-id: 8083db6b098d234a44ae4481f40a486e9906f6f8 --- .circleci/config.yml | 89 +++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2390d6ac5c0a87..476e9867cf2a4a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,11 +5,37 @@ docker_config_defaults: &docker_config_defaults aws_access_key_id: AKIAJ2J6FIG5OSZTQ3IA aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} +# NOTE: We only perform the merge in build step and not in test step, because +# all source files will be shared from build to test +merge_pull_request_onto_master: &merge_pull_request_onto_master + name: Merge Onto Master + no_output_timeout: "10h" + command: | + if [[ "${CIRCLE_BRANCH}" != "master" ]]; then + git config --global user.email "circleci.ossci@gmail.com" + git config --global user.name "CircleCI" + + git config remote.origin.url https://github.com/pytorch/pytorch.git + git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master + git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=50 --quiet + + export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master` + echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET} + export GIT_COMMIT=${CIRCLE_SHA1} + echo "GIT_COMMIT: " ${GIT_COMMIT} + + git checkout -f ${GIT_COMMIT} + git reset --hard ${GIT_COMMIT} + git merge --no-edit --no-ff ${GIT_MERGE_TARGET} + fi + pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults resource_class: large working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -22,7 +48,7 @@ pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh .jenkins/pytorch/test.sh @@ -31,6 +57,8 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -46,16 +74,18 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init || git submodule update --init || git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh - mkdir -p pytorch-ci-env/ - cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch pytorch-ci-env/torch - cp -r build/bin pytorch-ci-env/cpp_test_bin + export PYTORCH_CI_ENV_DIR=/var/lib/jenkins/pytorch-ci-env + mkdir -p ${PYTORCH_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${PYTORCH_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch ${PYTORCH_CI_ENV_DIR}/torch + cp -r build/bin ${PYTORCH_CI_ENV_DIR}/cpp_test_bin if [ -d 
"../cpp-build" ]; then - cp -r ../cpp-build pytorch-ci-env/cpp-build + cp -r ../cpp-build ${PYTORCH_CI_ENV_DIR}/cpp-build fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/pytorch-ci-env + root: /var/lib/jenkins/pytorch-ci-env paths: - "*" @@ -63,7 +93,6 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -107,12 +136,16 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step + echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env + mkdir -p /home/circleci/project/build cp -r /opt/workspace/cpp_test_bin /home/circleci/project/build/bin docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" @@ -122,9 +155,9 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults docker cp "/opt/workspace/cpp-build" "$id:/var/lib/jenkins/cpp-build" fi if [ -n "${MULTI_GPU}" ]; then - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash else - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash fi caffe2_linux_build_defaults: &caffe2_linux_build_defaults @@ -132,6 +165,8 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -152,7 +187,7 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults cd third_party/onnx && git fetch --tags --progress origin +refs/pull/*:refs/remotes/origin/pr/* && cd - # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Ensure jenkins can write to the ccache root dir. 
sudo chown jenkins:jenkins "${HOME}/.ccache" @@ -189,16 +224,18 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults fi # Copy all necessary binaries to shared workspace - mkdir -p caffe2-ci-env - cp -r third_party/onnx caffe2-ci-env/onnx + export CAFFE2_CI_ENV_DIR=/var/lib/jenkins/caffe2-ci-env + mkdir -p ${CAFFE2_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${CAFFE2_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r third_party/onnx ${CAFFE2_CI_ENV_DIR}/onnx if [ -d "/usr/local/caffe2" ]; then - cp -r /usr/local/caffe2 caffe2-ci-env/caffe2 + cp -r /usr/local/caffe2 ${CAFFE2_CI_ENV_DIR}/caffe2 fi if [ -d "/opt/conda" ]; then - cp -r /opt/conda caffe2-ci-env/conda_env + cp -r /opt/conda ${CAFFE2_CI_ENV_DIR}/conda_env fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/caffe2-ci-env + root: /var/lib/jenkins/caffe2-ci-env paths: - "*" @@ -206,7 +243,6 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -250,6 +286,7 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache @@ -315,6 +352,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -326,7 +365,7 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults brew install cmake # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Reinitialize path (see man page for path_helper(8)) eval `/usr/libexec/path_helper -s` @@ -525,6 +564,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -544,9 +585,12 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh .jenkins/pytorch/macos-build.sh + + # TODO: need to share source files from build to test, when macOS builds are enabled + - persist_to_workspace: root: /Users/distiller/pytorch-ci-env paths: @@ -556,7 +600,6 @@ jobs: macos: xcode: "9.0" steps: - - checkout - run: name: Prepare workspace command: | @@ -570,9 +613,9 @@ jobs: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3 no_output_timeout: "10h" command: | + # TODO: need to share source files from build to test, when macOS builds are enabled set -ex export IN_CIRCLECI=1 - git submodule update --init chmod a+x .jenkins/pytorch/macos-test.sh .jenkins/pytorch/macos-test.sh @@ -581,6 +624,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -616,7 +661,7 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh 
.jenkins/pytorch/macos-build.sh From c56a7cfc37aae8b5e1515da37b314ed03bff8641 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Tue, 11 Sep 2018 12:40:56 -0700 Subject: [PATCH 044/237] More use of AT_CHECK and AT_ERROR (#11457) Summary: Considering these increase the size of the message stack, I didn't touch the code outside `ATen/native` Differential Revision: D9754283 Pulled By: soumith fbshipit-source-id: 04198ec4fd0c4abae09eeba92c493a783408537a --- aten/src/ATen/native/Convolution.cpp | 147 +++++++----------- aten/src/ATen/native/Indexing.cpp | 52 +++---- aten/src/ATen/native/Normalization.cpp | 83 ++++------ aten/src/ATen/native/RoiPooling.cpp | 2 +- aten/src/ATen/native/SpectralOpsUtils.h | 2 +- aten/src/ATen/native/TensorShape.cpp | 108 +++++-------- aten/src/ATen/native/cuda/CuFFTPlanCache.h | 30 ++-- aten/src/ATen/native/cuda/SpectralOps.cu | 6 +- .../ATen/native/cudnn/AffineGridGenerator.cpp | 4 +- aten/src/ATen/native/cudnn/BatchNorm.cpp | 4 +- aten/src/ATen/native/cudnn/Conv.cpp | 38 ++--- aten/src/ATen/native/cudnn/GridSampler.cpp | 4 +- aten/src/ATen/native/cudnn/LossCTC.cpp | 2 +- aten/src/ATen/native/cudnn/RNN.cpp | 147 +++++++----------- .../ATen/native/miopen/BatchNorm_miopen.cpp | 4 +- aten/src/ATen/native/miopen/Conv_miopen.cpp | 36 ++--- aten/src/ATen/native/mkl/SpectralOps.cpp | 20 +-- aten/src/ATen/native/mkldnn/Conv.cpp | 8 +- 18 files changed, 274 insertions(+), 423 deletions(-) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index f359d67c72e786..b3fcff1b2ccd29 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -155,70 +155,51 @@ static void check_input_shape_forward(const at::Tensor& input, int64_t k = input.ndimension(); int64_t weight_dim = weight.ndimension(); - if (weight_dim != k) { - std::stringstream ss; - ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim - << "-dimensional weight " << weight.sizes() << ", but got input of size " - << input.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) < groups) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be at least " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) % groups != 0) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be divisible by " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(weight_dim == k, + "Expected ", weight_dim, "-dimensional input for ", weight_dim, + "-dimensional weight ", weight.sizes(), ", but got input of size ", + input.sizes(), " instead"); + AT_CHECK(weight.size(0) >= groups, + "Given groups=", groups, ", expected weight to be at least ", groups, + " at dimension 0, but got weight of size ", weight.sizes(), " instead"); + AT_CHECK(weight.size(0) % groups == 0, + "Given groups=", groups, ", expected weight to be divisible by ", + groups, " at dimension 0, but got weight of size ", weight.sizes(), + " instead"); if (!transposed) { - if (input.size(1) != (weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given groups=" << groups << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << (weight.size(1) * groups) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); 
- } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(0))) { - std::stringstream ss; - ss << "Given weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(0) << " elements" - << ", but got bias of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == (weight.size(1) * groups), + "Given groups=", groups, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", + (weight.size(1) * groups), " channels, but got ", input.size(1), + " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(0)), + "Given weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(0), " elements", + ", but got bias of size ", bias.sizes(), " instead"); } else { // transposed - if (input.size(1) != weight.size(0)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << weight.size(0) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(1) * groups << " elements" - << ", but got bias of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == weight.size(0), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", weight.size(0), + " channels, but got ", input.size(1), " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(1) * groups), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(1) * groups, " elements", + ", but got bias of size ", bias.sizes(), " instead"); } } static auto view4d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 3) throw std::runtime_error("expected 3D tensor"); + AT_CHECK(tensor.ndimension() == 3, + "expected 3D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.unsqueeze(2); } static auto view3d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 4) throw std::runtime_error("expected 4D tensor"); + AT_CHECK(tensor.ndimension() == 4, + "expected 4D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.squeeze(2); } @@ -293,7 +274,7 @@ static inline std::vector convolution_expand_param_if_needed( ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " << "dimensions, but got " << param_name << "=" << list_param; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } else { return list_param.vec(); } @@ -311,9 +292,7 @@ at::Tensor _convolution( auto k = weight.ndimension(); int64_t dim = k - 2; - if (dim <= 0) { - throw std::runtime_error("weight should have at least two dimensions"); - } + AT_CHECK(dim > 0, "weight should at least have at least two dimensions"); ConvParams params; params.stride = convolution_expand_param_if_needed(stride_, "stride", dim); @@ -326,8 +305,8 @@ 
at::Tensor _convolution( params.deterministic = deterministic; params.cudnn_enabled = cudnn_enabled; - if (params.is_padding_neg()) throw std::runtime_error("negative padding is not supported"); - if (params.is_output_padding_neg()) throw std::runtime_error("negative output_padding is not supported"); + AT_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + AT_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); check_input_shape_forward(input, weight, bias, params.groups, params.transposed); @@ -349,16 +328,12 @@ at::Tensor _convolution( output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); } else if (params.use_cudnn(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::cudnn_convolution_transpose( @@ -370,16 +345,12 @@ at::Tensor _convolution( params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); } } else if (params.use_miopen(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::miopen_convolution_transpose( @@ -392,16 +363,12 @@ at::Tensor _convolution( } } else if (params.use_mkldnn(input)) { #if AT_MKLDNN_ENABLED() - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", 
bias.type().toString(), + ") should be the same"); output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); #endif @@ -487,7 +454,7 @@ at::Tensor _convolution_nogroup( } } - throw std::runtime_error("unsupported ConvNd parameters"); + AT_ERROR("unsupported ConvNd parameters"); } static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 288fa283abe660..5566fd397320aa 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -39,7 +39,7 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, ss << "The shape of the mask " << mask.sizes() << " at index " << maskIdx; ss << " does not match the shape of the indexed tensor " << self.sizes(); ss << " at index " << idx; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } static void checkIndexTensorTypes(TensorList indices) { @@ -47,9 +47,8 @@ static void checkIndexTensorTypes(TensorList indices) { if (tensor.defined()) { auto& type = tensor.type(); auto scalarType = type.scalarType(); - if (scalarType != kLong && scalarType != kByte) { - throw std::runtime_error("tensors used as indices must be long or byte tensors"); - } + AT_CHECK(scalarType == kLong || scalarType == kByte, + "tensors used as indices must be long or byte tensors"); } } } @@ -146,12 +145,10 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) if (index.numel() != 0) { auto max_idx = index.max().toCLong(); auto min_idx = index.min().toCLong(); - if (max_idx >= dim_size) { - AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } - if (min_idx < -dim_size) { - AT_ERROR("index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } + AT_CHECK(max_idx < dim_size, + "index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + AT_CHECK(min_idx >= -dim_size, + "index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); } return index.remainder(dim_size); } @@ -230,9 +227,8 @@ static std::tuple makeLinearIndex(Tensor self, TensorList orig) } Tensor index(const Tensor & self, TensorList indices) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -240,9 +236,8 @@ Tensor index(const Tensor & self, TensorList indices) { } Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -252,9 +247,8 @@ Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) } Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", 
indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -265,18 +259,14 @@ Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { dim = maybe_wrap_dim(dim, self.dim()); - if (index.dim() >= 2) { - AT_ERROR( - "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); - } + AT_CHECK(index.dim() < 2, + "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + int64_t numIndices = index.numel(); - if (source.dim() == 0 && numIndices != 1) { - AT_ERROR( - "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); - } - if (index.type().scalarType() != ScalarType::Long) { - AT_ERROR("index_copy_(): Expected LongTensor for index"); - } + AT_CHECK(source.dim() != 0 || numIndices == 1, + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + AT_CHECK(index.type().scalarType() == ScalarType::Long, + "index_copy_(): Expected LongTensor for index"); // Check that source and destination slices have the same size auto selfSlicedSizes = self.sizes().vec(); @@ -294,7 +284,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << dim; ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } if (source.dim() > 0 && numIndices != source.size(dim)) { AT_ERROR( diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 24d8a41fb50271..750e42568715d7 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -11,11 +11,8 @@ namespace at { namespace native { namespace { void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ - if (actual != expected){ - std::stringstream ss; - ss << arg_name << " should contain " << expected << " elements not " << actual ; - throw std::runtime_error(ss.str()); - } + AT_CHECK(actual == expected, + arg_name, " should contain ", expected, " elements not ", actual); } } @@ -28,12 +25,12 @@ Tensor batch_norm( if (running_mean.defined()) { check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); } else if (!training) { - throw std::runtime_error("running_mean must be defined in evaluation mode"); + AT_ERROR("running_mean must be defined in evaluation mode"); } if (running_var.defined()) { check_dims_match_num_input_features("running_var", num_features, running_var.numel()); } else if (!training) { - throw std::runtime_error("running_var must be defined in evaluation mode"); + AT_ERROR("running_var must be defined in evaluation mode"); } if (weight.defined()) { check_dims_match_num_input_features("weight", num_features, weight.numel()); @@ -89,29 +86,19 @@ Tensor layer_norm(const Tensor& input, IntList normalized_shape, int64_t normalized_ndim = normalized_shape.size(); - if (normalized_ndim < 1) { - std::stringstream ss; - ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " - << "containing at least one 
element, but got normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape=", + normalized_shape); - if (weight.defined() && !weight.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected weight to be of same shape as normalized_shape, but got " - << "weight of shape " << weight.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && !bias.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected bias to be of same shape as normalized_shape, but got " - << "bias of shape " << bias.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", weight.sizes(), " and normalized_shape=", + normalized_shape); + AT_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", bias.sizes(), " and normalized_shape=", + normalized_shape); auto input_shape = input.sizes(); auto input_ndim = input.dim(); @@ -125,7 +112,7 @@ Tensor layer_norm(const Tensor& input, IntList normalized_shape, ss << ", " << size; } ss << "], but got input of size" << input_shape; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } int64_t n = 1; @@ -159,29 +146,19 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, int64_t b = input.size(0); int64_t c = input.size(1); - if (c % num_groups != 0) { - std::stringstream ss; - ss << "Expected number of channels in input to be divisible by " - << "num_groups, but got input of shape " << input.sizes() << " and " - << "num_groups=" << num_groups; - throw std::runtime_error(ss.str()); - } - - if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) { - std::stringstream ss; - ss << "Expected weight to be a vector of size equal to the number of " - << "channels in input, but got weight of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) { - std::stringstream ss; - ss << "Expected bias to be a vector of size equal to the number of " - << "channels in input, but got bias of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(c % num_groups == 0, + "Expected number of channels in input to be divisible by ", + "num_groups, but got input of shape ", input.sizes(), " and " + "num_groups=", num_groups); + + AT_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), + "Expected weight to be a vector of size equal to the number of ", + "channels in input, but got weight of shape ", weight.sizes(), + " and input of shape ", input.sizes()); + AT_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), + "Expected bias to be a vector of size equal to the number of ", + "channels in input, but got bias of shape ", weight.sizes(), + " and input of shape ", input.sizes()); // Apply group norm auto input_reshaped = input.contiguous().view({1, b * num_groups, -1}); diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp index 
5995e43ef1e536..1a089a9f473c17 100644 --- a/aten/src/ATen/native/RoiPooling.cpp +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -134,7 +134,7 @@ Tensor RoiPooling2d_backward_cpu( double spatialScale, const Tensor& gradOutput, const Tensor& argmaxes) { - throw std::runtime_error("not implemented"); + AT_ERROR("not implemented"); } } diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index 7518d1f945a5fd..875c7aa12b68cf 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -51,7 +51,7 @@ inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, std::ostringstream ss; ss << "expected real signal size " << expected_size << " is incompatible " << "with onesided complex frequency size " << complex_size; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 822bea31fee942..c22374d0964e81 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -20,9 +20,8 @@ std::vector broadcast_tensors(TensorList tensors) { static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; - if (t.dim() == 0) { - AT_ERROR("zero-dimensional tensor (at position ", i, ") cannot be concatenated"); - } + AT_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); } } @@ -39,12 +38,11 @@ Tensor cat(TensorList tensors, int64_t dim) { } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { - if (self.dim() == 0) { - AT_ERROR("chunk expects at least a 1-dimensional tensor"); - } - if (chunks <= 0) { - AT_ERROR("chunk expects `chunks` to be greater than 0, got: ", chunks); - } + AT_CHECK(self.dim() > 0, + "chunk expects at least a 1-dimensional tensor"); + AT_CHECK(chunks > 0, + "chunk expects `chunks` to be greater than 0, got: ", chunks); + int64_t split_size = (self.size(dim) + chunks - 1) / chunks; // We need to call split_with_sizes in the case where split_size and dimension size are 0, because @@ -117,14 +115,11 @@ Tensor expand(const Tensor& self, IntList size, bool implicit) { // distinguish between expands inserted by broadcasts and those explicitly // requested by the user, because it is legal to remove implicit expands // from the graph, but not legal to remove the explicit ones. - if (size.size() < (size_t)self.dim()) { - std::ostringstream ss; - ss << "expand(" << self.type() << "{" << self.sizes() << "}, size=" << size - << "): the number of sizes provided (" << size.size() << ") " - << "must be greater or equal to the number of dimensions in the tensor (" - << self.dim() << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(size.size() >= (size_t)self.dim(), + "expand(", self.type(), "{", self.sizes(), "}, size=", size, + "): the number of sizes provided (", size.size(), ") ", + "must be greater or equal to the number of dimensions in the tensor (", + self.dim(), ")"); std::vector expandedSizes; std::vector expandedStrides; @@ -159,17 +154,15 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. 
start = maybe_wrap_dim(start, cur_size); } - if (length < 0 || start > cur_size - length) { - AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); - } + AT_CHECK(length >= 0 && start <= cur_size - length, + "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); return at::slice(self, dim, start, start + length, 1); } Tensor permute(const Tensor& self, IntList dims) { auto nDims = self.dim(); - if (dims.size() != (size_t)nDims) { - AT_ERROR("number of dims don't match in permute"); - } + AT_CHECK(dims.size() == (size_t)nDims, + "number of dims don't match in permute"); auto oldSizes = self.sizes(); auto oldStrides = self.strides(); std::vector newSizes(nDims); @@ -177,9 +170,8 @@ Tensor permute(const Tensor& self, IntList dims) { std::vector seen(nDims); for (int64_t i = 0; i < nDims; i++) { auto dim = maybe_wrap_dim(dims[i], nDims); - if (seen[dim]) { - AT_ERROR("repeated dim in permute"); - } + AT_CHECK(!seen[dim], + "repeated dim in permute"); seen[dim] = true; newSizes[i] = oldSizes[dim]; newStrides[i] = oldStrides[dim]; @@ -188,9 +180,8 @@ Tensor permute(const Tensor& self, IntList dims) { } Tensor repeat(const Tensor& self, IntList repeats) { - if (repeats.size() < (size_t)self.dim()) { - AT_ERROR("Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); - } + AT_CHECK(repeats.size() >= (size_t)self.dim(), + "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Add new leading dimensions to the tensor if the // number of target dimensions is larger than the @@ -238,12 +229,9 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); auto size = self.size(dim); - if (index < -size || index >= size) { - std::stringstream ss; - ss << "select(): index " << index << " out of range for tensor of size "; - ss << self.sizes() << " at dimension " << dim; - throw std::runtime_error(ss.str()); - } + AT_CHECK(index >= -size && index < size, + "select(): index ", index, " out of range for tensor of size ", + self.sizes(), " at dimension ", dim); if (index < 0) { index += size; } @@ -261,10 +249,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ dim = maybe_wrap_dim(dim, ndim); auto sizes = self.sizes().vec(); auto strides = self.strides().vec(); - if (step <= 0) { - // TODO: support negative strides - throw std::runtime_error("slice step must be positive"); - } + // TODO: support negative strides + AT_CHECK(step > 0, "slice step must be positive"); if (start < 0) { start += sizes[dim]; } @@ -322,22 +308,15 @@ std::vector split_with_sizes(const Tensor& self, IntList split_sizes, in for (i = 0; i < num_splits; ++i) { auto length = split_sizes[i]; - if (length < 0) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes have only non-negative " - << "entries, but got split_sizes=" << split_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(length >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", split_sizes); splits[i] = self.narrow(dim, start_idx, length); start_idx += length; } - if (start_idx != dim_size) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes to sum exactly to " - << dim_size << " (input tensor's size at dimension " << dim << "), " - << "but got split_sizes=" << split_sizes; - throw 
std::runtime_error(ss.str()); - } + AT_CHECK(start_idx == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", dim_size, + " (input tensor's size at dimension ", dim, "), ", "but got split_sizes=", split_sizes); return splits; } @@ -350,28 +329,24 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d } Tensor stack(TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat(get_stack_inputs(tensors, dim), dim); } Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat_out(result, get_stack_inputs(tensors, dim), dim); } static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { int64_t nsparseDims = self._sparseDims(); - if (dim0 >= nsparseDims || dim1 >= nsparseDims) { - AT_ERROR( - "sparse transpose: transposed dimensions must be sparse ", - "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); - } + AT_CHECK(dim0 < nsparseDims && dim1 < nsparseDims, + "sparse transpose: transposed dimensions must be sparse ", + "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); if (self._indices().numel() == 0 && self._values().numel() == 0) { auto sizes = self.sizes().vec(); @@ -442,10 +417,9 @@ static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { int64_t sparseDims = self._sparseDims(); int64_t denseDims = self._denseDims(); - if (!(sparseDims == 2 && denseDims == 0)) { - AT_ERROR(fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", - sparseDims, " sparse and ", denseDims, " dense dimensions"); - } + AT_CHECK(sparseDims == 2 && denseDims == 0, + fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", + sparseDims, " sparse and ", denseDims, " dense dimensions"); } else if (self.dim() != 2) { AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D"); } diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 37c8f3a364f75a..8715a9ef460ee6 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -112,22 +112,16 @@ class CuFFTConfig { if (input.type().scalarType() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 auto dev_prop = at::cuda::getCurrentDeviceProperties(); - if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { - std::ostringstream ss; - ss << "cuFFT doesn't support signals of half type with compute " - << "capability less than SM_53, but the device containing input half " - << "tensor only has SM_" << dev_prop->major << dev_prop->minor; - throw std::runtime_error(ss.str()); - } + AT_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); for (int64_t i = 0; i < signal_ndim; i++) { auto signal_size = checked_signal_sizes[i]; - if (!is_pow_of_two(signal_size)) { - std::ostringstream ss; - ss << "cuFFT 
doesn't support signals of half type with size at any " - << "dimension that is not a power of two, but got a signal size of " - << checked_signal_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(is_pow_of_two(signal_size), + "cuFFT doesn't support signals of half type with size at any ", + "dimension that is not a power of two, but got a signal size of ", + checked_signal_sizes); } clone_input |= input.stride(signal_ndim) != 1; } @@ -212,7 +206,7 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_R2C; } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -222,13 +216,13 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_D2Z; } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else @@ -249,7 +243,7 @@ class CuFFTConfig { std::ostringstream ss; ss << "cuFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #endif diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index c82f4e7afb87de..38b1dddb496276 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -206,7 +206,7 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecR2C(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -220,13 +220,13 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(), diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index 6856c465e9e8ef..463d4ffea3cf04 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -12,13 +12,13 @@ namespace at { namespace native { Tensor cudnn_affine_grid_generator_forward( const Tensor& theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); } }} diff 
--git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 7cd7466a285035..d54fe256b29152 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -13,7 +13,7 @@ std::tuple cudnn_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("cudnn_batch_norm: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm: ATen not compiled with cuDNN support"); } std::tuple cudnn_batch_norm_backward( @@ -21,7 +21,7 @@ std::tuple cudnn_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index a94916532a3f02..afbd7653aefa67 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -14,61 +14,61 @@ at::Tensor cudnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); + 
AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } }} @@ -194,16 +194,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -211,7 +207,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } @@ -449,7 +445,7 @@ perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { return perfResults[i]; } } - throw std::runtime_error("no deterministic convolution algorithms available in CuDNN"); + AT_ERROR("no deterministic convolution algorithms available in CuDNN"); } else { return perfResults[0]; } diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 1ce92cf7e18d2a..e859344bcc3691 100644 --- 
a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -11,13 +11,13 @@ namespace at { namespace native { Tensor cudnn_grid_sampler_forward( const Tensor& input_t, const Tensor& grid_t) { - throw std::runtime_error("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); } std::tuple cudnn_grid_sampler_backward( const Tensor& input_t, const Tensor& grid_t, const Tensor& grad_output_t) { - throw std::runtime_error("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); } }} diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 966aa20e0a128d..98c0cb7918f02f 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -14,7 +14,7 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { - throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); + AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); } }} diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index e3b6aaadf0f0b6..876590409c43c4 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -22,7 +22,7 @@ Tensor _cudnn_rnn_flatten_weight( int64_t fn_num_layers, bool batch_first, bool fn_bidirectional ) { - throw std::runtime_error("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); } std::tuple _cudnn_rnn( @@ -34,7 +34,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, const Tensor& fn_dropout_state ) { - throw std::runtime_error("_cudnn_rnn: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } std::tuple> _cudnn_rnn_backward( @@ -47,11 +47,11 @@ std::tuple> _cudnn_rnn_backward( const Tensor& dropout_state, const Tensor& reserve, std::array output_mask ) { - throw std::runtime_error("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); } Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) { - throw std::runtime_error("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); } }} // namespace at::native @@ -123,7 +123,7 @@ namespace { { std::ostringstream oss; oss << "unrecognized cuDNN RNN mode " << fn_mode; - throw std::runtime_error(oss.str()); + AT_ERROR(oss.str()); } } } @@ -131,7 +131,7 @@ namespace { void set_bidirectional(bool fn_bidirectional) { bidirectional = fn_bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } - + void set_algo(cudnnRNNAlgo_t algo){ this->algo = algo; } @@ -570,7 +570,7 @@ namespace { if (prop->major == 7 && rnn.datatype == CUDNN_DATA_HALF && !tensors.is_input_packed()) { if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && tensors.input_size <=1024 && rnn.num_directions() == 1 && rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){ - //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, + //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, //weed them out if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){ if ((tensors.seq_length >=40 && bsize <=128) || @@ -599,9 +599,8 @@ Tensor _cudnn_rnn_flatten_weight( bool fn_bidirectional ) { - if (weight_arr.size() == 0) { - throw std::runtime_error("_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); - } + AT_CHECK(weight_arr.size() > 0, + "_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); auto any_param = weight_arr[0]; @@ -671,9 +670,8 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } // TODO: can batch_first be a wrapper around this function? @@ -685,12 +683,10 @@ std::tuple _cudnn_rnn( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto output = input.type().tensor(output_size); @@ -723,11 +719,8 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); size_t workspace_size; auto x_descs_arr = descs.get_x_descs(); @@ -817,9 +810,8 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -833,12 +825,10 @@ std::tuple _cudnn_rnn_backward_input( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto dy = grad_output.contiguous(); @@ -851,42 +841,25 @@ std::tuple _cudnn_rnn_backward_input( AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx 
for non-LSTM RNN"); auto dcx = cx.defined() ? cx.type().tensor(hidden_size) : Tensor(); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (!output.sizes().equals(output_size)) { - std::ostringstream oss; - oss << "Expected output size " << IntList{output_size} << ", got " << output.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); - throw std::runtime_error(oss.str()); - } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } - if (dhy.defined() && !dhy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_hidden size " << IntList{hidden_size} << ", got " << dhy.sizes(); - throw std::runtime_error(oss.str()); - } - if (dcy.defined() && !dcy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_cell size " << IntList{hidden_size} << ", got " << dcy.sizes(); - throw std::runtime_error(oss.str()); - } - if (!dhy.is_cuda() || !dy.is_cuda() || (dcy.defined() && !dcy.is_cuda())) { - throw std::runtime_error("Gradients aren't CUDA tensors"); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(output.sizes().equals(output_size), + "Expected output size ", IntList{output_size}, ", got ", output.sizes()); + + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); + AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), + "Expected d_hidden size ", IntList{hidden_size}, ", got ", dhy.sizes()); + AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), + "Expected d_cell size ", IntList{hidden_size}, ", got ", dcy.sizes()); + + AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors); fn.rnn.set_algo(algo); @@ -959,9 +932,8 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -973,28 +945,21 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected 
hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); const auto& y = output; diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 997431b7a86170..c9d25780bd65d3 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -14,7 +14,7 @@ std::tuple miopen_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("miopen_batch_norm: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm: ATen not compiled with MIOpen support"); } std::tuple miopen_batch_norm_backward( @@ -22,7 +22,7 @@ std::tuple miopen_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 1ae36edd5c7b76..9aeaad73558617 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -13,61 +13,61 @@ at::Tensor miopen_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); } at::Tensor 
miopen_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } }} @@ -180,16 +180,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, 
expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -197,7 +193,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index c3451824c05113..2c81d69d3b8435 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -12,7 +12,7 @@ Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) { - throw std::runtime_error("fft: ATen not compiled with MKL support"); + AT_ERROR("fft: ATen not compiled with MKL support"); } }} @@ -191,12 +191,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, osize = output_sizes[i]; istride = complex_input ? input.stride(i) >> 1 : input.stride(i); ostride = onumel; - if (isize > MKL_LONG_MAX || osize > MKL_LONG_MAX || ostride > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); if (!need_contiguous && istride > MKL_LONG_MAX) { // If we didn't plan to contiguous-fy but the `istride` exceeds bound, // check if we can stride (equal to `inumel`) get back within bound if @@ -205,12 +201,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, // fine as `inumel` is non-decreasing. 
need_contiguous = true; } - if (need_contiguous && inumel > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); inumel *= isize; onumel *= osize; } @@ -227,7 +219,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, std::ostringstream ss; ss << "MKL FFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } // signal type DFTI_CONFIG_VALUE signal_type; diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 00f4e8f95b92d4..ddbd6977645e74 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -9,25 +9,25 @@ namespace at { namespace native { at::Tensor mkldnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, IntList padding, IntList stride, IntList dilation, int64_t groups) { - throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); } at::Tensor mkldnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward_weights( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) { - throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); } }} From c1dce21fd5505085ef148aa9d70a312d8d14257f Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Tue, 11 Sep 2018 13:06:47 -0700 Subject: [PATCH 045/237] Cuda TensorAccessor (#11373) Summary: Provide a TensorAccessor-Like interface for CUDA as discussed in #8366. Compared to TensorAccessor - the CUDATensorAccessor copies the sizes and strides while on the host (I didn't implement a host indexing function, though) to enable transfer to the device, on the device, `[]` works like for TensorAccessors, - instantiation is from TensorAccessors in order to allow using `.accessor<..>`. The drawback is that it you cannot use `auto` for the variable declaration, but the alternative would be a cuda-specific `.accessor`-like function, - there is a PtrTraits argument to enable `__restrict__`, Example for the intended use: ``` ... template __global__ void apply_homography_2d_kernel(cuda::CUDATensorAccessor dest_a, cuda::CUDATensorAccessor src_a, cuda::CUDATensorAccessor transform) { ... 
} template Tensor apply_homography_2d_template(Tensor& res, const Tensor& image, const Tensor& transform) { ... cuda::CUDATensorAccessor image_a(image.accessor()); cuda::CUDATensorAccessor res_a(res.accessor()); cuda::CUDATensorAccessor transform_a(transform.accessor()); auto stream = at::cuda::getCurrentCUDAStream(); apply_homography_2d_kernel <<>>(res_a, image_a, transform_a); return res; } ... ``` I could use a hint where to put a test for this (e.g. doing a plain vanilla matrix multiplication with a custom kernel) and comparing with the aten mm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11373 Differential Revision: D9735573 Pulled By: ezyang fbshipit-source-id: 482b218a0d514e19a8b692bbc77c0e37082cfded --- aten/src/ATen/core/TensorAccessor.h | 131 +++++++++++++++--- aten/src/ATen/templates/Tensor.h | 16 +++ aten/src/ATen/test/CMakeLists.txt | 3 +- .../test/cuda_packedtensoraccessor_test.cu | 46 ++++++ 4 files changed, 179 insertions(+), 17 deletions(-) create mode 100644 aten/src/ATen/test/cuda_packedtensoraccessor_test.cu diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index e3a73a3cbea2e5..cbd130ed775d09 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -5,45 +5,144 @@ namespace at { +// The PtrTraits argument to the TensorAccessor/PackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#ifdef __CUDACC__ +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +#ifndef AT_HOSTDEVICE +#ifdef __CUDACC__ +#define AT_HOSTDEVICE __host__ __device__ +#define AT_HOST __host__ +#define AT_DEVICE __device__ +#else +#define AT_HOSTDEVICE +#define AT_HOST +#define AT_DEVICE +#endif +#endif -template +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we restrict ourselves +// to functions and types available there (e.g. IntList isn't). + +// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers. +template class PtrTraits = DefaultPtrTraits> class TensorAccessorBase { public: - TensorAccessorBase(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOSTDEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} - IntList sizes() { + AT_HOST IntList sizes() { return IntList(sizes_,N); } - IntList strides() { + AT_HOST IntList strides() { return IntList(strides_,N); } - int64_t stride(int64_t i) { return strides()[i]; } - int64_t size(int64_t i) { return sizes()[i]; } + AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } protected: - T * data_; + PtrType data_; const int64_t* sizes_; const int64_t* strides_; }; -template -class TensorAccessor : public TensorAccessorBase { +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `PackedTensorAccessor` is used on the host and only +// indexing on the device uses `TensorAccessor`s. 
+template class PtrTraits = DefaultPtrTraits> +class TensorAccessor : public TensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - TensorAccessor operator[](int64_t i) { + AT_HOSTDEVICE TensorAccessor operator[](int64_t i) { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } }; -template -class TensorAccessor : public TensorAccessorBase { +template class PtrTraits> +class TensorAccessor : public TensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - T & operator[](int64_t i) { + typedef typename PtrTraits::PtrType PtrType; + + AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + AT_HOSTDEVICE T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } +}; + + +// PackedTensorAccessorBase and PackedTensorAccessor are used on for CUDA `Tensor`s on the host +// and as +// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) +// in order to transfer them on the device when calling kernels. +// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s. +// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available +// on the device, so those functions are host only. 
+template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : data_(data_) + { + std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); + std::copy(strides_, strides_ + N, std::begin(this->strides_)); + } + AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } +protected: + PtrType data_; + int64_t sizes_[N]; + int64_t strides_[N]; +}; + +template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE TensorAccessor operator[](int64_t i) { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } +}; + +template class PtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE T & operator[](int64_t i) { return this->data_[this->strides_[0]*i]; } }; } + +#undef AT_HOSTDEVICE +#undef AT_HOST +#undef AT_DEVICE diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index b80e354b60b347..50e280be8f2fe9 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -194,6 +194,8 @@ struct AT_API Tensor { AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) #undef TO_C_TYPE + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); @@ -203,6 +205,20 @@ struct AT_API Tensor { template TensorAccessor accessor() && = delete; + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. 
+ template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + Tensor operator-() const; Tensor& operator+=(const Tensor & other); Tensor& operator+=(Scalar other); diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 4fcbeaa137ae78..8103f025988aab 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,7 +27,8 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu) + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu) if (CUDNN_FOUND) list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_test.cpp) diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu new file mode 100644 index 00000000000000..f1eb5cbbe4b1f3 --- /dev/null +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -0,0 +1,46 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/cuda/CUDAContext.h" + +#include + +using namespace at; + +__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a){ + for (int64_t i = 0; i < resa.size(0); i++) { + float val = 0.0f; + for (int64_t j = 0; j < t1a.size(1); j++) { + val += t1a[i][j] * t2a[j]; + } + resa[i] = val; + } +} + +TEST_CASE( "test PackedTensorAccessor and Tensor.packed_accessor", "[cuda]" ) { + manual_seed(123, at::kCPU); + manual_seed(123, at::kCUDA); + + Tensor t1 = rand({4, 4}, CUDA(kFloat)); + Tensor t2 = rand({4}, CUDA(kFloat)); + Tensor res = empty({4}, CUDA(kFloat)); + + auto t1a = t1.packed_accessor(); + auto t2a = t2.packed_accessor(); + auto resa = res.packed_accessor(); + + auto stream = at::cuda::getCurrentCUDAStream(); + + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); + cudaError_t err = cudaDeviceSynchronize(); + REQUIRE(err == cudaSuccess); + + auto expected = mv(t1, t2); + + REQUIRE(res.allclose(expected)); +} From 4db21a1d8e97dd087293988e46c7d43a7ac59ebe Mon Sep 17 00:00:00 2001 From: Lukasz Wesolowski Date: Tue, 11 Sep 2018 13:18:43 -0700 Subject: [PATCH 046/237] Optimize LengthsTileOp on GPU to run a kernel instead of a sequence of memcopies (#11413) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11413 LengthsTileOp was implemented using a sequence of device memcopies initiated on the CPU. This was very slow. I changed it to use a kernel. TUM benchmark QPS improved from 13k QPS to 20k QPS as a result. 
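In case the operator is unfamiliar: row i of DATA is simply repeated LENGTHS[i] times in the output, and the new implementation precomputes a per-output-row input offset on the host so that a single kernel can gather rows, instead of issuing one device memcopy per repeated row. A minimal CPU reference sketch of those semantics (the helper name and signature below are hypothetical, not the Caffe2 operator itself):

```cpp
// Reference semantics of LengthsTile, assuming row-major DATA of shape
// (N, row_size) and LENGTHS with N repeat counts. Illustrative only.
#include <cstdint>
#include <vector>

std::vector<float> lengths_tile_reference(
    const std::vector<float>& data,      // N * row_size values, row-major
    const std::vector<int32_t>& lengths, // N repeat counts, one per row
    int64_t row_size) {
  std::vector<float> out;
  for (size_t i = 0; i < lengths.size(); ++i) {
    for (int32_t j = 0; j < lengths[i]; ++j) {
      // repeat row i of DATA lengths[i] times in the output
      out.insert(out.end(),
                 data.begin() + i * row_size,
                 data.begin() + (i + 1) * row_size);
    }
  }
  return out;
}
```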
Reviewed By: manojkris, xianjiec Differential Revision: D9724988 fbshipit-source-id: 2f98c697730982734d7c6a26d0b6967310d49900 --- caffe2/operators/lengths_tile_op.cc | 44 +++++++ caffe2/operators/lengths_tile_op.cu | 110 ++++++++++++++++++ caffe2/operators/lengths_tile_op.h | 40 +------ caffe2/operators/lengths_tile_op_gpu.cc | 6 - .../operator_test/lengths_tile_op_test.py | 4 +- 5 files changed, 158 insertions(+), 46 deletions(-) create mode 100644 caffe2/operators/lengths_tile_op.cu delete mode 100644 caffe2/operators/lengths_tile_op_gpu.cc diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc index e832fe8723a0ce..d5af0a91bd65c0 100644 --- a/caffe2/operators/lengths_tile_op.cc +++ b/caffe2/operators/lengths_tile_op.cc @@ -1,6 +1,50 @@ #include "caffe2/operators/lengths_tile_op.h" namespace caffe2 { + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + // Context::CopyFrom and math::Sum need the same context to avoid race + // conditions + // why? CPUContext is not used in Sum + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, &total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); + auto src = static_cast(data.raw_data()); + auto out = static_cast(output->raw_mutable_data(data.meta())); + + for (TIndex i = 0; i < lengths_size; ++i) { + auto length = lengths_data[i]; + CAFFE_ENFORCE_GE(length, 0); + for (int32_t j = 0; j < length; ++j) { + context_.CopyBytesSameDevice(block_bytesize, src, out); + out += block_bytesize; + } + src += block_bytesize; + } + return true; +} + REGISTER_CPU_OPERATOR(LengthsTile, LengthsTileOp); OPERATOR_SCHEMA(LengthsTile) diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu new file mode 100644 index 00000000000000..aebb33c1460a56 --- /dev/null +++ b/caffe2/operators/lengths_tile_op.cu @@ -0,0 +1,110 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/lengths_tile_op.h" + +namespace caffe2 { + +template +__global__ void lengthsTileKernel( + int numElements, + int rowSize, + const T* input, + T* output, + const int32_t* inputRowOffsets) { + CUDA_1D_KERNEL_LOOP(i, numElements) { + auto outputRowIndex = i / rowSize; + auto inputBlockOffset = inputRowOffsets[outputRowIndex]; + auto indexInRow = i - outputRowIndex * rowSize; + output[i] = input[inputBlockOffset + indexInRow]; + } +} + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, 
&total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto numElementsPerRow = data.size_from_dim(1); + auto numElements = total_length * numElementsPerRow; + auto numBlocks = CAFFE_GET_BLOCKS(numElements); + + rowMappingHost_.Resize(total_length); + rowMappingDevice_.Resize(total_length); + auto* rowOffsets = rowMappingHost_.mutable_data(); + int32_t outputRow = 0; + for (TIndex i = 0; i < lengths_size; i++) { + auto length = lengths_data[i]; + for (int32_t j = 0; j < length; j++) { + rowOffsets[outputRow++] = i * numElementsPerRow; + } + } + + context_.CopyFromCPU( + total_length, + rowMappingHost_.data(), + rowMappingDevice_.mutable_data()); + context_.FinishDeviceComputation(); + + if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else { + CAFFE_THROW( + "LengthsTile operator only supports 32-bit float, int and int64_t" + " types but input was of type ", + data.meta().name()); + } + return true; +} + +REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); + +} // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op.h b/caffe2/operators/lengths_tile_op.h index c92adcb956d916..c9bd5ef500d996 100644 --- a/caffe2/operators/lengths_tile_op.h +++ b/caffe2/operators/lengths_tile_op.h @@ -13,44 +13,6 @@ class LengthsTileOp : public Operator { USE_SIMPLE_CTOR_DTOR(LengthsTileOp); bool RunOnDevice() override { - auto& data = Input(DATA); - auto& lengths = Input(LENGTHS); - auto* output = Output(0); - - CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); - CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); - CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); - - // Context::CopyFrom and math::Sum need the same context to avoid race - // conditions - // why? 
CPUContext is not used in Sum - lengths_host_.CopyFrom(lengths, &context_); - context_.FinishDeviceComputation(); - auto lengths_size = lengths_host_.size(); - auto* lengths_data = lengths_host_.data(); - - int32_t total_length = 0; - CPUContext cpuContext; - math::Sum( - lengths_size, lengths_data, &total_length, &cpuContext); - - auto shape = data.dims(); - shape[0] = total_length; - output->Resize(shape); - - auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto src = static_cast(data.raw_data()); - auto out = static_cast(output->raw_mutable_data(data.meta())); - - for (TIndex i = 0; i < lengths_size; ++i) { - auto length = lengths_data[i]; - CAFFE_ENFORCE_GE(length, 0); - for (int32_t j = 0; j < length; ++j) { - context_.CopyBytesSameDevice(block_bytesize, src, out); - out += block_bytesize; - } - src += block_bytesize; - } return true; } @@ -58,6 +20,8 @@ class LengthsTileOp : public Operator { private: Tensor lengths_host_{CPU}; + Tensor rowMappingHost_{CPU}; + Tensor rowMappingDevice_{Context::GetDeviceType()}; }; } // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op_gpu.cc b/caffe2/operators/lengths_tile_op_gpu.cc deleted file mode 100644 index 65ed44b735de43..00000000000000 --- a/caffe2/operators/lengths_tile_op_gpu.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/lengths_tile_op.h" - -namespace caffe2 { -REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); -} // namespace caffe2 diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index d37904c08ec08b..42ef87ac364f70 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -15,7 +15,7 @@ class TestLengthsTileOp(hu.HypothesisTestCase): @given( inputs=st.integers(min_value=1, max_value=20).flatmap( lambda size: st.tuples( - hu.arrays([size]), + hu.arrays([size], dtype=np.float32), hu.arrays([size], dtype=np.int32, elements=st.integers(min_value=0, max_value=20)), ) @@ -32,7 +32,7 @@ def lengths_tile_op(data, lengths): op = core.CreateOperator( "LengthsTile", ["data", "lengths"], - ["output"] + ["output"], ) self.assertReferenceChecks( From 17776db2ee8cb9248233c22f33810345859352b8 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 11 Sep 2018 13:29:34 -0700 Subject: [PATCH 047/237] Add gtest dependency on aten tests. (#11429) Summary: ezyang delivering my promise to you :) Basically, now aten tests can use gtest as part of our test harness unification effort. I also converted one test (atest.cpp) to show how one can do this. 
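The conversion itself is a mostly mechanical macro mapping; a small self-contained sketch of the pattern applied to atest.cpp (illustrative example only, not PyTorch code; link against gtest and gtest_main):

```cpp
// Catch2 -> gtest translation pattern, shown on hypothetical helpers.
#include <stdexcept>
#include "gtest/gtest.h"

static int answer() { return 42; }
static void boom() { throw std::runtime_error("boom"); }

// was: TEST_CASE("atest", "[]")
TEST(atest, basics) {
  EXPECT_EQ(answer(), 42);                       // was: REQUIRE(answer() == 42);
  EXPECT_FLOAT_EQ(0.5f, 1.0f / 2.0f);            // was: REQUIRE(0.5f == Approx(1.0f / 2.0f));
  ASSERT_THROW(boom(), std::exception);          // was: REQUIRE_THROWS(boom());
}
```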
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11429 Reviewed By: ezyang Differential Revision: D9762934 Pulled By: Yangqing fbshipit-source-id: 68ec3a748403c6bd88399b1e756200985a4e07e3 --- .jenkins/caffe2/test.sh | 15 ++++++------ aten/src/ATen/test/atest.cpp | 46 ++++++++++++++++++------------------ caffe2/CMakeLists.txt | 31 ++---------------------- 3 files changed, 33 insertions(+), 59 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index b0f9c41382601f..6ca0850f940968 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -62,19 +62,20 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do */mkl_utils_test|*/aten/integer_divider_test) continue ;; - */aten/*) - # ATen uses test framework Catch2 - # NB: We do NOT use the xml test reporter, because - # Catch doesn't support multiple reporters + *) + # Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). While + # planning to migrate to gtest as the common PyTorch c++ test suite, we + # currently do NOT use the xml test reporter, because Catch doesn't + # support multiple reporters # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 # which means that enabling XML output means you lose useful stdout # output for Jenkins. It's more important to have useful console # output than it is to have XML output for Jenkins. + # Note: in the future, if we want to use xml test reporter once we switch + # to all gtest, one can simply do: + # "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" "$test" ;; - *) - "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" - ;; esac done diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index dd37fa86af3a36..8dffa3d7c02c75 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -19,26 +18,26 @@ void trace() { trace += foo_a[i][i]; } - REQUIRE(foo.trace().toCFloat() == Approx(trace)); + EXPECT_FLOAT_EQ(foo.trace().toCFloat(), trace); } -TEST_CASE( "atest", "[]" ) { - +// TEST_CASE( "atest", "[]" ) { +TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); auto foo = rand({12,6}); - REQUIRE(foo.data() == foo.toFloatData()); + EXPECT_EQ(foo.data(), foo.toFloatData()); - REQUIRE(foo.size(0) == 12); - REQUIRE(foo.size(1) == 6); + EXPECT_EQ(foo.size(0), 12); + EXPECT_EQ(foo.size(1), 6); foo = foo+foo*3; foo -= 4; Scalar a = 4; float b = a.to(); - REQUIRE(b == 4); + EXPECT_EQ(b, 4); foo = (foo*foo) == (foo.pow(3)); foo = 2 + (foo+1); @@ -51,7 +50,7 @@ TEST_CASE( "atest", "[]" ) { } } - REQUIRE(foo.equal(4 * ones({12, 6}, kByte))); + EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); @@ -61,17 +60,18 @@ TEST_CASE( "atest", "[]" ) { auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); auto f_a = f.accessor(); - REQUIRE(f_a[0][0][0] == 1.0); - REQUIRE(f_a[0][1][1] == 5.0); + EXPECT_EQ(f_a[0][0][0], 1.0); + EXPECT_EQ(f_a[0][1][1], 5.0); - REQUIRE(f.strides()[0] == 6); - REQUIRE(f.strides()[1] == 3); - REQUIRE(f.strides()[2] == 1); - REQUIRE(f.sizes()[0] == 1); - REQUIRE(f.sizes()[1] == 2); - REQUIRE(f.sizes()[2] == 3); + EXPECT_EQ(f.strides()[0], 6); + EXPECT_EQ(f.strides()[1], 3); + EXPECT_EQ(f.strides()[2], 1); + EXPECT_EQ(f.sizes()[0], 1); + EXPECT_EQ(f.sizes()[1], 2); + EXPECT_EQ(f.sizes()[2], 3); - REQUIRE_THROWS(f.resize_({3,4,5})); + // TODO(ezyang): maybe do 
a more precise exception type. + ASSERT_THROW(f.resize_({3,4,5}), std::exception); { int isgone = 0; { @@ -79,7 +79,7 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } { int isgone = 0; @@ -90,9 +90,9 @@ TEST_CASE( "atest", "[]" ) { }); a_view = f2.view({3,2,1}); } - REQUIRE(isgone == 0); + EXPECT_EQ(isgone, 0); a_view.reset(); - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } if(at::hasCUDA()) { @@ -103,6 +103,6 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone==1); + EXPECT_EQ(isgone, 1); } } diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 8672f836d89c69..99aca34a860d79 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -32,9 +32,8 @@ if (NOT BUILD_ATEN_MOBILE) # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) - # ATen tests use catch instead of gtest so keep separate for now - # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) - # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) @@ -428,32 +427,6 @@ if (BUILD_TEST) endforeach() endif() - if (NOT USE_ROCM) - set(__aten_test_dir "test/aten") - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - endif() - endif() endif() if (BUILD_PYTHON) From 289a8c9b7def723b64b77dc32786252a9d35865b Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 11 Sep 2018 15:01:48 -0700 Subject: [PATCH 048/237] Allow train/eval, and non-Tensor arguments to python functions (#11505) Summary: This whitelists train/eval functions in script modules, and tests that nested nn.Modules still work. This also changes the code for calling python functions from script to allow non-tensor inputs/outputs. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11505 Differential Revision: D9765466 Pulled By: zdevito fbshipit-source-id: 1177bff931324422b69e18fa0bbaa82e3c98ec69 --- setup.py | 9 ++ test/test_jit.py | 94 ++++++++++++++++++++- torch/csrc/autograd/python_function.cpp | 29 +++++-- torch/csrc/jit/ir.cpp | 4 +- torch/csrc/jit/ir.h | 4 +- torch/csrc/jit/passes/onnx.cpp | 4 +- torch/csrc/jit/pybind_utils.h | 105 ++++++++++++------------ torch/csrc/jit/python_interpreter.cpp | 43 ++-------- torch/csrc/jit/python_ir.cpp | 23 ------ torch/csrc/jit/script/compiler.cpp | 1 - torch/csrc/jit/script/init.cpp | 37 +++------ torch/jit/__init__.py | 2 +- torch/jit/annotations.py | 18 +--- 13 files changed, 205 insertions(+), 168 deletions(-) diff --git a/setup.py b/setup.py index f99a2bf945c162..1ed87cc7819cf7 100644 --- a/setup.py +++ b/setup.py @@ -532,6 +532,14 @@ def load(filename): for f in ninja_files + cmake_files for entry in load(f)] + # cquery does not like c++ compiles that start with gcc. + # It forgets to include the c++ header directories. + # We can work around this by replacing the gcc calls that python + # setup.py generates with g++ calls instead + for command in all_commands: + if command['command'].startswith("gcc "): + command['command'] = "g++ " + command['command'][4:] + new_contents = json.dumps(all_commands, indent=2) contents = '' if os.path.exists('compile_commands.json'): @@ -540,6 +548,7 @@ def load(filename): if contents != new_contents: with open('compile_commands.json', 'w') as f: f.write(new_contents) + if not USE_NINJA: print("WARNING: 'develop' is not building C++ code incrementally") print("because ninja is not installed. Run this to enable it:") diff --git a/test/test_jit.py b/test/test_jit.py index 23b76d815d4f07..0c64a48f0bdcaa 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -460,6 +460,52 @@ def method(self, x): finally: torch.jit._enabled = True + def test_train_eval(self): + class Sub(nn.Module): + def forward(self, input): + if self.training: + return input + else: + return -input + + class MyModule(torch.jit.ScriptModule): + def __init__(self): + super(MyModule, self).__init__() + self.sub = Sub() + + @torch.jit.script_method + def forward(self, input): + return self.sub(input) + 1 + + m = MyModule() + input = torch.rand(3, 4) + self.assertEqual(input + 1, m(input)) + m.eval() + self.assertEqual(-input + 1, m(input)) + + def test_train_eval_const(self): + class MyModule(torch.jit.ScriptModule): + __constants__ = ['training'] + + def __init__(self): + super(MyModule, self).__init__() + # TODO: it is illegal to try to call + # eval/train because training has already + # been set. 
Consider allowing + # constants to be mutable until the end of __init__ + + @torch.jit.script_method + def forward(self, input): + if self.training: + x = 2 * input + else: + x = -input + return x + 1 + + m = MyModule() + input = torch.rand(3, 4) + self.assertEqual(2 * input + 1, m(input)) + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not @@ -1418,6 +1464,25 @@ def fn(x): fn(x) fn(y) + def test_python_function_tup(self): + class MyFn(Function): + @staticmethod + def forward(ctx, x): + return x + 1, x - 1 + + @staticmethod + def backward(ctx, grad_output): + return grad_output, grad_output + + @_trace(torch.zeros(2)) + def fn(x): + a, b = MyFn.apply(x + 2) + return a + b + 3 + x = torch.tensor([1., 2., 3.]) + y = torch.randn(2, 2, requires_grad=True) + fn(x) + fn(y) + def test_decompose_addmm(self): @torch.jit.script def addmm(mat, mat1, mat2, alpha, beta): @@ -4438,6 +4503,32 @@ def forward(self, x, seq_lens): f = io.BytesIO() torch.onnx._export(m, (x, seq_lens), f, verbose=False) + def test_python_call_non_tensor(self): + def foo(a, b, c): + # type: (Tensor, int, Tuple[Tensor, int]) -> Tuple[int, Tensor] + d, e = c + return b + e, a + d + + @torch.jit.script + def bar(): + x = torch.ones(3, 4) + a, b = foo(x, 3, (x, 3)) + return a, b + + self.assertEqual((6, torch.ones(3, 4) + 1), bar()) + + def test_python_call_non_tensor_wrong(self): + with self.assertRaisesRegex(RuntimeError, r"but instead got value of type tuple"): + def foo(): + # type: () -> Tensor + return ((3, 4),) + + @torch.jit.script + def bar(): + return foo() + + bar() + def test_tuples(self): @torch.jit.script def foo(i): @@ -5770,7 +5861,7 @@ def test_fn(): return foo(torch.full([1], 1), torch.full([1], 2), torch.full([1], 3)) def test_wrong_return_type(self): - with self.assertRaisesRegex(RuntimeError, 'Python functions can currently only return Tensors'): + with self.assertRaisesRegex(RuntimeError, 'but instead got value of type tuple'): def somefunc(): # type: () -> Tuple[Tuple[Tensor, Tensor]] return torch.zeros(3, 4), torch.zeros(4, 5) @@ -5778,6 +5869,7 @@ def somefunc(): @torch.jit.script def wrong_return_type(): return somefunc() + wrong_return_type() # Tests for calling between different front-end modes def test_call_python_fn_from_tracing_fn(self): diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 2301df22e1a661..ace53fdfb0aed9 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -575,9 +575,9 @@ static Node* _trace_pre_record( for (int i = 0; i < num_args; i++) { PyObject *arg_object = PyTuple_GET_ITEM(input_objects, i); if (THPVariable_Check(arg_object)) { - arg_types.push_back('t'); + arg_types.push_back('d'); } else { - arg_types.push_back('s'); + arg_types.push_back('c'); Py_INCREF(arg_object); scalar_args.emplace_back(arg_object); } @@ -597,21 +597,34 @@ static void _trace_post_record( PyObject* op_obj, const variable_list& input_vars, PyObject *output_objects, - bool is_inplace) { + bool is_inplace, + bool unpack_output) { if (!jit::tracer::isTracing()) { return; } + node->i_(attr::inplace, is_inplace); + // Isolate C variable ptrs in a vector int num_outputs = PyTuple_GET_SIZE(output_objects); variable_list output_vars(num_outputs); + auto graph = node->owningGraph(); + node->addOutput(); + if (!unpack_output) { + std::vector tuple_values(num_outputs, DynamicType::get()); + TypePtr 
tuple_type = TupleType::create(std::move(tuple_values)); + node->output()->setType(tuple_type); + auto unpacked = graph->createTupleUnpack(node->output())->insertAfter(node); + node = unpacked; + } for (int i = 0; i < num_outputs; ++i) { auto var = (THPVariable*)PyTuple_GET_ITEM(output_objects, i); - jit::tracer::addOutput(node, var->cdata); + Value* value = node->outputs()[i]; + if (var->cdata.defined()) { + value->inferTypeFrom(var->cdata); + jit::tracer::setValueTrace(autograd::as_variable_ref(var->cdata), value); + } } - - node->i_(attr::inplace, is_inplace); - } PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const UnpackedInput& unpacked, @@ -637,7 +650,7 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked bool is_inplace = static_cast(grad_fn->dirty_tensors); _wrap_outputs(grad_fn, inputs, raw_output, outputs, is_executable); - _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace); + _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output); if (is_executable) { _save_variables(grad_fn); } else { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 48da3ec9cea913..4cc59e8b9bb8e1 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -331,9 +331,9 @@ void Node::lint() const { IR_ELSEIFM_CONST(PythonOp) size_t n_scalars = 0, n_tensors = 0; for (auto c : value->cconv) { - if (c == 's') { + if (c == 'c') { n_scalars++; - } else if (c == 't') { + } else if (c == 'd') { n_tensors++; } else { JIT_ASSERT(0); diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 9ac5059a2d801e..ae3c8191922c8c 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -1391,8 +1391,8 @@ struct PythonOp : public Node { // TraceInterpreterState for execution semantics. THPObjectPtr pyobj; // The calling convention for the Python function. - // 's' -- python scalar argument - // 't' -- tensor argument + // 'c' -- constant argument + // 'd' -- dynamic argument std::string cconv; // Scalar arguments to the Python function. Not necessarily passed to // the function in this order; see cconv for the correct order. diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index 75fb063c761a31..6efcd11bfaadf3 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -156,10 +156,10 @@ void BlockToONNX(Block* old_block, Block* new_block, ::torch::onnx::OperatorExpo auto scalar_it = op->scalar_args.begin(); for (auto arg_type : op->cconv) { py::object obj; - if (arg_type == 's') { + if (arg_type == 'c') { JIT_ASSERTM(scalar_it != op->scalar_args.end(), "expected too many scalar args"); obj = py::reinterpret_borrow(py::handle((scalar_it++)->get())); - } else if (arg_type == 't') { + } else if (arg_type == 'd') { JIT_ASSERTM(node_it != inputs.end(), "expected too many inputs"); obj = py::cast(envFn(*node_it++)); } else { diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 4b76872cb3f967..b9f9e6bb85fd4e 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -18,6 +18,12 @@ namespace torch { namespace jit { namespace detail { + +// error reporting: when reporting user-caused errors, these functions should +// not use AT_ERROR macros, since these macros add stack trace information +// that is confusing to display to the end user since it always reports +// locations in libtorch code rather than user code. 
+ inline void findErrorInKwargs( const FunctionSchema& schema, py::kwargs kwargs) { @@ -26,22 +32,22 @@ inline void findErrorInKwargs( // any argument in the schema. for (const auto& kwarg : kwargs) { const auto key = py::cast(kwarg.first); - AT_CHECK( - std::count_if( + if(!std::count_if( arguments.begin(), arguments.end(), - [&key](const Argument& argument) { return argument.name == key; }), - "Unknown keyword argument '", key, "' for operator '", - schema.name, "'. Schema: ", schema); + [&key](const Argument& argument) { return argument.name == key; })) { + throw std::runtime_error(at::str("Unknown keyword argument '", key, "' for operator '", + schema.name, "'. Schema: ", schema)); + } } // If there are unconsumed kwargs but none of them were unknown, the first // positional argument present in the kwargs is duplicated. for (const auto& argument : arguments) { if (kwargs.contains(argument.name.c_str())) { AT_ASSERT(!argument.default_value); - AT_ERROR( + throw std::runtime_error(at::str( "Argument '", argument.name, "' specified both as positional and ", - "keyword argument. Schema: ", schema); + "keyword argument. Schema: ", schema)); } } } @@ -78,19 +84,6 @@ inline IValue createGenericList(py::handle obj, const TypePtr& elem_type) { return ConstantList::create(std::move(elems)); } -struct ConvertError : public std::exception { - ConvertError(std::string msg) - : msg_(std::move(msg)) {} - const char* what() const noexcept override { - return msg_.c_str(); - } -private: - std::string msg_; -}; - -#define TORCH_CONVERT_ERROR(...) \ - throw ConvertError(at::str(__VA_ARGS__)) - inline IValue toIValue(py::handle obj, const TypePtr& type) { switch (type->kind()) { case TypeKind::DynamicType: @@ -104,11 +97,14 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { case TypeKind::NoneType: return {}; case TypeKind::TupleType: { + if(!PyTuple_Check(obj.ptr())) + throw py::cast_error(); // note: the py::cast does not throw cast_error + // because it attempts to iterate a non-tuple py::tuple tuple = py::cast(obj); size_t tuple_size = tuple.size(); const auto & elem_types = type->cast()->elements(); if (elem_types.size() != tuple_size) { - TORCH_CONVERT_ERROR("Expected ", elem_types.size(), " tuple elements for argument, but got ", tuple_size); + throw py::cast_error(); } std::vector values; values.reserve(tuple_size); @@ -134,11 +130,10 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { } } case TypeKind::NumberType: - TORCH_CONVERT_ERROR("Insufficient type information to convert input"); case TypeKind::GeneratorType: - TORCH_CONVERT_ERROR("Generators are not supported yet."); + break; } - AT_ERROR("Missing cases in toIValue! File a bug report."); + AT_ERROR("Missing cases in toIValue for type: ", type->str(), "! 
File a bug report."); } inline IValue argumentToIValue( @@ -149,21 +144,28 @@ inline IValue argumentToIValue( try { return toIValue(object, argument.type); } catch (const py::cast_error& error) { - AT_ERROR( + throw std::runtime_error(at::str( schema.name, "() expected value of type ", argument.type->str(), " for argument '", argument.name, "' in position ", argumentPosition, ", but instead got value of type ", - py::str(object.get_type().attr("__name__")), - ".\nDeclaration: ", schema); - } catch (const ConvertError& error) { - AT_ERROR( - schema.name, "(): ", error.what(), - "\n for argument '", argument.name, - "' in position ", argumentPosition, - ", but instead got value of type ", - py::str(object.get_type().attr("__name__")), - ".\nDeclaration: ", schema); + py::str(object.get_type().attr("__name__")), ".", + "\nValue: ", py::repr(object), + "\nDeclaration: ", schema)); + } +} + +inline IValue returnToIValue( + const TypePtr& type, + py::handle object) { + try { + return toIValue(object, type); + } catch (const py::cast_error& error) { + throw std::runtime_error(at::str( + " expected value of type ", type->str(), + " for return value but instead got value of type ", + py::str(object.get_type().attr("__name__")), ".", + "\nValue: ", py::repr(object))); } } @@ -199,12 +201,12 @@ inline Stack createStackForSchema( const FunctionSchema& schema, py::args args, py::kwargs kwargs = py::kwargs()) { - AT_CHECK( - args.size() + kwargs.size() <= schema.arguments.size(), - schema.name, "() expected at most ", schema.arguments.size(), - " argument(s) but received ", - args.size() + kwargs.size(), " argument(s). Declaration: ", schema); - + if(args.size() + kwargs.size() > schema.arguments.size()) { + throw std::runtime_error(at::str( + schema.name, "() expected at most ", schema.arguments.size(), + " argument(s) but received ", + args.size() + kwargs.size(), " argument(s). Declaration: ", schema)); + } Stack stack; stack.reserve(schema.arguments.size()); @@ -226,9 +228,9 @@ inline Stack createStackForSchema( } else if (arg.default_value) { push(stack, *arg.default_value); } else { - AT_ERROR( + throw std::runtime_error(at::str( schema.name, "() is missing value for argument '", arg.name, - "'. Declaration: ", schema); + "'. Declaration: ", schema)); } } @@ -285,18 +287,13 @@ inline py::object invokeOperatorFromPython( const Operator& op, py::args args, py::kwargs kwargs) { - try { - // Create a stack full of the arguments and keyword arguments. - auto stack = - createStackForSchema(op.schema(), std::move(args), std::move(kwargs)); + // Create a stack full of the arguments and keyword arguments. + auto stack = + createStackForSchema(op.schema(), std::move(args), std::move(kwargs)); - // Invoke the operation, which puts the return values onto the stack. - op.getOperation()(stack); + // Invoke the operation, which puts the return values onto the stack. + op.getOperation()(stack); - return createPyObjectForStack(std::move(stack)); - } catch (const at::Error& error) { - // We don't want to show the backtrace in the error message in Python. 
- throw std::runtime_error(error.what_without_backtrace()); - } + return createPyObjectForStack(std::move(stack)); } }} // namespace torch::jit diff --git a/torch/csrc/jit/python_interpreter.cpp b/torch/csrc/jit/python_interpreter.cpp index 86bd4dfaf96733..5cc4b70b2e9637 100644 --- a/torch/csrc/jit/python_interpreter.cpp +++ b/torch/csrc/jit/python_interpreter.cpp @@ -10,6 +10,7 @@ #include "torch/csrc/jit/custom_operator.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/pybind_utils.h" #include "torch/csrc/variable_tensor_functions.h" @@ -31,9 +32,10 @@ Operation createPythonOperation(Node* op_) { py::function func = py::reinterpret_borrow(py::handle(op->pyobj.get())); size_t num_inputs = 0; for(auto arg_type : op->cconv) { - if(arg_type == 't') + if(arg_type == 'd') num_inputs++; } + JIT_ASSERT(op->outputs().size() == 1); return [=](Stack & stack) { AutoGIL gil; py::tuple py_inputs(op->cconv.size()); @@ -41,47 +43,18 @@ Operation createPythonOperation(Node* op_) { size_t next_scalar = 0; size_t next_tensor = 0; for (auto arg_type : op->cconv) { - if (arg_type == 's') { + if (arg_type == 'c') { py_inputs[i] = py::reinterpret_borrow( op->scalar_args[next_scalar++].get()); - } else if (arg_type == 't') { - auto var = std::move(peek(stack, next_tensor, num_inputs)).toTensor(); - py_inputs[i] = - py::reinterpret_steal(THPVariable_Wrap(var)); + } else if (arg_type == 'd') { + py_inputs[i] = toPyObject(std::move(peek(stack, next_tensor, num_inputs))); next_tensor++; } i++; } drop(stack, num_inputs); - py::object py_outputs(func(*py_inputs)); - - auto num_outputs = op->outputs().size(); - auto addOutput = [&](py::handle entry) { - if (!THPVariable_Check(entry.ptr())) { - throw std::runtime_error( - "Function application returned a non-Variable output"); - } - THPVariable* var = (THPVariable*)entry.ptr(); - auto cdata = var->cdata; - stack.push_back(std::move(cdata)); - }; - - if (!PyTuple_Check(py_outputs.ptr())) { - if (num_outputs != 1) { - throw std::runtime_error( - "Function.apply returned the wrong number of outputs."); - } - addOutput(py_outputs); - } else { - auto output_tuple = py::tuple(py_outputs); - if (output_tuple.size() != num_outputs) { - throw std::runtime_error( - "Function application returned the wrong number of outputs."); - } - for (py::handle entry : py::tuple(py_outputs)) { - addOutput(entry); - } - } + py::object py_output(func(*py_inputs)); + stack.push_back(returnToIValue(op->output()->type(), py_output)); return 0; }; } diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 11ec332251b6c5..c745f9f6d8f122 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -180,29 +180,6 @@ void initPythonIRBindings(PyObject * module_) { py::arg("defer_weight_export")=false, py::arg("operator_export_type")=::torch::onnx::OperatorExportTypes::ONNX, py::arg("google_printer")=false) - .def("wrapPyFuncWithSymbolic", [](Graph &g, py::function func, std::vector inputs, size_t n_outputs, py::function symbolic) { - // This function should be used for situations where we have a Python function - // that should have different behavior when exporting for JIT interpreter - // execution v.s. for ONNX export. For example, nn.utils.rnn.pack_padded_sequence - // emits a placeholder under ONNX export, but we want to keep the ability to - // run this in the interpreter, thus we emit a PythonOp for that use case. 
- - // Concretely, this function emits a PythonOp wrapping the passed-in - // parameter `func`, while storing the function `symbolic` for use by the - // ONNX export - std::string cconv(inputs.size(), 't'); - func.attr("symbolic") = symbolic; - Node* new_node = g.insertNode(g.createPythonOp( - THPObjectPtr(func.release().ptr()), cconv, {})); - for (auto i : inputs) - new_node->addInput(i); - std::vector outputs; - for (size_t i = 0; i < n_outputs; ++i) - new_node->addOutput(); - auto sl = std::make_shared(tracer::getPythonInterpreterStackTrace()); - new_node->setSourceLocation(sl); - return py::make_iterator(new_node->outputs().begin(), new_node->outputs().end()); - }, py::return_value_policy::reference_internal) .def("inputs",[](Graph &g) { return py::make_iterator(g.inputs().begin(), g.inputs().end()); }) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 988ac6e1139fb0..a04e4ae0fc3f03 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -12,7 +12,6 @@ #include "ATen/core/optional.h" - #include #include diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 2afc56df8c2c4e..9f8185621a1c0e 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -69,18 +69,15 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { // introspection. size_t actual_n_args = n_args; if (!signature.is_none()) { - std::vector arg_types, ret_types; - std::tie(arg_types, ret_types) = py::cast, std::vector>>(signature); + std::vector arg_types; + TypePtr ret_type; + std::tie(arg_types, ret_type) = py::cast, TypePtr>>(signature); args.reserve(arg_types.size()); size_t idx = 0; // Fake argument names by putting in the index for (auto &arg_type : arg_types) { args.push_back(Argument(std::to_string(idx++), std::move(arg_type), {}, {}, false)); } - rets.reserve(ret_types.size()); - idx = 0; - for (auto &ret_type : ret_types) { - rets.push_back(Argument(std::to_string(idx++), std::move(ret_type), {}, {}, false)); - } + rets.push_back(Argument("0", std::move(ret_type), {}, {}, false)); } else { // Create a default signature using what information we have @@ -99,10 +96,12 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { for (size_t i=0; i < actual_n_args; ++i) { args.push_back(Argument(std::to_string(i), DynamicType::get(), {}, {}, false)); } - rets.reserve(n_binders); - for (size_t i = 0; i < n_binders; ++i) { - rets.push_back(Argument(std::to_string(i), DynamicType::get(), {}, {}, false)); + TypePtr ret_type = DynamicType::get(); + if(n_binders != 1) { + std::vector tuple_values(n_binders, ret_type); + ret_type = TupleType::create(std::move(tuple_values)); } + rets.push_back(Argument("0", ret_type, {}, {}, false)); } return FunctionSchema("", std::move(args), std::move(rets)); } @@ -120,27 +119,17 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { // Release the function object so we can wrap it in a PythonOp py::object func = self; - std::string cconv(inputs.size(), 't'); + std::string cconv(inputs.size(), 'd'); Node* new_node = m.graph()->insertNode(m.graph()->createPythonOp( THPObjectPtr(func.release().ptr()), cconv, {})); new_node->setSourceLocation(std::make_shared(loc)); for(auto &i : *all_inputs) new_node->addInput(i); - // This is really dumb, but relaxing the constraints on return types would - // require us to change the implementation of PythonOps in the interpreter. 
- // Note that this effectively makes the return type of Tuple[Tensor] and Tensor - // equivalent, but the PythonOp impl ends with an optional tuple unpack, so we need - // to do it. - for (auto & ret_arg : schema.returns) { - if (!ret_arg.type->isSubtypeOf(DynamicType::get())) { - throw ErrorReport(loc) << "Python functions can currently only return Tensors"; - } - } - std::vector outputs; - for(size_t i = 0; i < schema.returns.size(); ++i) - outputs.push_back(new_node->addOutput()); + for(auto & ret_arg : schema.returns) { + outputs.push_back(new_node->addOutput()->setType(ret_arg.type)); + } return std::make_shared(packOutputs(*m.graph(), outputs)); } diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index dd497d5c50da90..c31504ad83ab22 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -998,7 +998,7 @@ def _get_methods(cls): '_named_members', 'parameters', 'named_parameters', 'buffers', 'named_buffers', 'children', 'named_children', 'modules', 'named_modules', 'zero_grad', 'share_memory', '_get_name', 'extra_repr', - '_slow_forward', '_tracing_name' + '_slow_forward', '_tracing_name', 'eval', 'train', } diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index f5d824130f4167..6930d61ed5a64e 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -105,16 +105,6 @@ def get_num_params(fn): return num_params -def flatten_return_type(type): - if isinstance(type, TupleType): - return_types = [] - for elem_type in type.elements(): - return_types.append(elem_type) - return return_types - else: - return [type] - - def parse_type_line(type_line): """Parses a type annotation specified as a comment. @@ -138,9 +128,7 @@ def parse_type_line(type_line): raise RuntimeError("Failed to parse the return type of a type annotation") arg_types = [ann_to_type(ann) for ann in arg_ann] - ret_types = flatten_return_type(ann_to_type(ret_ann)) - - return arg_types, ret_types + return arg_types, ann_to_type(ret_ann) def get_type_line(source): @@ -191,8 +179,8 @@ def as_ann(ann): arg_types = [ann_to_type(as_ann(p.annotation)) for p in sig.parameters.values()] - return_types = flatten_return_type(ann_to_type(as_ann(sig.return_annotation))) - return arg_types, return_types + return_type = ann_to_type(as_ann(sig.return_annotation)) + return arg_types, return_type def ann_to_type(ann): From 3a8e39b21506752f77c24229008a48a0e33a66c9 Mon Sep 17 00:00:00 2001 From: Roger-luo Date: Tue, 11 Sep 2018 15:54:18 -0700 Subject: [PATCH 049/237] Support load and store between Py_complex and std::complex (#11493) Summary: Printing for complex numbers requires loading and storing between `Py_complex` and `std::complex`. This patch aims to support this for the plugin. 
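For reference, a minimal, self-contained sketch (not taken from this patch; the helper names are hypothetical) of the round trip the new utilities rely on, assuming only the CPython C API and `<complex>`. `Py_complex` and `std::complex<double>` are each a pair of adjacent doubles, which is what makes the cheap conversion below possible:
```
#include <Python.h>
#include <complex>
#include <stdexcept>

// Python complex -> C++ complex (the "load" direction).
std::complex<double> unpack_complex(PyObject* obj) {
  Py_complex c = PyComplex_AsCComplex(obj);
  // On failure CPython returns real == -1.0 and sets a Python error.
  if (c.real == -1.0 && PyErr_Occurred()) {
    throw std::runtime_error("expected a Python complex number");
  }
  return std::complex<double>(c.real, c.imag);
}

// C++ complex -> Python complex (the "store"/printing direction).
PyObject* pack_complex(const std::complex<double>& z) {
  Py_complex c;
  c.real = z.real();
  c.imag = z.imag();
  return PyComplex_FromCComplex(c);  // returns a new reference
}
```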
Differential Revision: D9771808 Pulled By: ezyang fbshipit-source-id: 024865f1945d63ddb5efc775a35438c8ea06408e --- torch/csrc/utils/python_numbers.h | 9 +++++++++ torch/csrc/utils/python_scalars.h | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/torch/csrc/utils/python_numbers.h b/torch/csrc/utils/python_numbers.h index 6a292e215108bb..0d3fa2f5cc37d6 100644 --- a/torch/csrc/utils/python_numbers.h +++ b/torch/csrc/utils/python_numbers.h @@ -126,3 +126,12 @@ inline double THPUtils_unpackDouble(PyObject* obj) { } return value; } + +inline std::complex THPUtils_unpackComplexDouble(PyObject *obj) { + Py_complex value = PyComplex_AsCComplex(obj); + if (value.real == -1.0 && PyErr_Occurred()) { + throw python_error(); + } + + return std::complex(value.real, value.imag); +} diff --git a/torch/csrc/utils/python_scalars.h b/torch/csrc/utils/python_scalars.h index 820e6d2776596f..85ed0a0a1bc88f 100644 --- a/torch/csrc/utils/python_scalars.h +++ b/torch/csrc/utils/python_scalars.h @@ -20,6 +20,8 @@ inline void store_scalar(void* data, at::ScalarType scalarType, PyObject* obj) { break; case at::kFloat: *(float*)data = (float)THPUtils_unpackDouble(obj); break; case at::kDouble: *(double*)data = THPUtils_unpackDouble(obj); break; + case at::kComplexFloat: *(std::complex*)data = (std::complex)THPUtils_unpackComplexDouble(obj); break; + case at::kComplexDouble: *(std::complex*)data = THPUtils_unpackComplexDouble(obj); break; default: throw std::runtime_error("invalid type"); } } @@ -34,6 +36,8 @@ inline PyObject* load_scalar(void* data, at::ScalarType scalarType) { case at::kHalf: return PyFloat_FromDouble(at::convert(*(at::Half*)data)); case at::kFloat: return PyFloat_FromDouble(*(float*)data); case at::kDouble: return PyFloat_FromDouble(*(double*)data); + case at::kComplexFloat: return PyComplex_FromCComplex(*reinterpret_cast((std::complex*)data)); + case at::kComplexDouble: return PyComplex_FromCComplex(*reinterpret_cast((std::complex*)data)); default: throw std::runtime_error("invalid type"); } } From 3a39006d38e797f0a9aea2edb1b2846ec81c7c30 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Tue, 11 Sep 2018 16:25:15 -0700 Subject: [PATCH 050/237] Fix some more doc Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11531 Differential Revision: D9776541 Pulled By: SsnL fbshipit-source-id: 8725485639ea6e9479b6ea95a49f5b75a9457db7 --- torch/_tensor_docs.py | 39 +++++++++++++++++++++++---------------- torch/_torch_docs.py | 6 ++++-- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index c3039583b04fb7..151b0dee218876 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -482,7 +482,7 @@ def add_docstr_all(method, docstr): .. math:: - f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - median)^2 + \sigma^2} + f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - \text{median})^2 + \sigma^2} """) add_docstr_all('ceil', @@ -1229,17 +1229,18 @@ def add_docstr_all(method, docstr): In-place version of :meth:`~Tensor.log2` """) -add_docstr_all('log_normal_', u""" +add_docstr_all('log_normal_', r""" log_normal_(mean=1, std=2, *, generator=None) Fills :attr:`self` tensor with numbers samples from the log-normal distribution -parameterized by the given mean (\u00B5) and standard deviation (\u03C3). -Note that :attr:`mean` and :attr:`stdv` are the mean and standard deviation of -the underlying normal distribution, and not of the returned distribution: +parameterized by the given mean :math:`\mu` and standard deviation +:math:`\sigma`. 
Note that :attr:`mean` and :attr:`std` are the mean and +standard deviation of the underlying normal distribution, and not of the +returned distribution: .. math:: - f(x) = \\dfrac{1}{x \\sigma \\sqrt{2\\pi}}\ e^{-\\dfrac{(\\ln x - \\mu)^2}{2\\sigma^2}} + f(x) = \dfrac{1}{x \sigma \sqrt{2\pi}}\ e^{-\frac{(\ln x - \mu)^2}{2\sigma^2}} """) add_docstr_all('logsumexp', @@ -1745,13 +1746,16 @@ def callable(a, b) -> number r""" reshape(*shape) -> Tensor -Returns a tensor with the same data and number of elements as :attr:`self`, -but with the specified shape. +Returns a tensor with the same data and number of elements as :attr:`self` +but with the specified shape. This method returns a view if :attr:`shape` is +compatible with the current shape. See :meth:`torch.Tensor.view` on when it is +possible to return a view. + +See :func:`torch.reshape` Args: shape (tuple of ints or int...): the desired shape -See :func:`torch.reshape` """) add_docstr_all('reshape_as', @@ -1760,8 +1764,10 @@ def callable(a, b) -> number Returns this tensor as the same shape as :attr:`other`. ``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``. +This method returns a view if ``other.sizes()`` is compatible with the current +shape. See :meth:`torch.Tensor.view` on when it is possible to return a view. -Please see :meth:`~Tensor.reshape` for more information about ``reshape``. +Please see :meth:`reshape` for more information about ``reshape``. Args: other (:class:`torch.Tensor`): The result tensor has the same shape @@ -2524,10 +2530,10 @@ def callable(a, b) -> number add_docstr_all('view', r""" -view(*args) -> Tensor +view(*shape) -> Tensor Returns a new tensor with the same data as the :attr:`self` tensor but of a -different size. +different :attr:`shape`. The returned tensor shares the same data and must have the same number of elements, but may have a different size. For a tensor to be viewed, the new @@ -2538,13 +2544,14 @@ def callable(a, b) -> number .. math:: - stride[i] = stride[i+1] \times size[i+1] + \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1] -Otherwise, :func:`contiguous` needs to be called before the tensor can be -viewed. +Otherwise, :meth:`contiguous` needs to be called before the tensor can be +viewed. See also: :meth:`reshape`, which returns a view if the shapes are +compatible, and copies (equivalent to calling :meth:`contiguous`) otherwise. Args: - args (torch.Size or int...): the desired size + shape (torch.Size or int...): the desired size Example:: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index f66a4e1c17d5bc..19a1fcbc625003 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3346,7 +3346,7 @@ def parse_kwargs(desc): inv = (u^T u)^{-1} If :attr:`upper` is ``False``, :attr:`u` is lower triangular -such that the tensor value is +such that the returned tensor is .. math:: inv = (uu^{T})^{-1} @@ -3993,6 +3993,8 @@ def parse_kwargs(desc): with compatible strides can be reshaped without copying, but you should not depend on the copying vs. viewing behavior. +See :meth:`torch.Tensor.view` on when it is possible to return a view. + A single dimension may be -1, in which case it's inferred from the remaining dimensions and the number of elements in :attr:`input`. @@ -5308,7 +5310,7 @@ def parse_kwargs(desc): .. 
math:: out_i = \begin{cases} - x_i & \text{if } condition_i \\ + x_i & \text{if } \text{condition}_i \\ y_i & \text{otherwise} \\ \end{cases} From b6b0b5222db91309fbdb4e4b2ff40548216f06b6 Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 11 Sep 2018 17:24:44 -0700 Subject: [PATCH 051/237] fix missing libnccl.so.1 error (#11553) Summary: what it says on the tin. I broke the build in https://github.com/pytorch/pytorch/pull/11487 but contbuild didn't end up catching it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11553 Differential Revision: D9781557 Pulled By: soumith fbshipit-source-id: 2a1fa314af4b85b5491d74110bfee3d80599aa95 --- tools/build_pytorch_libs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 579cde711eae43..e0593badd97614 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -237,7 +237,7 @@ function build_nccl() { fi ${CMAKE_INSTALL} -j"$MAX_JOBS" mkdir -p ${INSTALL_DIR}/lib - $SYNC_COMMAND "lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so.1" + find lib -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "${INSTALL_DIR}/lib/" if [ ! -f "${INSTALL_DIR}/lib/libnccl.so" ]; then ln -s "${INSTALL_DIR}/lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so" fi @@ -312,8 +312,8 @@ function build_caffe2() { # This is needed by the aten tests built with caffe2 if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then - # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl - $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl + $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" fi ${CMAKE_INSTALL} -j"$MAX_JOBS" From 8b196d671b1a2be26fd6fe73f55aa45fb8a91e1e Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 17:41:28 -0700 Subject: [PATCH 052/237] Allow tracing random functions (only when using default generators) (#11539) Summary: Fixes #11504. 
zdevito, neerajprad, fritzo Pull Request resolved: https://github.com/pytorch/pytorch/pull/11539 Differential Revision: D9777897 Pulled By: apaszke fbshipit-source-id: 56983260f5b93da7d5540a6242769ea7bd50eb06 --- test/test_jit.py | 23 +++++++++++++++++++ tools/jit/gen_jit_dispatch.py | 10 ++++---- torch/csrc/jit/constants.cpp | 4 ++++ torch/csrc/jit/ir.h | 4 +++- .../csrc/jit/passes/constant_propagation.cpp | 2 +- torch/csrc/jit/register_prim_ops.cpp | 2 +- torch/csrc/jit/tracer.cpp | 9 +++++++- 7 files changed, 44 insertions(+), 10 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 0c64a48f0bdcaa..68aaf4df5d3afd 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1118,6 +1118,17 @@ def fn(x, y): self.assertExpectedGraph(traced_fn.graph) self.assertExportImport(traced_fn.graph, (x, y)) + def test_trace_random(self): + def f(mean, std): + return torch.normal(mean, std) + + traced = torch.jit.trace(f, (torch.zeros(2, 3), torch.ones(2, 3)), check_trace=False) + mean, std = torch.zeros(5, 5), torch.ones(5, 5) + with torch.random.fork_rng(devices=[]): + output = f(mean, std) + traced_output = traced(mean, std) + self.assertEqual(output, traced_output) + def test_trace_tensor_factory(self): def run(**kwargs): inputs_require_grads = kwargs.pop('inputs_require_grads', True) @@ -2563,6 +2574,18 @@ def func(x): x = torch.rand(10, dtype=torch.float, requires_grad=True) self.checkScript(func, [x], optimize=True) + def test_random(self): + @torch.jit.script + def f(mean, std): + return torch.normal(mean, std) + + mean, std = torch.zeros(5, 5), torch.ones(5, 5) + with torch.random.fork_rng(devices=[]): + output = torch.normal(mean, std) + with torch.random.fork_rng(devices=[]): + script_output = f(mean, std) + self.assertEqual(output, script_output) + def _check_code(self, code_str, fn_name, inputs): scope = {} exec(code_str, globals(), scope) diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 2f24986c39cdc3..67bba43913ef16 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -68,6 +68,7 @@ def jit_type_of(arg): 'double': '{}.toDouble()', 'int64_t': '{}.toInt()', 'std::string': '{}.toString()->string()', + 'Generator': 'nullptr', 'std::array': 'as_bool_array<2>({}.toIntList()->elements())', 'std::array': 'as_bool_array<3>({}.toIntList()->elements())', 'std::array': 'as_bool_array<4>({}.toIntList()->elements())', @@ -197,12 +198,9 @@ def emit_decl_variant(decl): real_inputs = 0 for arg in decl['arguments']: - if arg['simple_type'] in default_only_types: - arguments.append(arg['default']) - else: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) - arguments.append(from_ivalue(arg, value)) - real_inputs += 1 + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) + arguments.append(from_ivalue(arg, value)) + real_inputs += 1 call = get_invocation(decl, arguments, num_inputs) diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 8bf1ab566b31a9..4cdb193d8434d8 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -35,6 +35,10 @@ Value* insertConstant( } else if(val.isString()) { n->s_(attr::value, val.toString()->string()); n->output()->setType(StringType::get()); + } else if(val.isNone()) { + n->destroy(); + n = g.create(prim::None); + n->output()->setType(NoneType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); } diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 
ae3c8191922c8c..5e70adfe400916 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -993,7 +993,9 @@ friend struct Block; return create(prim::Undefined); } Node * createNoneGenerator() { - return create(prim::NoneGenerator); + auto n = create(prim::NoneGenerator); + n->output()->setType(GeneratorType::get()); + return n; } Node * createFusionGroup(int device) { auto n = create(prim::FusionGroup, 0); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index bfd8ec9b9f1764..e83971bfaf9dfb 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -19,7 +19,6 @@ std::unordered_set skip_list = { //FIXME treated as having side effects but ONNX depends on them being removed prim::Print, //all the rand functions from native_functions.yaml - aten::permute, aten::rand, aten::rand_out, aten::rand_like, @@ -33,6 +32,7 @@ std::unordered_set skip_list = { aten::randperm_out, prim::Constant, prim::Undefined, + prim::NoneGenerator, // TODO (zach): we should consider skipping tensor factories in the cases // where the constant tensor would be large but cheap to create. }; diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 7e70550a38094b..67274f8a9192d6 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -146,7 +146,7 @@ RegisterOperators reg({ prim::NoneGenerator, [](Node* node) { return [](Stack& stack) { - stack.push_back(at::Tensor()); + stack.emplace_back(); return 0; }; }), diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 4f496420956b0b..0f8d65dcc5873a 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -42,7 +42,14 @@ void addInputs(Node *n, const char * name, const at::Scalar& value) { detail::g void addInputs(Node *n, const char * name, const std::string& value) { detail::genericAddInput(n, value); } void addInputs(Node *n, const char * name, const at::Tensor& value) { n->addInput(getValueTrace(value)); } void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(value); } -void addInputs(Node *n, const char * name, at::Generator * value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::Generator * value) { + if (value) { + detail::badArgType(value); + } + Graph * g = n->owningGraph(); + Value * undef_gen = g->insertNode(g->createNoneGenerator())->output(); + n->addInput(undef_gen); +} void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } void addInputs(Node *n, const char * name, at::TensorList value) { From cda74ac476471e753ca06bd3491c84dc0bcaa813 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 11 Sep 2018 17:42:03 -0700 Subject: [PATCH 053/237] fix nested no_grad decorator and with-statement (#11479) Summary: - fixes https://github.com/pytorch/pytorch/issues/10858 - allow `no_grad` decorator to apply `with torch.no_grad()` at the correct context - current behavior: ``` import torch torch.no_grad() def nothing(x): return x testin = torch.Tensor([0]) with torch.no_grad(): print(torch.is_grad_enabled()) # False testout = nothing(testin) print(torch.is_grad_enabled()) # False ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11479 Differential Revision: D9758691 Pulled By: weiyangfb fbshipit-source-id: 87de2219c6c45f65a2c0406ae152c3ad760be8f2 --- test/test_autograd.py | 6 ++++++ torch/autograd/grad_mode.py | 10 
++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index faba5efb1847d5..47ff4c66be0d33 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -724,6 +724,12 @@ def adder(x, y): self.assertRaises(RuntimeError, lambda: z.backward(torch.ones(5, 5))) self.assertIsNone(z.grad_fn) + # test nested decorator and with-statement on no_grad + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + w = adder(x, y) + self.assertFalse(torch.is_grad_enabled()) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 35dada34265d12..a4a768b633e331 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -28,11 +28,8 @@ class no_grad(object): >>> z.requires_grad False """ - - def __init__(self): - self.prev = torch.is_grad_enabled() - def __enter__(self): + self.prev = torch.is_grad_enabled() torch._C.set_grad_enabled(False) def __exit__(self, *args): @@ -75,11 +72,8 @@ class enable_grad(object): True """ - - def __init__(self): - self.prev = torch.is_grad_enabled() - def __enter__(self): + self.prev = torch.is_grad_enabled() torch._C.set_grad_enabled(True) def __exit__(self, *args): From bbf54ea37cc063503d6e30ffe2124031f3c87f59 Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Tue, 11 Sep 2018 18:24:02 -0700 Subject: [PATCH 054/237] Ensure .enumerate_support() methods are jittable (#11542) Summary: This works around #11535 by avoiding `arange(n, out=x)` and `eye(n, out=x)` in `torch.distributions`. I've confirmed that the `.enumerate_support()` methods are now jittable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11542 Differential Revision: D9777805 Pulled By: apaszke fbshipit-source-id: fa38f2f1acfc0a289f725fd8c92478573cfdbefb --- torch/distributions/bernoulli.py | 3 +-- torch/distributions/binomial.py | 5 ++--- torch/distributions/categorical.py | 4 +--- torch/distributions/constraints.py | 2 +- torch/distributions/one_hot_categorical.py | 7 +++++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/torch/distributions/bernoulli.py b/torch/distributions/bernoulli.py index 067a6eba9a43c4..ddbbcde7bed300 100644 --- a/torch/distributions/bernoulli.py +++ b/torch/distributions/bernoulli.py @@ -97,8 +97,7 @@ def entropy(self): return binary_cross_entropy_with_logits(self.logits, self.probs, reduction='none') def enumerate_support(self, expand=True): - values = self._new((2,)) - torch.arange(2, out=values) + values = torch.arange(2, dtype=self._param.dtype, device=self._param.device) values = values.view((-1,) + (1,) * len(self._batch_shape)) if expand: values = values.expand((-1,) + self._batch_shape) diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index ad56856c90aef1..5e168ffe1d7a2f 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -98,7 +98,7 @@ def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) + (max_count,) bernoullis = torch.bernoulli(self.probs.unsqueeze(-1).expand(shape)) if self.total_count.min() != max_count: - arange = torch.arange(max_count, out=self.total_count.new_empty(max_count)) + arange = torch.arange(max_count, dtype=self._param.dtype, device=self._param.device) mask = arange >= self.total_count.unsqueeze(-1) bernoullis.masked_fill_(mask, 0.) 
return bernoullis.sum(dim=-1) @@ -119,8 +119,7 @@ def enumerate_support(self, expand=True): total_count = int(self.total_count.max()) if not self.total_count.min() == total_count: raise NotImplementedError("Inhomogeneous total count not supported by `enumerate_support`.") - values = self._new(1 + total_count,) - torch.arange(1 + total_count, out=values) + values = torch.arange(1 + total_count, dtype=self._param.dtype, device=self._param.device) values = values.view((-1,) + (1,) * len(self._batch_shape)) if expand: values = values.expand((-1,) + self._batch_shape) diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 267bed8826e7de..2ad69cb1bacec1 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -120,10 +120,8 @@ def entropy(self): def enumerate_support(self, expand=True): num_events = self._num_events - values = torch.arange(num_events).long() + values = torch.arange(num_events, dtype=torch.long, device=self._param.device) values = values.view((-1,) + (1,) * len(self._batch_shape)) if expand: values = values.expand((-1,) + self._batch_shape) - if self._param.is_cuda: - values = values.cuda(self._param.get_device()) return values diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index f214cf1caab1ea..83205353761fd5 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -269,7 +269,7 @@ def check(self, value): lower_triangular = (value_tril == value).view(value.shape[:-2] + (-1,)).min(-1)[0] n = value.size(-1) - diag_mask = torch.eye(n, n, out=value.new(n, n)) + diag_mask = torch.eye(n, n, dtype=value.dtype, device=value.device) positive_diagonal = (value * diag_mask > (diag_mask - 1)).min(-1)[0].min(-1)[0] return lower_triangular & positive_diagonal diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index c28949b079a115..dc37b4c7fabb8e 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -49,6 +49,10 @@ def expand(self, batch_shape, _instance=None): def _new(self, *args, **kwargs): return self._categorical._new(*args, **kwargs) + @property + def _param(self): + return self._categorical._param + @property def probs(self): return self._categorical.probs @@ -89,8 +93,7 @@ def entropy(self): def enumerate_support(self, expand=True): n = self.event_shape[0] - values = self._new((n, n)) - torch.eye(n, out=values) + values = torch.eye(n, dtype=self._param.dtype, device=self._param.device) values = values.view((n,) + (1,) * len(self.batch_shape) + (n,)) if expand: values = values.expand((n,) + self.batch_shape + (n,)) From 35d52dbb0e026f7f0b24f6b81bc1d10c0cb0983a Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 11 Sep 2018 18:24:55 -0700 Subject: [PATCH 055/237] re-enable USE_MPI (#11416) Summary: The previous error was caused by mpi_test not depending on MPI_CXX_LIBRARIES. This might solve the problem. Not tested locally - waiting for CI test. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11416 Reviewed By: mingzhe09088 Differential Revision: D9771694 Pulled By: Yangqing fbshipit-source-id: 53e7b4f64eadc88313bc4dd9b8e3f7931cda6e91 --- CMakeLists.txt | 2 +- caffe2/CMakeLists.txt | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8dc6dc146ffb79..60d69ef2d9bc68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ option(USE_IDEEP "Use IDEEP interface in MKL BLAS" ON) option(USE_MKLML "Use MKLML interface in MKL BLAS" ON) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( - USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." OFF + USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 99aca34a860d79..4e9b3eef8937f4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -427,6 +427,13 @@ if (BUILD_TEST) endforeach() endif() + # For special tests that explicitly uses dependencies, we add them here + if (USE_MPI) + target_link_libraries(mpi_test ${MPI_CXX_LIBRARIES}) + if (USE_CUDA) + target_link_libraries(mpi_gpu_test ${MPI_CXX_LIBRARIES}) + endif() + endif() endif() if (BUILD_PYTHON) From 92fd69f256a7bf2ec9e022aef5a66b97b9b82132 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 11 Sep 2018 20:10:21 -0700 Subject: [PATCH 056/237] Split Type into TypeExtendedInterface and Type (#11520) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11520 Previously, we had Type which was a catch all interface for all functions and methods we could possibly want to do dynamic dispatch on. However, we want to check in a non-autogenerated Tensor class to ATen/core, and to do this, we must also check in a non-autogenerated Type class which we can do dispatch on. In principle, we could put the full Type interface in ATen/core, but this would be a bad developer experience, since any time you add a new free function, you'd have to regenerate the checked in Type header. For a better dev experience, we split Type into a two parts, Type, which will be checked in (though not in this diff), and TypeExtendedInterface, which will NOT be checked in. Type contains just enough methods to let Tensor be defined, and leaves the rest to TypeExtendedInterface. Some complications: - We (very unfortunately) have overloaded virtual methods. Because of C++'s rules, we cannot move one overload without doing some extra work to make sure that overload in a superclass and an overload in a subclass resolve together. I've chosen to resolve this problem simply by moving ALL overloads of a method which occurs in Tensor to Type. - There are some places where we take a type() object and call a method on it, which is not a Tensor base method. I've eliminated some where possible, but in other cases calling the method on type is the ONLY way to invoke it; in that case, I've just inserted a cast. Further refactoring is necessary. 
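The layering can be pictured with a small self-contained sketch (simplified stand-in types, not the actual generated ATen signatures): the checked-in base interface carries only what the Tensor class needs, the generated subclass carries the long tail of operators, and call sites that need an extended entry point downcast explicitly, which is what the `static_cast`s inserted by this diff do.
```
#include <iostream>

struct Tensor { int id; };  // stand-in for at::Tensor

// Checked-in, stable interface: just enough for Tensor's methods.
struct Type {
  virtual ~Type() = default;
  virtual Tensor add(Tensor a, Tensor b) const = 0;
};

// Generated interface: everything else (free functions, factories, ...).
struct TypeExtendedInterface : public Type {
  virtual Tensor arange(int end) const = 0;
};

// A concrete backend implements the extended interface.
struct CPUFloatType final : public TypeExtendedInterface {
  Tensor add(Tensor a, Tensor b) const override { return Tensor{a.id + b.id}; }
  Tensor arange(int end) const override { return Tensor{end}; }
};

// Free functions that outgrew the base interface reach it through a cast.
Tensor arange_via(const Type& t, int end) {
  return static_cast<const TypeExtendedInterface&>(t).arange(end);
}

int main() {
  CPUFloatType ty;
  std::cout << arange_via(ty, 5).id << "\n";  // prints 5
}
```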
Reviewed By: gchanan Differential Revision: D9771708 fbshipit-source-id: c59d39fe919cd6f42be6dca699d474346ea3c614 --- aten/src/ATen/function_wrapper.py | 64 ++++++++++++++----- aten/src/ATen/gen.py | 5 +- aten/src/ATen/native/LegacyBridge.cpp | 16 ++--- aten/src/ATen/native/TensorFactories.cpp | 18 ++++-- aten/src/ATen/preprocess_declarations.py | 26 ++++++++ aten/src/ATen/templates/Functions.h | 17 +++-- aten/src/ATen/templates/TypeDefault.h | 6 +- .../ATen/templates/TypeExtendedInterface.h | 12 ++++ caffe2/contrib/aten/aten_op_template.h | 6 +- caffe2/contrib/aten/gen_op.py | 2 +- tools/autograd/templates/VariableType.cpp | 2 +- tools/autograd/templates/VariableType.h | 2 +- 12 files changed, 131 insertions(+), 45 deletions(-) create mode 100644 aten/src/ATen/templates/TypeExtendedInterface.h diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index b830aa3e5fe65e..287b7895008cc2 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -107,6 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. But it doesn't seem to be harmful. +# +# TODO: self_ty is a hack to make things work for native methods which need to +# take a dtype, but also need to dispatch differently for different types. +# Eliminate it at some point. TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} @@ -173,7 +177,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # the same name (but different signature) already ZERO_DIM_CHECK = CodeTemplate("""\ if (${check_name}.dim() == 0) { - return static_cast(this)->${api_name}(${zero_dim_actuals}); + return static_cast(this)->${api_name}(${zero_dim_actuals}); }""") ZERO_DIM_ONLY = CodeTemplate("""\ @@ -183,7 +187,7 @@ def TypedDict(name, attrs, total=True): # type: ignore SPARSE_CHECK = CodeTemplate("""\ if(${check_name}.type().is_sparse()) { - return static_cast(this)->${api_name}(${sparse_actuals}); + return static_cast(this)->${api_name}(${sparse_actuals}); }""") BUFFER_DEFINITION = CodeTemplate("""\ @@ -390,6 +394,7 @@ def __getitem__(self, x): 'type_registrations': List[str], 'type_headers': List[str], 'pure_virtual_type_method_declarations': List[str], + 'pure_virtual_extended_type_method_declarations': List[str], 'type_method_declarations': List[str], 'type_method_definitions': List[str], 'type_method_inline_definitions': List[str], @@ -490,6 +495,9 @@ def __getitem__(self, x): 'formals': List[str], 'inferred_type': str, 'inplace': bool, + # This controls whether or not we generate the interface in Type or + # TypeExtendedInterface + 'extended_method': bool, 'method_actuals': List[str], 'method_formals_with_defaults': List[str], 'method_formals': List[str], @@ -836,8 +844,12 @@ def process_option(option, output_options): # NN function with no _forward/_backward suffix don't have cimpls. 
# They call the _forward function and discard any buffer returns abstract = False - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) body = emit_nn_body(option) @@ -845,17 +857,27 @@ def process_option(option, output_options): TYPE_METHOD_DEFINITION_CONCRETE.substitute( env, type_definition_body=body)) elif broadcast_arg is None: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) top_env['type_method_definitions'].append( TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( @@ -888,7 +910,7 @@ def process_option(option, output_options): method_of.append('Tensor') if is_namespace_function: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) top_env['function_declarations'].append( FUNCTION_DECLARATION.substitute(env)) top_env['function_definitions'].append( @@ -1060,11 +1082,21 @@ def find_formal(formal_name, formals): # Factory methods are not dispatched over `Type`. if not is_factory_method: if option['deprecated']: + # Deprecated functions are always non-extended, + # because they need to be made available from Type + # (the public interface) so that code like + # tensor.type().arange(...) keeps working. Once + # we remove the deprecated functions, we can eliminate + # these methods entirely. 
top_env['pure_virtual_type_method_declarations'].append( DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) dispatch = option['type_method_definition_dispatch'] option['native_type_method_dispatch'] = dispatch @@ -1116,12 +1148,12 @@ def find_formal(formal_name, formals): if is_namespace_function: if dispatch_type: - option['inferred_type'] = dispatch_type['name'] + option['inferred_type'] = 'static_cast({})'.format(dispatch_type['name']) elif dispatch_tensor: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) else: # doesn't depend on a specific type, use undefined float - option['inferred_type'] = 'at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)' + option['inferred_type'] = 'detail::non_specific_type()' declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION top_env['function_declarations'].append(declaration.substitute(env)) if is_factory_method: diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 025faffe5d2fe8..5497a752dc4fb4 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -107,6 +107,7 @@ def check_all_files_written(self): SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") +TYPE_EXTENDED_INTERFACE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeExtendedInterface.h") TYPE_DEFAULT_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.h") TYPE_DEFAULT_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.cpp") @@ -165,6 +166,7 @@ def check_all_files_written(self): 'cuda_type_registrations': [], 'cuda_type_headers': [], 'pure_virtual_type_method_declarations': [], + 'pure_virtual_extended_type_method_declarations': [], 'type_method_declarations': [], 'type_method_definitions': [], 'type_method_inline_definitions': [], @@ -330,7 +332,7 @@ def iterate_types(): # so that the script runs quickly when we are just querying the # outputs def declare_outputs(): - files = ['Declarations.yaml', 'Type.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', + files = ['Declarations.yaml', 'Type.h', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', 'TensorMethods.h', 'Functions.h', 'CPUCopy.cpp', 'NativeFunctions.h', 'RegisterCPU.cpp', 'RegisterCPU.h'] @@ -400,6 +402,7 @@ def generate_outputs(): backend, density, scalar_type, declarations)) file_manager.write('Type.h', TYPE_H, top_env) + file_manager.write('TypeExtendedInterface.h', TYPE_EXTENDED_INTERFACE_H, top_env) file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 07d7e46ff79a56..1364c0cceb1847 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -144,34 
+144,34 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta Tensor tensor(const Type& dtype) { if (_type_has_native(dtype)) { - return dtype.native_tensor(); + return static_cast(dtype).native_tensor(); } else { - return dtype.th_tensor(); + return static_cast(dtype).th_tensor(); } } Tensor tensor(const Type& dtype, ArrayRef size) { if (_type_has_native(dtype)) { - return dtype.native_tensor(size); + return static_cast(dtype).native_tensor(size); } else { - return dtype.th_tensor(size); + return static_cast(dtype).th_tensor(size); } } Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return dtype.toSparse().native_sparse_coo_tensor(size); + return static_cast(dtype.toSparse()).native_sparse_coo_tensor(size); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values); + return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values, size); + return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values, size); } Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse()._native_sparse_coo_tensor_unsafe(indices, values, size); + return static_cast(values.type().toSparse())._native_sparse_coo_tensor_unsafe(indices, values, size); } int64_t get_device(const Tensor& self) { diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 1a12549b5e70e9..20211aeedfa32b 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -59,6 +59,10 @@ void window_function_checks( window_length); } +const TypeExtendedInterface& getFactoryType(const TensorOptions& options) { + return static_cast(at::getType(options)); +} + } // namespace // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,7 +77,7 @@ Tensor arange( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(start, end, step); + return getFactoryType(options)._arange(start, end, step); } Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { @@ -86,7 +90,7 @@ Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { Tensor arange(Scalar end, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(end); + return getFactoryType(options)._arange(end); } Tensor& arange_out(Tensor& result, Scalar end) { @@ -94,7 +98,7 @@ Tensor& arange_out(Tensor& result, Scalar end) { } Tensor _dim_arange(const Tensor& like, int64_t dim) { - return like.type().toScalarType(at::kLong)._arange(like.size(dim)); + return static_cast(like.type().toScalarType(at::kLong))._arange(like.size(dim)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +106,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { Tensor empty(IntList size, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] // Can't call a factory function, because the buck stops with us! 
- return at::getType(options).tensor(size); + return getFactoryType(options).tensor(size); } Tensor& empty_out(Tensor& result, IntList size) { @@ -218,7 +222,7 @@ Tensor linspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._linspace(start, end, steps); + return getFactoryType(options)._linspace(start, end, steps); } Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { @@ -241,7 +245,7 @@ Tensor logspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._logspace(start, end, steps); + return getFactoryType(options)._logspace(start, end, steps); } Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { @@ -475,7 +479,7 @@ Tensor range( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._range(start, end, step); + return getFactoryType(options)._range(start, end, step); } Tensor& range_out(Tensor& result, Scalar start, Scalar end) { diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index 173ac439487d26..98b22c7f8e1d6a 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -217,8 +217,20 @@ def signature(option, i=None, value=None): (raw_args - filtered_args)] +def is_extended_method(option): + if 'method' in option['variants']: + return False + elif option.get('deprecated', False): + return False + elif not option['variants']: + return False + else: + return True + + def run(declarations): declarations = [d for d in declarations if not exclude(d)] + non_extended_methods = set() for declaration in declarations: common_with_cwrap.set_declaration_defaults(declaration) declaration['options'] = [deepcopy(o) for o in declaration['options']] @@ -237,6 +249,20 @@ def run(declarations): sanitize_return(option) process_types_and_backends(option) add_variants(option) + if not is_extended_method(option): + non_extended_methods.add(option['api_name']) declaration['options'] = handle_outputs_taken_as_arguments( declaration['options']) + + # We (very unfortunately) have overloaded virtual methods. Because + # of C++'s rules, we cannot move one overload without doing some + # extra work to make sure that overload in a superclass and an + # overload in a subclass resolve together. I've chosen to resolve + # this problem simply by moving ALL overloads of a method which + # occurs in Tensor to Type. This is why we have to first compute + # which methods *names* go on type, and then move ALL overloads + # of this name to Type. 
+ for declaration in declarations: + for option in declaration['options']: + option['extended_method'] = option['api_name'] not in non_extended_methods return declarations diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index b4a2e05e759ea3..7e2b6580343be4 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -4,6 +4,7 @@ #include "ATen/core/Scalar.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Tensor.h" #include "ATen/core/Storage.h" #include "ATen/core/Generator.h" @@ -20,14 +21,22 @@ using native::tensor; ${function_declarations} -static inline Type & infer_type(const Tensor & t) { +namespace detail { + +static inline TypeExtendedInterface & infer_type(const Tensor & t) { AT_CHECK(t.defined(), "undefined Tensor"); - return t.type(); + return static_cast(t.type()); } -static inline Type & infer_type(const TensorList & tl) { +static inline TypeExtendedInterface & infer_type(const TensorList & tl) { AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return tl[0].type(); + return static_cast(tl[0].type()); +} +static inline TypeExtendedInterface & non_specific_type() { + return static_cast(at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)); } + +} // namespace detail + // function definitions are all static inline because // they are one-line statically dispatched functions that // invoke the actual dynamic dispatch on the correct argument diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index 0f315e590d700c..e4a75abb48993e 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -2,13 +2,13 @@ // ${generated_comment} -#include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" namespace at { -struct AT_API TypeDefault : public Type { +struct AT_API TypeDefault : public TypeExtendedInterface { explicit TypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) - : Type(type_id, is_variable, is_undefined) {} + : TypeExtendedInterface(type_id, is_variable, is_undefined) {} // Make sure overload resolution considers the nullary virtual method. // (A single argument overload is generated in the list.) 
diff --git a/aten/src/ATen/templates/TypeExtendedInterface.h b/aten/src/ATen/templates/TypeExtendedInterface.h new file mode 100644 index 00000000000000..82cb658c9eeea8 --- /dev/null +++ b/aten/src/ATen/templates/TypeExtendedInterface.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at { + +struct AT_API TypeExtendedInterface : public Type { + explicit TypeExtendedInterface(TensorTypeId type_id, bool is_variable, bool is_undefined) + : Type(type_id, is_variable, is_undefined) {} + ${pure_virtual_extended_type_method_declarations} +}; + +} // namespace at diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 9f327fdd5d82d4..c573891f6f04ac 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -214,10 +214,10 @@ class ATenOp : public Operator { DEFINE_IF(int64, Long) CAFFE_THROW("unsupported type annotation: ", name); } - at::Type & stringToType(const std::string & name) { - return at::getNonVariableType(backend(), stringToScalarType(name)); + at::TypeExtendedInterface & stringToType(const std::string & name) { + return static_cast(at::getNonVariableType(backend(), stringToScalarType(name))); } - at::Type * readTypeAttribute(const std::string & name) { + at::TypeExtendedInterface * readTypeAttribute(const std::string & name) { CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); return &stringToType(OperatorBase::GetSingleArgument(name, "")); } diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 18a3db4c7daed3..bc75ac9859dcc5 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -278,7 +278,7 @@ def find_factory_methods(decls): # first tensor input is used to define the output type. defined_inferred_type = True env['statements'].append( - 'auto inferred_type = &({}.type());'.format( + 'auto inferred_type = &(static_cast({}.type()));'.format( arg['name'])) else: init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index d4a9a4eccab86a..589bbf8323f155 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -44,7 +44,7 @@ namespace torch { namespace autograd { VariableType::VariableType(Context* context, Type* baseType) : TypeDefault(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) - , baseType(baseType) + , baseType(static_cast(baseType)) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index fe3e57f4fc0246..b9d84ad85672a5 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -72,7 +72,7 @@ struct TORCH_API VariableType final : public at::TypeDefault { static at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); static std::vector unpack(at::TensorList tl, const char *name, int pos); - at::Type* baseType; + at::TypeExtendedInterface* baseType; std::string str; size_t id_; }; From 3121c8f5265e0f7410d22be27c13c1d55e496043 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Tue, 11 Sep 2018 20:11:26 -0700 Subject: [PATCH 057/237] Update gtest and remove the macro guide on gtest from #11321 (#11417) Summary: Last PR seems to have test failures, re-issuing. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11417 Reviewed By: orionr Differential Revision: D9784706 Pulled By: Yangqing fbshipit-source-id: 9e5f347e19fa2700ff69d2cd69ea7a9e01a91609 --- cmake/Dependencies.cmake | 13 +++++-------- third_party/googletest | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index fed64398276078..551a84180b087a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -187,20 +187,19 @@ endif() if(BUILD_TEST) # Preserve build options. set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(TEMP_CMAKE_DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) # We will build gtest as static libs and embed it directly into the binary. set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) # For gtest, we will simply embed it into our test binaries, so we won't # need to install it. - set(BUILD_GTEST ON) - set(INSTALL_GTEST OFF) + set(BUILD_GTEST ON CACHE BOOL "Build gtest" FORCE) + set(INSTALL_GTEST OFF CACHE BOOL "Install gtest." FORCE) # We currently don't need gmock right now. - set(BUILD_GMOCK OFF) + set(BUILD_GMOCK OFF CACHE BOOL "Build gmock." FORCE) # For Windows, we will check the runtime used is correctly passed in. if (NOT CAFFE2_USE_MSVC_STATIC_RUNTIME) - set(gtest_force_shared_crt ON) + set(gtest_force_shared_crt ON CACHE BOOL "force shared crt on gtest" FORCE) endif() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest) include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) @@ -212,10 +211,8 @@ if(BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark) include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark/include) - # Recover build options. Unfortunately gtest modifies CMAKE_DEBUG_POSTFIX - # in some versions as detailed at https://github.com/google/googletest/issues/1334 + # Recover build options. set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - set(CMAKE_DEBUG_POSTFIX ${TEMP_CMAKE_DEBUG_POSTFIX} CACHE BOOL "Debug postfix" FORCE) endif() # ---[ LMDB diff --git a/third_party/googletest b/third_party/googletest index 69e48e92de4396..2fe3bd994b3189 160000 --- a/third_party/googletest +++ b/third_party/googletest @@ -1 +1 @@ -Subproject commit 69e48e92de43960a316a826293510b7b3deb9eca +Subproject commit 2fe3bd994b3189899d93f1d5a881e725e046fdc2 From d95fedb43631625394120087090caa3560618d2a Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 20:12:34 -0700 Subject: [PATCH 058/237] Use ATen dropout implementation in Dropout module and add FeatureDropout (#11458) Summary: This PR does two things: 1. Replaces the implementation of the `Dropout` module with a call to the ATen function, 2. Replaces `Dropout2d` with a new `FeatureDropout` module that shall take the place of `Dropout2d` and `Dropout3d`. I contemplated calling it `Dropout2d` and making `Dropout3d` an alias for it, but similar to our decision for `BatchNorm{1,2,3}d` (c.f. https://github.com/pytorch/pytorch/pull/9188), we can deviate from Python PyTorch in favor of the ideal-world solution, which is to have a single module, since both actually just call `feature_dropout`. I also replaced the implementation of `dropout3d` with a call to `dropout2d` in Python. The code is the same and it's easier for developers to parse than having to manually match the tokens to make sure it's really 100% the same code (which it is, if I matched the tokens correctly). 
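For illustration, a hedged usage sketch of the two modules after this change (shapes and probabilities are arbitrary and this is not code from the PR):
```
#include <torch/torch.h>

int main() {
  torch::nn::Dropout dropout(0.3);        // element-wise, forwards to torch::dropout
  torch::nn::FeatureDropout drop2d(0.3);  // per-feature-map, forwards to torch::feature_dropout

  auto x = torch::randn({8, 16});          // [batch, features]
  auto y = torch::randn({8, 10, 24, 24});  // [batch, channels, height, width]

  // Noise is only applied while the modules are in training mode.
  auto out1 = dropout->forward(x);  // zeroes individual elements
  auto out2 = drop2d->forward(y);   // zeroes whole channels
  return 0;
}
```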
ebetica ezyang SsnL Pull Request resolved: https://github.com/pytorch/pytorch/pull/11458 Differential Revision: D9756603 Pulled By: goldsborough fbshipit-source-id: fe847cd2cda2b6da8b06779255d76e32a974807c --- test/cpp/api/integration.cpp | 2 +- .../api/include/torch/nn/modules/dropout.h | 46 ++++++++++--------- torch/csrc/api/src/nn/modules/dropout.cpp | 23 ++-------- torch/nn/functional.py | 2 + 4 files changed, 33 insertions(+), 40 deletions(-) diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 58e716e3096cfd..a8656a16f5d658 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -336,7 +336,7 @@ TEST_CASE("integration/mnist", "[cuda]") { auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); auto conv2 = model->add(Conv2d(10, 20, 5), "conv2"); auto drop = Dropout(0.3); - auto drop2d = Dropout2d(0.3); + auto drop2d = FeatureDropout(0.3); auto linear1 = model->add(Linear(320, 50), "linear1"); auto linear2 = model->add(Linear(50, 10), "linear2"); diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index f547b5b2fa2d2b..bfe230a597215f 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -9,10 +9,13 @@ namespace torch { namespace nn { + +/// Options for `Dropout` and `FeatureDropout`. struct DropoutOptions { DropoutOptions(double rate); /// The probability with which a particular component of the input is set to /// zero. + /// Changes to this parameter at runtime are effective. TORCH_ARG(double, rate) = 0.5; }; @@ -26,14 +29,7 @@ class DropoutImplBase : public torch::nn::Cloneable { void reset() override; - /// During training, applies a noise mask to the input tensor. - /// During evaluation, applies an identity function. - Tensor forward(Tensor input); - - /// Returns a noise mask that can be applied to the given input tensor. - /// Used inside `forward()` to generate the noise mask for dropout. - virtual Tensor noise_mask(Tensor input) const = 0; - + /// The options used to configure this `Dropout` module. DropoutOptions options; }; } // namespace detail @@ -45,18 +41,26 @@ class DropoutImplBase : public torch::nn::Cloneable { class DropoutImpl : public detail::DropoutImplBase { public: using detail::DropoutImplBase::DropoutImplBase; - Tensor noise_mask(Tensor input) const override; + /// During training, applies a noise mask to the input tensor. + /// During evaluation, applies an identity function. + Tensor forward(Tensor input); }; -/// Applies [Dropout](https://arxiv.org/abs/1207.0580) to inputs with -/// 2-dimensional features. +/// Applies spatial [Dropout](https://arxiv.org/abs/1207.0580) to inputs with +/// 2-D or 3-D features. /// -/// See https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout2d to learn more -/// about the exact semantics of this module. -class Dropout2dImpl : public detail::DropoutImplBase { +/// The equivalent in Python is +/// [Dropout2d](https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout2d) for +/// 2-D features and +/// [Dropout3d](https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout3d) for +/// 3-D features. This `FeatureDropout` module can instead deal with both 2-D +/// and 3-D features. 
+class FeatureDropoutImpl : public detail::DropoutImplBase { public: - using detail::DropoutImplBase::DropoutImplBase; - Tensor noise_mask(Tensor input) const override; + using detail::DropoutImplBase::DropoutImplBase; + /// During training, applies a noise mask to the input tensor. + /// During evaluation, applies an identity function. + Tensor forward(Tensor input); }; /// A `ModuleHolder` subclass for `DropoutImpl`. @@ -65,10 +69,10 @@ class Dropout2dImpl : public detail::DropoutImplBase { /// module storage semantics. TORCH_MODULE(Dropout); -/// A `ModuleHolder` subclass for `Dropout2dImpl`. -/// See the documentation for `Dropout2dImpl` class to learn what methods it -/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's -/// module storage semantics. -TORCH_MODULE(Dropout2d); +/// A `ModuleHolder` subclass for `FeatureDropoutImpl`. +/// See the documentation for `FeatureDropoutImpl` class to learn what methods +/// it provides, or the documentation for `ModuleHolder` to learn about +/// PyTorch's module storage semantics. +TORCH_MODULE(FeatureDropout); } // namespace nn } // namespace torch diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp index 68dbac486d2c4c..c11c0e108413ac 100644 --- a/torch/csrc/api/src/nn/modules/dropout.cpp +++ b/torch/csrc/api/src/nn/modules/dropout.cpp @@ -20,31 +20,18 @@ DropoutImplBase::DropoutImplBase(DropoutOptions options_) template void DropoutImplBase::reset() {} -template -Tensor DropoutImplBase::forward(Tensor input) { - if (options.rate_ == 0 || !this->is_training()) { - return input; - } - - auto scale = 1.0f / (1.0f - options.rate_); - auto boolean_mask = noise_mask(input).uniform_(0, 1) > options.rate_; - auto noise = boolean_mask.to(input.dtype()).mul_(scale); - - return input * noise; -} - template class DropoutImplBase; -template class DropoutImplBase; +template class DropoutImplBase; } // namespace detail DropoutOptions::DropoutOptions(double rate) : rate_(rate) {} -Tensor DropoutImpl::noise_mask(Tensor input) const { - return torch::empty_like(input); +Tensor DropoutImpl::forward(Tensor input) { + return torch::dropout(input, options.rate_, this->is_training()); } -Tensor Dropout2dImpl::noise_mask(Tensor input) const { - return torch::empty({input.size(0), input.size(1), 1, 1}, input.options()); +Tensor FeatureDropoutImpl::forward(Tensor input) { + return torch::feature_dropout(input, options.rate_, this->is_training()); } } // namespace nn } // namespace torch diff --git a/torch/nn/functional.py b/torch/nn/functional.py index d9889d631f7fb8..d0514d90bda588 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -661,6 +661,8 @@ def dropout3d(input, p=0.5, training=True, inplace=False): training: apply dropout if is ``True``. Defualt: ``True`` inplace: If set to ``True``, will do this operation in-place. Default: ``False`` """ + # This is 100% the same code as dropout2d. We duplicate this code so that + # stack traces are not confusing. if p < 0 or p > 1: raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) From 045f862574063dfe1f92b84f46cd97b2aeeaf829 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 20:20:16 -0700 Subject: [PATCH 059/237] Use torch::nn::init::xavier_normal_ Summary: The PyTorch C++ API has `torch.nn.init` equivalents that the RNNG can use to initialize the state of its StackRNNs. This gets rid of the `fanInOut_` methods on `Parser` and tidies up `xavierInitialState` a little. 
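For reference, the Python-side counterpart of the initializer the RNNG code now calls is `torch.nn.init.xavier_normal_`; a minimal sketch (the layer sizes here are made up for illustration):

```python
import torch
import torch.nn.init as init

# Xavier/Glorot normal init draws from N(0, std^2) with
# std = gain * sqrt(2 / (fan_in + fan_out)).
weight = torch.empty(256, 128)
init.xavier_normal_(weight)

# Typical use on a module's parameters.
linear = torch.nn.Linear(128, 256)
init.xavier_normal_(linear.weight)
init.constant_(linear.bias, 0.0)
```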
Reviewed By: wowitsmrinal Differential Revision: D9472595 fbshipit-source-id: c202116f32383d3b4bba064c2c0d2656311e1170 --- torch/csrc/api/include/torch/nn.h | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/api/include/torch/nn.h b/torch/csrc/api/include/torch/nn.h index 408330d94232a6..7285a696350b7c 100644 --- a/torch/csrc/api/include/torch/nn.h +++ b/torch/csrc/api/include/torch/nn.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include From 54107ae8cf476b4ebe7c631f75273e0b014e748c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 11 Sep 2018 20:20:54 -0700 Subject: [PATCH 060/237] convert output_device at data_parallel from torch.device to index (#10189) Summary: - fixes #9984 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10189 Differential Revision: D9545390 Pulled By: weiyangfb fbshipit-source-id: 3a6a705437553ba319e9fd4b7f676ff73857a27e --- test/test_c10d.py | 11 +++++++---- test/test_distributed.py | 27 +++++++++++++++++---------- test/test_nn.py | 18 ++++++++++++++++++ torch/nn/parallel/_functions.py | 4 ++++ torch/nn/parallel/data_parallel.py | 22 ++++++++++++---------- torch/nn/parallel/distributed.py | 13 +++++++------ torch/nn/parallel/parallel_apply.py | 8 +++++++- torch/nn/parallel/replicate.py | 3 ++- 8 files changed, 74 insertions(+), 32 deletions(-) diff --git a/test/test_c10d.py b/test/test_c10d.py index 64bedb3183a212..ff9d87be76c84f 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -567,8 +567,7 @@ class DistributedDataParallelTest(MultiProcessTestCase): def world_size(self): return 2 - def _test_ddp_with_process_group(self, process_group): - gpus = gpus_for_rank(self.world_size)[self.rank] + def _test_ddp_with_process_group(self, process_group, gpus): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).cuda(gpus[0]), @@ -620,14 +619,18 @@ def test_gloo_backend(self): options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group) + gpus = gpus_for_rank(self.world_size)[self.rank] + self._test_ddp_with_process_group(process_group, gpus) + self._test_ddp_with_process_group(process_group, list(map(lambda i: torch.device('cuda:' + str(i)), gpus))) @skip_if_not_multigpu @skip_if_not_nccl def test_nccl_backend(self): store = c10d.TCPStore('localhost', self.port, self.is_master) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group) + gpus = gpus_for_rank(self.world_size)[self.rank] + self._test_ddp_with_process_group(process_group, gpus) + self._test_ddp_with_process_group(process_group, list(map(lambda i: torch.device('cuda:' + str(i)), gpus))) @skip_if_not_multigpu def test_dist_broadcast_coalesced(self): diff --git a/test/test_distributed.py b/test/test_distributed.py index 41fa5cdac0c827..abf3663966700b 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -1126,24 +1126,15 @@ def _test_DDP_2iter( # Shuffle the input so that DDP input is different input = input[torch.randperm(batch_size)] - @unittest.skipIf( - BACKEND != "nccl" and BACKEND != "gloo", - "Only Nccl & Gloo backend support DistributedDataParallel", - ) - @skip_if_no_cuda_distributed - @skip_if_no_gpu - def test_DistributedDataParallel(self): + def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): # Run a simple end to end DDP model, use 
result of single node model # as baseline - group, group_id, rank = self._init_global_test() - rank_to_GPU = self._init_multigpu_helper() # cpu training setup model = self._create_Net() # single gpu training setup model_gpu = copy.deepcopy(model) - gpu_subset = list(rank_to_GPU[rank]) model_gpu.cuda(gpu_subset[0]) # DDP training setup @@ -1195,6 +1186,22 @@ def test_DistributedDataParallelCPU(self): ) self._barrier() + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', + "Only Nccl & Gloo backend support DistributedDataParallel") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_DistributedDataParallel(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + gpus = list(rank_to_GPU[rank]) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank) + + # test output_device + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) + + # test device_ids + gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) if BACKEND == "gloo" or BACKEND == "nccl": WORLD_SIZE = os.environ["WORLD_SIZE"] diff --git a/test/test_nn.py b/test/test_nn.py index b2597b894803f6..03a35bf10d88f8 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -3154,6 +3154,24 @@ def forward(self, input): self.assertEqual(out.get_device(), 0) self.assertEqual(out.data, expected_out) + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm + def test_data_parallel_device_args(self): + cuda0 = torch.device('cuda:0') + cuda1 = torch.device('cuda:1') + + # test output_device + l = nn.Linear(10, 5).to(cuda0, torch.float) + i = torch.randn(20, 10, dtype=torch.float, device=cuda0, requires_grad=True) + out = dp.data_parallel(l, i, device_ids=(0, 1), output_device=cuda0) + self.assertEqual(out, l(i)) + + # test device_ids + l = nn.Linear(10, 5).to(cuda0, torch.float) + i = torch.randn(20, 10, dtype=torch.float, device=cuda0, requires_grad=True) + out = dp.data_parallel(l, i, device_ids=(cuda0, cuda1), output_device=cuda0) + self.assertEqual(out, l(i)) + def test_state_dict(self): l = nn.Linear(5, 5) block = nn.Module() diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index 48b2a77d2d598d..4b1f58356287e9 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -3,6 +3,7 @@ import torch import torch.cuda.comm as comm from torch.autograd import Function +from torch.cuda._utils import _get_device_index class Broadcast(Function): @@ -11,6 +12,7 @@ class Broadcast(Function): def forward(ctx, target_gpus, *inputs): if not all(input.is_cuda for input in inputs): raise TypeError('Broadcast function not implemented for CPU tensors') + target_gpus = list(map(lambda x: _get_device_index(x, True), target_gpus)) ctx.target_gpus = target_gpus if len(inputs) == 0: return tuple() @@ -50,6 +52,7 @@ class Gather(Function): @staticmethod def forward(ctx, target_device, dim, *inputs): assert all(map(lambda i: i.is_cuda, inputs)) + target_device = _get_device_index(target_device, True) ctx.target_device = target_device ctx.dim = dim ctx.input_gpus = tuple(map(lambda i: i.get_device(), inputs)) @@ -76,6 +79,7 @@ class Scatter(Function): @staticmethod def forward(ctx, target_gpus, chunk_sizes, dim, input): + target_gpus = list(map(lambda x: _get_device_index(x, True), target_gpus)) ctx.dim = dim ctx.input_device = input.get_device() if input.is_cuda else -1 
streams = None diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py index 94b60bda836d12..231963a48d1dd8 100644 --- a/torch/nn/parallel/data_parallel.py +++ b/torch/nn/parallel/data_parallel.py @@ -5,6 +5,7 @@ from .scatter_gather import scatter_kwargs, gather from .replicate import replicate from .parallel_apply import parallel_apply +from torch.cuda._utils import _get_device_index def _check_balance(device_ids): @@ -13,7 +14,7 @@ def _check_balance(device_ids): has less than 75% of the memory or cores of GPU {}. You can do so by setting the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES environment variable.""" - + device_ids = list(map(lambda x: _get_device_index(x, True), device_ids)) dev_props = [torch.cuda.get_device_properties(i) for i in device_ids] def warn_imbalance(get_prop): @@ -77,9 +78,9 @@ class DataParallel(Module): Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices (default: all devices) + output_device (int or torch.device): device location of output (default: device_ids[0]) Attributes: module (Module): the module to be parallelized @@ -104,10 +105,11 @@ def __init__(self, module, device_ids=None, output_device=None, dim=0): device_ids = list(range(torch.cuda.device_count())) if output_device is None: output_device = device_ids[0] + self.dim = dim self.module = module - self.device_ids = device_ids - self.output_device = output_device + self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids)) + self.output_device = _get_device_index(output_device, True) _check_balance(self.device_ids) @@ -143,10 +145,10 @@ def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, mo This is the functional version of the DataParallel module. Args: - module: the module to evaluate in parallel - inputs: inputs to the module - device_ids: GPU ids on which to replicate module - output_device: GPU location of the output Use -1 to indicate the CPU. + module (Module): the module to evaluate in parallel + inputs (tensor): inputs to the module + device_ids (list of int or torch.device): GPU ids on which to replicate module + output_device (list of int or torch.device): GPU location of the output Use -1 to indicate the CPU. (default: device_ids[0]) Returns: a Tensor containing the result of module(input) located on diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 175aac70be9f3d..2ea2d451594700 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -12,6 +12,7 @@ from .replicate import replicate from .scatter_gather import scatter_kwargs, gather from .parallel_apply import parallel_apply +from torch.cuda._utils import _get_device_index class DistributedDataParallel(Module): @@ -90,10 +91,10 @@ class DistributedDataParallel(Module): :meth:`forward` method. 
Args: - module: module to be parallelized - device_ids: CUDA devices (default: all devices) - output_device: device location of output (default: device_ids[0]) - broadcast_buffers: flag that enables syncing (broadcasting) buffers of + module (Module): module to be parallelized + device_ids (list of int or torch.device): CUDA devices (default: all devices) + output_device (int or torch.device): device location of output (default: device_ids[0]) + broadcast_buffers (bool): flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True) process_group: the c10d process group to be used for distributed data @@ -133,8 +134,8 @@ def __init__(self, module, device_ids=None, self.dim = dim self.module = module - self.device_ids = device_ids - self.output_device = output_device + self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids)) + self.output_device = _get_device_index(output_device, True) self.broadcast_buffers = broadcast_buffers self.allreduce_opts = dist.AllreduceOptions() diff --git a/torch/nn/parallel/parallel_apply.py b/torch/nn/parallel/parallel_apply.py index 6a5ab99cacfae1..438bdb9faf4037 100644 --- a/torch/nn/parallel/parallel_apply.py +++ b/torch/nn/parallel/parallel_apply.py @@ -1,5 +1,6 @@ import threading import torch +from torch.cuda._utils import _get_device_index def get_a_var(obj): @@ -22,6 +23,11 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) on each of :attr:`devices`. + Args: + modules (Module): modules to be parallelized + inputs (tensor): inputs to the modules + devices (list of int or torch.device): CUDA devices + :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and :attr:`devices` (if given) should all have same length. Moreover, each element of :attr:`inputs` can either be a single object as the only argument @@ -36,7 +42,7 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): assert len(modules) == len(devices) else: devices = [None] * len(modules) - + devices = list(map(lambda x: _get_device_index(x, True), devices)) lock = threading.Lock() results = {} grad_enabled = torch.is_grad_enabled() diff --git a/torch/nn/parallel/replicate.py b/torch/nn/parallel/replicate.py index b82ef08d727bcf..309c0db8f25054 100644 --- a/torch/nn/parallel/replicate.py +++ b/torch/nn/parallel/replicate.py @@ -1,10 +1,11 @@ import torch.cuda.comm as comm +from torch.cuda._utils import _get_device_index def replicate(network, devices, detach=False): from ._functions import Broadcast - devices = tuple(devices) + devices = list(map(lambda x: _get_device_index(x, True), devices)) num_replicas = len(devices) params = list(network.parameters()) From 35348dab103316a929ae55d91a9d03e049d7fc43 Mon Sep 17 00:00:00 2001 From: Rasmus Diederichsen Date: Tue, 11 Sep 2018 20:23:03 -0700 Subject: [PATCH 061/237] WIP: Include note on cudnn determinism in each function backed by cudnn (#11434) Summary: Ping ezyang This addresses your comment in #114. Strangely, when running the doc build (`make html`) none of my changes are actually showing, could you point out what I'm doing wrong? Once #11329 is merged it might make sense to link to the reproducibility note everywhere. 
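In practice, the flag that the new note documents is set like this (a minimal sketch; the convolution is only there to exercise a CuDNN-backed op):

```python
import torch

# Ask CuDNN to select deterministic algorithms, potentially at a performance cost.
torch.backends.cudnn.deterministic = True
# Disabling the autotuner is also common when reproducibility matters.
torch.backends.cudnn.benchmark = False

if torch.cuda.is_available():
    conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda()
    x = torch.randn(1, 3, 32, 32, device='cuda')
    out = conv(x)  # now runs a deterministic CuDNN algorithm
```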
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11434 Differential Revision: D9751208 Pulled By: ezyang fbshipit-source-id: cc672472449564ff099323c39603e8ff2b2d35c9 --- docs/source/cudnn_deterministic.rst | 7 +++++++ torch/nn/functional.py | 14 ++++++++++++++ torch/nn/modules/conv.py | 12 ++++++++++++ torch/nn/modules/loss.py | 5 +++++ 4 files changed, 38 insertions(+) create mode 100644 docs/source/cudnn_deterministic.rst diff --git a/docs/source/cudnn_deterministic.rst b/docs/source/cudnn_deterministic.rst new file mode 100644 index 00000000000000..71c622ea3baeb8 --- /dev/null +++ b/docs/source/cudnn_deterministic.rst @@ -0,0 +1,7 @@ +.. note:: + + In some circumstances when using the CUDA backend with CuDNN, this operator + may select a nondeterministic algorithm to increase performance. If this is + undesirable, you can try to make the operation deterministic (potentially at + a performance cost) by setting ``torch.backends.cudnn.deterministic = + True``. diff --git a/torch/nn/functional.py b/torch/nn/functional.py index d0514d90bda588..7823d445a0c13a 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -67,6 +67,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv1d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kW)` @@ -95,6 +97,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv2d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kH \times kW)` @@ -124,6 +128,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.Conv3d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` weight: filters of shape :math:`(\text{out\_channels} \times \frac{\text{in\_channels}}{\text{groups}} \times kT \times kH \times kW)` @@ -152,6 +158,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose1d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iW)` weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kW)` @@ -183,6 +191,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose2d` for details and output shape. +.. include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iH \times iW)` weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kH \times kW)` @@ -216,6 +226,8 @@ def legacy_get_enum(size_average, reduce, emit_warning=True): See :class:`~torch.nn.ConvTranspose3d` for details and output shape. +.. 
include:: cudnn_deterministic.rst + Args: input: input tensor of shape :math:`(\text{minibatch} \times \text{in\_channels} \times iT \times iH \times iW)` weight: filters of shape :math:`(\text{in\_channels} \times \frac{\text{out\_channels}}{\text{groups}} \times kT \times kH \times kW)` @@ -1438,6 +1450,8 @@ def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, See :class:`~torch.nn.CTCLoss` for details. + .. include:: cudnn_deterministic.rst + Args: log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, `T = input length`, and `N = batch size`. diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 7cbf5fbf604314..a1bfcbc08e097e 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -120,6 +120,8 @@ class Conv1d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(C_\text{in}=C_{in}, C_\text{out}=C_{in} \times K, ..., \text{groups}=C_{in})`. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -243,6 +245,8 @@ class Conv2d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -366,6 +370,8 @@ class Conv3d(_ConvNd): a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -527,6 +533,8 @@ class ConvTranspose1d(_ConvTransposeMixin, _ConvNd): that :attr:`output_padding` is only used to find output shape, but does not actually add zero-padding to output. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -637,6 +645,8 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): that :attr:`output_padding` is only used to find output shape, but does not actually add zero-padding to output. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution @@ -777,6 +787,8 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): that :attr:`output_padding` is only used to find output shape, but does not actually add zero-padding to output. + .. include:: cudnn_deterministic.rst + Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index ef6c8971691945..1737bd5f9da687 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1177,6 +1177,11 @@ class CTCLoss(_Loss): dtype :attr:`torch.int32`. The regular implementation uses the (more common in PyTorch) `torch.long` dtype. + + + .. 
include:: cudnn_deterministic.rst + + """ def __init__(self, blank=0, reduction='elementwise_mean'): From f4d9f39a94f4dea478a489a7c9d31dedbad06bd5 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 20:27:25 -0700 Subject: [PATCH 062/237] Test libtorch on cuda Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11554 Differential Revision: D9784772 Pulled By: goldsborough fbshipit-source-id: c3e071695f56c1f427984f427b1f7722722947d3 --- .jenkins/pytorch/common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 5ce6ee01a46975..ca728df2b826c5 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -112,7 +112,8 @@ else exit 1 fi -if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py3 ]] || \ + [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then BUILD_TEST_LIBTORCH=1 else BUILD_TEST_LIBTORCH=0 From b75c32ded9e4bfdfedfd96e41af762dcdee6697c Mon Sep 17 00:00:00 2001 From: Anders Papitto Date: Tue, 11 Sep 2018 20:31:44 -0700 Subject: [PATCH 063/237] link against TORCH_CUDA_LIBRARIES Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11475 Differential Revision: D9784616 Pulled By: anderspapitto fbshipit-source-id: bb8b443bcb308bbbe9707d265f21e5d00d717d65 --- torch/CMakeLists.txt | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 614dbf3937742d..b0f07a7ef3a73b 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -319,23 +319,11 @@ if(USE_CUDA) set_target_properties(torch PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() set(TORCH_CUDA_LIBRARIES - ${CUDA_CUDA_LIB} - ${CUDA_NVRTC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so ${CUDA_LIBRARIES}) endif() - if(MSVC OR APPLE) - target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) - else() - # TODO: using the full TORCH_CUDA_LIBRARIES here causes some - # builds to fail in CI, as libcuda.so can no longer be found. It's - # not clear why this is the case, and the situation should be - # investigated/cleaned up. Note that the test+jit/test_api - # targets below do require linking against the full - # TORCH_CUDA_LIBRARIES, even on Linux - target_link_libraries(torch caffe2_gpu_library ${CUDA_LIBRARIES}) - endif() + target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(torch PRIVATE USE_CUDA) endif() @@ -412,7 +400,7 @@ if (BUILD_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) ${ATen_CPU_INCLUDE}) if (USE_CUDA) - target_link_libraries(test_jit ${CUDA_LIBRARIES}) + target_link_libraries(test_jit ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB}) endif() endif() @@ -446,7 +434,7 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) "${TORCH_ROOT}/third_party/catch/single_include" ${ATen_CPU_INCLUDE}) - target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES}) + target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB}) if (NOT MSVC) if (APPLE) From 8aa8ad8b019cb5825ddbe3748aa34a2ad12bb600 Mon Sep 17 00:00:00 2001 From: Rasmus Diederichsen Date: Tue, 11 Sep 2018 21:01:43 -0700 Subject: [PATCH 064/237] WIP: Reproducibility note (#11329) Summary: This adds a Note on making experiments reproducible. It also adds Instructions for building the Documentation to `README.md`. 
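Taken together with the CuDNN note above, the recipe this documentation describes amounts to a sketch like the following (seeding Python's built-in `random` module is an extra step not covered by the note, included here for completeness):

```python
import random
import numpy as np
import torch

def seed_everything(seed=0):
    random.seed(seed)        # Python's built-in RNG, if your code uses it
    np.random.seed(seed)     # NumPy RNG
    torch.manual_seed(seed)  # PyTorch RNG for CPU and all CUDA devices
    torch.backends.cudnn.deterministic = True  # trade speed for determinism

seed_everything(0)
```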
Please ping if I missed any requirements. I'm not sure what to do about the submodule changes. Please advise. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11329 Differential Revision: D9784939 Pulled By: ezyang fbshipit-source-id: 5c5acbe343d1fffb15bdcb84c6d8d925c2ffcc5e --- README.md | 13 ++++++++++ docs/source/notes/randomness.rst | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 docs/source/notes/randomness.rst diff --git a/README.md b/README.md index b909001edc6f14..4b996cc1b440a6 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We are in an early-release beta. Expect some adventures and rough edges. - [Binaries](#binaries) - [From Source](#from-source) - [Docker Image](#docker-image) + - [Building the Documentation](#building-the-documentation) - [Previous Versions](#previous-versions) - [Getting Started](#getting-started) - [Communication](#communication) @@ -224,6 +225,18 @@ Please note that PyTorch uses shared memory to share data between processes, so for multithreaded data loaders) the default shared memory segment size that container runs with is not enough, and you should increase shared memory size either with `--ipc=host` or `--shm-size` command line options to `nvidia-docker run`. +### Building the Documentation + +To build documentation in various formats, you will need Sphinx and the +readthedocs theme. + +``` +cd docs/ +pip install -r requirements.txt +``` +You can then build the documentation by running ``make <format>`` from the +``docs/`` folder. Run ``make`` to get a list of all available output formats. + ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found diff --git a/docs/source/notes/randomness.rst b/docs/source/notes/randomness.rst new file mode 100644 index 00000000000000..5ba2c87a8eecf3 --- /dev/null +++ b/docs/source/notes/randomness.rst @@ -0,0 +1,42 @@ + +Reproducibility +=============== + +Completely reproducible results are not guaranteed across PyTorch releases, +individual commits or different platforms. Furthermore, results need not be +reproducible between CPU and GPU executions, even when using identical seeds. + +However, in order to make computations deterministic on your specific problem on +one specific platform and PyTorch release, there are a couple of steps to take. + +There are two pseudorandom number generators involved in PyTorch, which you will +need to seed manually to make runs reproducible. Furthermore, you should ensure +that all other libraries your code relies on and which use random numbers also +use a fixed seed. + +PyTorch +....... +You can use :meth:`torch.manual_seed()` to seed the RNG for all devices (both +CPU and CUDA):: + + import torch + torch.manual_seed(0) + + +CuDNN +..... +When running on the CuDNN backend, one further option must be set:: + + torch.backends.cudnn.deterministic = True + +.. warning:: + + Deterministic mode can have a performance impact, depending on your model. + +Numpy +..... +If you or any of the libraries you are using rely on Numpy, you should seed the +Numpy RNG as well. This can be done with:: + + import numpy as np + np.random.seed(0) From a11ebfa195db22268e9db562a6295752f55bec4a Mon Sep 17 00:00:00 2001 From: Tongliang Liao Date: Tue, 11 Sep 2018 21:04:10 -0700 Subject: [PATCH 065/237] Add explicit "this->" for nvcc. 
(#11196) Summary: Fix #11195 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11196 Differential Revision: D9737625 Pulled By: ezyang fbshipit-source-id: fb62076f005bd619eba53c0ed3f07683633f6d91 --- caffe2/operators/utility_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index 634281428bb898..c661ec2f5c55ec 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -325,7 +325,7 @@ class WeightedSumOp : public Operator { template bool DoRunWithType() { - const int input_size = InputSize(); + const int input_size = this->InputSize(); CAFFE_ENFORCE_EQ(input_size % 2, 0); const auto& X0 = Input(0); const auto& weight0 = Input(1); From 1a246c9c7eea7dfe8391645540053badab59747e Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 11 Sep 2018 21:07:03 -0700 Subject: [PATCH 066/237] guard spurious cudnn.h include (#11562) Summary: This fixes the build when CuDNN was not found on the system. From the `git blame`, it looks like the bug has been around for 2 years :) Pull Request resolved: https://github.com/pytorch/pytorch/pull/11562 Differential Revision: D9784589 Pulled By: soumith fbshipit-source-id: b33153436dced0a503c9833cdf52f7093f3394b4 --- caffe2/core/common_cudnn.h | 4 ++-- caffe2/python/pybind_state_gpu.cc | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index a8f2808e30eeab..2bbbce7df1e4ef 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -4,8 +4,6 @@ #include #include -#include - #include "caffe2/core/common.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" @@ -16,6 +14,8 @@ #error("This Caffe2 install is not built with cudnn, so you should not include this file."); #endif +#include + static_assert( CUDNN_VERSION >= 5000, "Caffe2 requires cudnn version 5.0 or above."); diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc index 9c15bc2145d5a9..8c547cf8eccca6 100644 --- a/caffe2/python/pybind_state_gpu.cc +++ b/caffe2/python/pybind_state_gpu.cc @@ -10,7 +10,9 @@ #include #include +#ifdef CAFFE2_USE_CUDNN #include "caffe2/core/common_cudnn.h" +#endif // CAFFE2_USE_CUDNN #include "caffe2/core/context_gpu.h" #include "caffe2/operators/operator_fallback_gpu.h" #include "caffe2/python/pybind_state_registry.h" @@ -39,10 +41,12 @@ namespace py = pybind11; void addCUDAGlobalMethods(py::module& m) { m.def("num_cuda_devices", &NumCudaDevices); m.def("get_cuda_version", &CudaVersion); +#ifdef CAFFE2_USE_CUDNN m.def("get_cudnn_version", &cudnnCompiledVersion); m.attr("cudnn_convolution_fwd_algo_count") = py::int_((int) CUDNN_CONVOLUTION_FWD_ALGO_COUNT); m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT); m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT); +#endif m.def("get_cuda_peer_access_pattern", []() { std::vector> pattern; CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern)); From a00fa2c6145685383788e5e3b08c87eadcd69ff2 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 21:47:46 -0700 Subject: [PATCH 067/237] Release GIL when calling into JIT interpreter Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11541 Differential Revision: D9777909 Pulled By: apaszke fbshipit-source-id: d0217e203721262f3f131b54ea78f898df0b54ec --- torch/csrc/jit/init.cpp | 6 +++++- 
torch/csrc/jit/pybind_utils.h | 6 +++++- torch/csrc/jit/python_interpreter.cpp | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index ce472707319fbc..0d607c3c364a80 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -1,4 +1,5 @@ #include "torch/csrc/utils/pybind.h" +#include "torch/csrc/utils/auto_gil.h" #include "torch/csrc/jit/python_tracer.h" #include "torch/csrc/jit/tracer.h" @@ -206,7 +207,10 @@ void initJITBindings(PyObject *module) { .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { const auto & graph = ge.graph(); auto stack = evilDeprecatedBadCreateStackDoNotUse(args, graph->inputs()); - ge.run(stack); + { + AutoNoGIL no_gil_guard; + ge.run(stack); + } return createPyObjectForStack(std::move(stack)); }); diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index b9f9e6bb85fd4e..54414534b06215 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -7,6 +7,7 @@ #include "torch/csrc/jit/type.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/utils/pybind.h" +#include "torch/csrc/utils/auto_gil.h" #include @@ -279,7 +280,10 @@ inline py::object invokeScriptMethodFromPython( script::Method& method, py::args args, py::kwargs kwargs) { auto stack = createStackForSchema(method.getSchema(), std::move(args), std::move(kwargs)); - method.run(stack); + { + AutoNoGIL no_gil_guard; + method.run(stack); + } return createPyObjectForStack(std::move(stack)); } diff --git a/torch/csrc/jit/python_interpreter.cpp b/torch/csrc/jit/python_interpreter.cpp index 5cc4b70b2e9637..85f0c37c4ea7e3 100644 --- a/torch/csrc/jit/python_interpreter.cpp +++ b/torch/csrc/jit/python_interpreter.cpp @@ -28,6 +28,7 @@ namespace torch { namespace jit { namespace { Operation createPythonOperation(Node* op_) { + AutoGIL gil; PythonOp* op = static_cast(op_); py::function func = py::reinterpret_borrow(py::handle(op->pyobj.get())); size_t num_inputs = 0; From 62c9d4ac96182d08c0f17813564e54bc8afb9c06 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 21:48:22 -0700 Subject: [PATCH 068/237] Make .to() methods native functions (to fix JIT tracing) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11491 Differential Revision: D9771121 Pulled By: apaszke fbshipit-source-id: 08d11101fb12093f8cf913b06359adddf3af9da7 --- aten/src/ATen/native/TensorConversions.cpp | 51 ++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 16 +++++ aten/src/ATen/templates/Tensor.h | 6 -- aten/src/ATen/templates/TensorMethods.h | 39 ------------ test/test_jit.py | 28 +++++++++ tools/autograd/gen_python_functions.py | 2 +- tools/autograd/templates/VariableType.h | 1 + .../templates/python_variable_methods.cpp | 60 ++++++++++--------- tools/jit/gen_jit_dispatch.py | 2 +- torch/csrc/jit/tracer.cpp | 22 ++++--- torch/csrc/jit/tracer.h | 4 +- torch/onnx/symbolic.py | 18 ++++++ 12 files changed, 167 insertions(+), 82 deletions(-) create mode 100644 aten/src/ATen/native/TensorConversions.cpp diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp new file mode 100644 index 00000000000000..9605736ee112fe --- /dev/null +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -0,0 +1,51 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +namespace at { +namespace native { + +static void ensure_has_index(Device* device) { + if (!device->is_cuda() || device->has_index()) { + return; + } + 
device->set_index(at::current_device()); +} + +static Tensor to_impl(const Tensor& self, const TensorOptions& options, bool non_blocking) { + return self.type().toBackend(options.backend()).toScalarType(options.dtype()) + .copy(self, non_blocking, options.device()); +} + +Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device && self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().device(device).dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking) { + if (self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, Device device, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device) { + return self; + } + return to_impl(self, self.options().device(device), non_blocking); +} + +Tensor to(const Tensor& self, const Tensor& other, bool non_blocking) { + auto self_options = self.options(); + auto options = other.options(); + if (self_options == options) { + return self; + } + return to_impl(self, options, non_blocking); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 44fea84527acc2..024c33a19c31dc 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2030,6 +2030,22 @@ variants: function, method device_guard: False +- func: to(Tensor self, Device device, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Device device, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Tensor other, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + - func: meshgrid(TensorList tensors) -> TensorList # This has a method dispatch to work around circular include problems diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 50e280be8f2fe9..aab8acd86a6ba5 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -154,12 +154,6 @@ struct AT_API Tensor { Tensor toType(ScalarType t) const; Tensor toBackend(Backend b) const; - /// New-style `to()` methods. - /// NB: These methods are defined in TensorOptions.h. - Tensor to(Device device, ScalarType dtype, bool non_blocking = false) const; - Tensor to(ScalarType dtype, bool non_blocking = false) const; - Tensor to(Device device, bool non_blocking = false) const; - /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. /// Defined in Type.h because of include order issues. bool is_variable() const noexcept; diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 5573f5802809e9..ec86db721b9f04 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -43,45 +43,6 @@ inline TensorOptions Tensor::options() const { .is_variable(is_variable()); } -namespace detail { -inline Tensor to( - const Tensor& tensor, - const TensorOptions& options, - bool non_blocking) { - // Don't copy if the options match. 
- if (tensor.options() == options) { - return tensor; - } - AT_CHECK(tensor.is_variable() == options.is_variable(), - "cannot change is_variable, from: ", tensor.is_variable(), - " to: ", options.is_variable()); - return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()) - .copy(tensor, non_blocking, options.device()); -} -} // namespace detail - -inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) - const { - if (this->device() == device && this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().device(device).dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { - if (this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(Device device, bool non_blocking) const { - if (this->device() == device) { - return *this; - } - return detail::to(*this, options().device(device), non_blocking); -} - inline void Tensor::backward( at::optional gradient, bool keep_graph, diff --git a/test/test_jit.py b/test/test_jit.py index 68aaf4df5d3afd..69347e2d67b111 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1081,6 +1081,34 @@ def test_trace_size(self): def test_trace_size_with_grad(self): self.do_trace_size(True) + def test_trace_casts(self): + casts = [ + lambda x: x.byte(), + lambda x: x.float(), + lambda x: x.cpu(), + lambda x: x.to(device='cpu'), + lambda x: x.to(dtype=torch.int64), + lambda x: x.to(device='cpu', dtype=torch.float), + lambda x: x.to(x) + ] + + def assertContainsCast(trace): + self.assertEqual(sum(n.kind() == 'aten::to' for n in trace.graph.nodes()), 1) + + for cast in casts: + trace = torch.jit.trace(cast, torch.randn(2, 2)) + assertContainsCast(trace) + x = torch.randn(2, 2) + self.assertEqual(trace(x), cast(x)) + + def to_tensor(x, y): + return x.to(y) + + to_tensor_trace = torch.jit.trace(to_tensor, (torch.randn(2, 2), torch.randn(1, 8))) + assertContainsCast(to_tensor_trace) + x, y = torch.randn(2, 2), torch.randn(1, 10) + self.assertEqual(to_tensor_trace(x, y), to_tensor(x, y)) + def test_trace_warn(self): def fn(x): int(x) # Warning 1. diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b3ea70aa87222e..8712c7c6ca2d43 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -28,7 +28,7 @@ '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', 'randint(_out)?', '_local_scalar', '_local_scalar_dense', - 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear' + 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to' ] # These function signatures are not exposed to Python. 
Note that this signature diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index b9d84ad85672a5..d1e847a141aace 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -29,6 +29,7 @@ using at::TensorList; using at::Type; using at::ScalarType; using at::optional; +using at::Device; void register_variable_type_for(at::Type* baseType); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index a863a18e04bdf3..4f0bc94199af98 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -230,13 +230,26 @@ static PyObject * THPVariable_invert(PyObject* self, PyObject* args) { END_HANDLE_TH_ERRORS } +static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking) { + AutoNoGIL no_gil; + return self.to(device, non_blocking); +} + +static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking) { + AutoNoGIL no_gil; + return self.to(dtype, non_blocking); +} + +static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking) { + AutoNoGIL no_gil; + return self.to(device, dtype, non_blocking); +} + static PyObject * THPVariable_cpu(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS auto& self_ = reinterpret_cast(self)->cdata; - auto backend = self_.is_sparse() ? Backend::SparseCPU : Backend::CPU; - auto& type = self_.type().toBackend(backend); - return wrap(torch::utils::dispatch_type_conversion(self_, type)); + return THPVariable_Wrap(dispatch_to(self_, at::Device(at::DeviceType::CPU), false)); END_HANDLE_TH_ERRORS } @@ -250,25 +263,17 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa auto& self_ = reinterpret_cast(self)->cdata; ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); - auto backend = self_.is_sparse() ? at::Backend::SparseCUDA : at::Backend::CUDA; - auto& type = self_.type().toBackend(backend); - auto device_obj = r.device(0); - if (!r.isNone(0) && device_obj.is_cpu()) { - throw std::runtime_error("Invalid device, must be cuda device"); - } - int32_t device_index = -1; - if (device_obj.has_index() && device_obj.is_cuda()) { - device_index = device_obj.index(); - } - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, device_index, r.toBool(1))); + auto device = r.isNone(0) ? 
at::Device(at::DeviceType::CUDA) : r.device(0); + AT_CHECK(device.is_cuda(), "Invalid device, must be cuda device"); + torch::utils::cuda_lazy_init(); + return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1))); END_HANDLE_TH_ERRORS } static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType) { HANDLE_TH_ERRORS auto& self_ = reinterpret_cast(self)->cdata; - auto& type = self_.type().toScalarType(scalarType); - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type)); + return THPVariable_Wrap(dispatch_to(self_, scalarType, false)); END_HANDLE_TH_ERRORS } static PyObject * THPVariable_byte(PyObject* self, PyObject* args) { @@ -497,18 +502,19 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg auto& device = std::get<0>(parsed); auto& scalarType = std::get<1>(parsed); auto non_blocking = std::get<2>(parsed); - if (!device) { - // device not given - auto& self_ = reinterpret_cast(self)->cdata; - auto& type = self_.type().toScalarType(scalarType.value_or(self_.type().scalarType())); - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type)); + auto& self_ = reinterpret_cast(self)->cdata; + if (device && device->is_cuda()) { + torch::utils::cuda_lazy_init(); + } + if (!device && !scalarType) { + Py_INCREF(self); + return self; + } else if (!device) { + return THPVariable_Wrap(dispatch_to(self_, *scalarType, non_blocking)); + } else if (!scalarType) { + return THPVariable_Wrap(dispatch_to(self_, *device, non_blocking)); } else { - // device and maybe dtype are given - auto& self_ = reinterpret_cast(self)->cdata; - auto& layout = *torch::getLayout(self_.type().backend()); - auto& type = torch::getVariableType(scalarType.value_or(self_.type().scalarType()), layout, device->type()); - const int32_t device_index = type.is_cuda() ? 
device->index() : -1; - return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, device_index, non_blocking)); + return THPVariable_Wrap(dispatch_to(self_, *device, *scalarType, non_blocking)); } Py_RETURN_NONE; END_HANDLE_TH_ERRORS diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 67bba43913ef16..b7326e526baa86 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -124,7 +124,7 @@ def is_magic_method(api_name): return api_name.startswith('__') and api_name.endswith('__') -blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'void*'} +blacklisted_types = {'SparseTensorRef', 'Storage', 'void*'} default_only_types = {'Generator'} diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 0f8d65dcc5873a..da110dd18d3fed 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -50,7 +50,18 @@ void addInputs(Node *n, const char * name, at::Generator * value) { Value * undef_gen = g->insertNode(g->createNoneGenerator())->output(); n->addInput(undef_gen); } -void addInputs(Node *n, const char * name, at::ScalarType value) { detail::badArgType(value); } +void addInputs(Node *n, const char * name, at::Device value) { + std::vector device = { + static_cast(value.type()), + static_cast(value.index())}; + detail::genericAddInput(n, std::move(device)); +} +void addInputs(Node *n, const char * name, at::Layout value) { + detail::genericAddInput(n, static_cast(value)); +} +void addInputs(Node *n, const char * name, at::ScalarType value) { + detail::genericAddInput(n, static_cast(value)); +} void addInputs(Node *n, const char * name, at::TensorList value) { Graph *g = n->owningGraph(); @@ -60,12 +71,9 @@ void addInputs(Node *n, const char * name, at::TensorList value) { void addInputs(Node* n, const char * name, const at::TensorOptions& options) { // [TensorOptions in script] - update this when you change how we schematize TensorOptions - detail::genericAddInput(n, static_cast(options.dtype())); - detail::genericAddInput(n, static_cast(options.layout())); - std::vector device = { - static_cast(options.device().type()), - static_cast(options.device().index())}; - detail::genericAddInput(n, std::move(device)); + addInputs(n, name, options.dtype()); + addInputs(n, name, options.layout()); + addInputs(n, name, options.device()); } void addInputs(Node *n, const char * name, at::IntList value) { diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 37cd9b332923ee..710aac85223044 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -172,8 +172,10 @@ TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& val TORCH_API void addInputs(Node *n, const char * name, const std::string& value); TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); -TORCH_API void addInputs(Node *n, const char * name, at::Generator * value); +TORCH_API void addInputs(Node *n, const char * name, at::Device value); +TORCH_API void addInputs(Node *n, const char * name, at::Layout value); TORCH_API void addInputs(Node *n, const char * name, at::ScalarType value); +TORCH_API void addInputs(Node *n, const char * name, at::Generator * value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 03e93125f40755..53d4f769a4e56f 100644 --- a/torch/onnx/symbolic.py +++ 
b/torch/onnx/symbolic.py @@ -991,6 +991,24 @@ def topk(g, self, k, dim, largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) + +def to(g, self, *args): + # ONNX doesn't have a concept of a device, so we ignore device casts + if len(args) == 2: + if args[0].type().isSubtypeOf(ListType.ofInts()): + # aten::to(Tensor, Device, bool) + return self + else: + # aten::to(Tensor, ScalarType, bool) + dtype = _get_const(args[0], 'i', 'dtype') + return g.op("Cast", self, to_i=scalar_type_to_onnx[dtype]) + elif len(args) == 3: + # aten::to(Tensor, Device, ScalarType, bool) + dtype = _get_const(args[1], 'i', 'dtype') + return g.op("Cast", self, to_i=scalar_type_to_onnx[dtype]) + else: + raise NotImplementedError("Unknown aten::to signature") + + def repeat(g, self, repeats): if not _is_value(repeats): repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) From 90e31f4896c2062b97f7a4efb73754b840ba5dc6 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 11 Sep 2018 21:55:22 -0700 Subject: [PATCH 069/237] Improve tracer warnings (#11545) Summary: Also, fix a performance bug in `ensureUnique`. Previously it formatted the warning string even though we weren't tracing, so all that work would *always* happen in the hot path and be for nothing. A sample of what the new warnings look like: ``` tmp.py:4: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! int(x) tmp.py:5: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. torch.tensor([1.]) tmp.py:6: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator add_. This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory, but are disjoint (e.g. are outputs of torch.split), this might still be safe. 
torch.split(y, 2, dim=1)[0].add_(2) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11545 Differential Revision: D9782975 Pulled By: apaszke fbshipit-source-id: 5b3abd31366e59c69e0b7ff278042b5563deb5a9 --- .../templates/python_torch_functions.cpp | 8 ++--- .../templates/python_variable_methods.cpp | 14 ++++----- torch/csrc/autograd/python_variable.cpp | 2 +- torch/csrc/jit/tracer.cpp | 29 +++++++++---------- torch/csrc/jit/tracer.h | 15 +++++++++- torch/csrc/jit/tracing_state.h | 8 +++-- torch/jit/__init__.py | 2 +- 7 files changed, 46 insertions(+), 32 deletions(-) diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index a7174b52e52b88..51ded6eb792f34 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -321,7 +321,7 @@ static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.as_tensor"); + jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR); return THPVariable_Wrap(torch::utils::as_tensor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -329,7 +329,7 @@ static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.from_numpy"); + jit::tracer::warn("torch.from_numpy", jit::tracer::WARN_CONSTRUCTOR); auto data = torch::utils::tensor_from_numpy(arg); return THPVariable_Wrap(make_variable(std::move(data), /*requires_grad=*/false)); END_HANDLE_TH_ERRORS @@ -354,7 +354,7 @@ static PyObject * THPVariable__promote_types(PyObject* self, PyObject* args, PyO static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.sparse_coo_tensor"); + jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR); return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } @@ -362,7 +362,7 @@ static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.tensor"); + jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR); return THPVariable_Wrap(torch::utils::tensor_ctor(default_type(), args, kwargs)); END_HANDLE_TH_ERRORS } diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 4f0bc94199af98..d92ad3dbf7688b 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -179,7 +179,7 @@ static int64_t dispatch_to_CLong(const Tensor & self) { static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python float"); + jit::tracer::warn("Converting a tensor to a Python float", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; return wrap(dispatch_to_CDouble(self_)); END_HANDLE_TH_ERRORS @@ -187,7 +187,7 @@ static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - 
jit::tracer::warn("Converting a tensor to a Python integer"); + jit::tracer::warn("Converting a tensor to a Python integer", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (isFloatingType(self_.type().scalarType())) { // we can't dispatch to toCLong here because we want to avoid ATen overflow checks; @@ -203,7 +203,7 @@ static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { // called when used as a slice. static PyObject * THPVariable_index_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python index"); + jit::tracer::warn("Converting a tensor to a Python index", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; // TODO: change the condition to `self_.dim() != 0` once we expose scalars // in PyTorch. @@ -320,7 +320,7 @@ static PyObject * THPVariable_element_size(PyObject* self, PyObject* args) static PyObject * THPVariable_numpy(PyObject* self, PyObject* arg) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a NumPy array"); + jit::tracer::warn("Converting a tensor to a NumPy array", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (self_.requires_grad()) { throw std::runtime_error( @@ -375,7 +375,7 @@ static PyObject * THPVariable_requires_grad_(PyObject* self, PyObject* args, PyO static PyObject * THPVariable_item(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python number"); + jit::tracer::warn("Converting a tensor to a Python number", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (self_.is_floating_point()) { return wrap(dispatch_to_CDouble(self_)); @@ -523,7 +523,7 @@ static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwarg static PyObject * THPVariable_tolist(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - jit::tracer::warn("Converting a tensor to a Python list"); + jit::tracer::warn("Converting a tensor to a Python list", jit::tracer::WARN_PYTHON_DATAFLOW); auto self_ = reinterpret_cast(self)->cdata; return torch::utils::tensor_to_list(self_.data()); END_HANDLE_TH_ERRORS @@ -570,7 +570,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa ${py_methods} static PyObject * THPVariable_bool(PyObject* self, PyObject* args) { - jit::tracer::warn("Converting a tensor to a Python boolean"); + jit::tracer::warn("Converting a tensor to a Python boolean", jit::tracer::WARN_PYTHON_DATAFLOW); return THPVariable_is_nonzero(self, args); } diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 5666da28135f1f..c3f9bd510a8289 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -126,7 +126,7 @@ static void THPVariable_dealloc(THPVariable* self) static PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - jit::tracer::warn("torch.Tensor"); + jit::tracer::warn("torch.Tensor", jit::tracer::WARN_CONSTRUCTOR); auto& default_type = torch::tensors::get_default_tensor_type(); auto tensor = torch::utils::legacy_tensor_ctor(default_type, args, kwargs); return THPVariable_NewWithVar(type, std::move(tensor)); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index da110dd18d3fed..c4e4948fa9248a 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -190,12 +190,22 @@ void 
setRecordSourceLocation(void (*v)(Node*)) { void defaultWarn(const std::string& str) { AT_WARN(str); } std::atomic warn_callback { defaultWarn }; -void _do_warn(const char * _reason) { +const char * WARN_PYTHON_DATAFLOW = + " might cause the trace to be incorrect. We can't record the data flow of " + "Python values, so this value will be treated as a constant in the future. " + "This means that the trace might not generalize to other inputs!"; +const char * WARN_CONSTRUCTOR = + " results are registered as constants in the trace. You can safely ignore this " + "warning if you use this function to create tensors out of constant variables " + "that would be the same every time you call this function. In any other case, " + "this might cause the trace to be incorrect."; + +// XXX: _kind can be a nullptr +void _do_warn(const char * _reason, const char * _kind) { std::string reason { _reason }; + std::string kind { _kind ? _kind : "" }; std::ostringstream s; - s << std::string(reason); - s << " might cause the trace to be incorrect. We can't record the data flow of " - " Python values, which means the trace might not generalize to other inputs."; + s << reason << kind; warn_callback.load()(s.str()); } @@ -203,15 +213,4 @@ void setWarn(warn_fn_type fn) { warn_callback.store(fn); } -void ensureUnique(const char * name, const at::Tensor& tensor) { - auto aliases = tensor.storage().use_count(); - if (aliases > 1) { - std::stringstream ss; - ss << "There are " << aliases - << " live references to the tensor being modified when tracing in-place operator " - << name << " which "; - warn(ss.str().c_str()); - } -} - }}} diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 710aac85223044..21a806d86a245e 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -182,7 +182,20 @@ void addInputs(Node *n, const char * name, std::array value) { throw std::runtime_error("Found an unsupported argument type in the JIT tracer. File a bug report."); } -TORCH_API void ensureUnique(const char * name, const at::Tensor& tensor); +inline void ensureUnique(const char * name, const at::Tensor& tensor) { + auto aliases = tensor.storage().use_count(); + if (isTracing() && aliases > 1) { + std::stringstream ss; + ss << "There are " << aliases + << " live references to the data region being modified when tracing in-place operator " + << name << ". This might cause the trace to be incorrect, because all other views " + << "that also reference this data will not not reflect this change in the trace! " + << "On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. 
" + << "are outputs of torch.split), this might still be safe."; + warn(ss.str().c_str()); + } +} + template < typename T, diff --git a/torch/csrc/jit/tracing_state.h b/torch/csrc/jit/tracing_state.h index 34808f0272e467..598c165650daa4 100644 --- a/torch/csrc/jit/tracing_state.h +++ b/torch/csrc/jit/tracing_state.h @@ -95,11 +95,13 @@ inline bool isTracing() { } using warn_fn_type = void (*)(const std::string& msg); -TORCH_API void _do_warn(const char * _reason); -inline void warn(const char * _reason) { +TORCH_API extern const char * WARN_PYTHON_DATAFLOW; +TORCH_API extern const char * WARN_CONSTRUCTOR; +TORCH_API void _do_warn(const char * _reason, const char * _kind); +inline void warn(const char * _reason, const char * _kind=nullptr) { if (auto state = getTracingState()) { if (!state->warn) return; - _do_warn(_reason); + _do_warn(_reason, _kind); } } TORCH_API void setWarn(warn_fn_type fn); diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index c31504ad83ab22..1397f0cd18af24 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -389,7 +389,7 @@ def maybe_warn_nondeterministic(): nondeterministic_ops_warning = "Trace had nondeterministic nodes. Nodes:\n" nondeterministic_ops_warning += "\n".join([indent(str(op)) for op in nondeterm_ops][:20]) nondeterministic_ops_warning += "\nThis may cause errors in trace checking. To disable trace checking,"\ - " pass disable_checks=True to torch.jit.trace()" + " pass check_trace=False to torch.jit.trace()" warnings.warn(nondeterministic_ops_warning, category=TracerWarning, stacklevel=5) def compare_outputs(original, reference, match_what): From 6dcdbd3a1d5772115aee822bec29c21ec790a442 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Tue, 11 Sep 2018 22:05:22 -0700 Subject: [PATCH 070/237] Make C10d support CPU only build (#11513) Summary: This makes torch.distributed works for CPU only build. Also added one more CI test case to cover MPI CPU build. 
All CI tests should cover this change Pull Request resolved: https://github.com/pytorch/pytorch/pull/11513 Differential Revision: D9784546 Pulled By: teng-li fbshipit-source-id: 0976a6b0fd199670926f0273e17ad7d2805e42e7 --- .jenkins/pytorch/build.sh | 2 +- setup.py | 11 ++-- torch/csrc/distributed/c10d/init.cpp | 2 + torch/lib/c10d/CMakeLists.txt | 82 +++++++++++++++++++--------- torch/lib/c10d/ProcessGroupGloo.cpp | 56 +++++++++++++++---- torch/lib/c10d/ProcessGroupGloo.hpp | 11 ++++ torch/lib/c10d/test/CMakeLists.txt | 20 ++++--- 7 files changed, 132 insertions(+), 52 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index d9e58d7e8b1604..7d491e0fccb7ef 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -11,7 +11,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 fi -if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]] || [[ "$BUILD_ENVIRONMENT" == *-trusty-py2.7.9* ]]; then # TODO: move this to Docker sudo apt-get update sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev diff --git a/setup.py b/setup.py index 1ed87cc7819cf7..03e7b745a83539 100644 --- a/setup.py +++ b/setup.py @@ -448,9 +448,7 @@ def check_file(f): if USE_DISTRIBUTED: if IS_LINUX: libs += ['gloo'] - # TODO: make c10d build without CUDA - if USE_CUDA: - libs += ['c10d'] + libs += ['c10d'] libs += ['THD'] build_libs(libs) @@ -624,7 +622,7 @@ def run(self): if USE_DISTRIBUTED: print('-- Building with THD distributed package ') monkey_patch_THD_link_flags() - if IS_LINUX and USE_CUDA: + if IS_LINUX: print('-- Building with c10d distributed package ') monkey_patch_C10D_inc_flags() else: @@ -964,10 +962,11 @@ def run(self): ] include_dirs += [tmp_install_path + "/include/THD"] main_link_args += [THD_LIB] - if IS_LINUX and USE_CUDA: + if IS_LINUX: extra_compile_args.append('-DUSE_C10D') main_sources.append('torch/csrc/distributed/c10d/init.cpp') - main_sources.append('torch/csrc/distributed/c10d/ddp.cpp') + if USE_CUDA: + main_sources.append('torch/csrc/distributed/c10d/ddp.cpp') main_link_args.append(C10D_LIB) if USE_CUDA: diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index bfe381baf62f1f..d4f9e0f07b1d23 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -375,6 +375,7 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); +#ifdef USE_CUDA module.def( "_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced, @@ -392,6 +393,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("broadcast_bucket_size"), py::arg("broadcast_buffers"), py::call_guard()); +#endif Py_RETURN_TRUE; } diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 9a09065bc4e380..c8afadc345d34f 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -14,6 +14,21 @@ if(NOT Caffe2_FOUND) message(FATAL_ERROR "Caffe2 not found") endif() +if(USE_CUDA) + find_package(CUDA) + if(CUDA_FOUND) + set(C10D_USE_CUDA true) + message(STATUS "Building C10D with CUDA support") + add_definitions(-DUSE_CUDA=1) + else() + set(C10D_USE_CUDA false) + message(STATUS "CUDA not found, building C10D 
without CUDA support") + endif() +else() + set(C10D_USE_CUDA false) + message(STATUS "Building C10D without CUDA support") +endif() + find_package(Gloo REQUIRED) if(Gloo_FOUND) message(STATUS "Gloo_LIBRARY: ${Gloo_LIBRARY}") @@ -32,27 +47,24 @@ else() message(STATUS "Not able to find MPI, will compile c10d without MPI support") endif() -find_package(NCCL) -if(NCCL_FOUND) - message(STATUS "NCCL_LIBRARIES: ${NCCL_LIBRARIES}") - message(STATUS "NCCL_INCLUDE_DIRS: ${NCCL_INCLUDE_DIRS}") - IF(NCCL_MAJOR_VERSION AND NOT (NCCL_MAJOR_VERSION LESS 2)) - message(STATUS "NCCL Version 2 or higher found, will " - "compile with NCCL distributed backend") - SET(DISTRIBUTED_NCCL_FOUND TRUE) +if(C10D_USE_CUDA) + find_package(NCCL) + if(NCCL_FOUND) + message(STATUS "NCCL_LIBRARIES: ${NCCL_LIBRARIES}") + message(STATUS "NCCL_INCLUDE_DIRS: ${NCCL_INCLUDE_DIRS}") + if(NCCL_MAJOR_VERSION AND NOT (NCCL_MAJOR_VERSION LESS 2)) + message(STATUS "NCCL Version 2 or higher found, will " + "compile with NCCL distributed backend") + set(DISTRIBUTED_NCCL_FOUND true) + else() + message(STATUS "Found NCCL, but the NCCL version is either not 2+ or not " + "determinable, will not compile with NCCL distributed " + "backend") + endif() else() - message(STATUS "Found NCCL, but the NCCL version is either not 2+ or not " - "determinable, will not compile with NCCL distributed " - "backend") + message(STATUS "Not able to find NCCL, will not " + "compile with NCCL distributed backend") endif() -else() - message(STATUS "Not able to find NCCL, will not " - "compile with NCCL distributed backend") -endif() - -find_package(CUDA REQUIRED) -if(NOT CUDA_FOUND) - message(FATAL_ERROR "CUDA not found") endif() function(copy_header file) @@ -68,7 +80,6 @@ endif() configure_file(cmake/Def.hpp.in ${CMAKE_BINARY_DIR}/include/c10d/Def.hpp @ONLY) set(C10D_SRCS - CUDAUtils.cpp FileStore.cpp ProcessGroup.cpp Store.cpp @@ -78,11 +89,21 @@ set(C10D_SRCS ProcessGroupGloo.cpp ) -set(C10D_LIBS - caffe2_gpu - ${Gloo_LIBRARY} - ${Gloo_NATIVE_LIBRARY} - ) +if(C10D_USE_CUDA) + list(APPEND C10D_SRCS CUDAUtils.cpp) + set(C10D_LIBS + caffe2_gpu + ${Gloo_LIBRARY} + ${Gloo_NATIVE_LIBRARY} + ) +else() + set(C10D_LIBS + caffe2 + ${Gloo_LIBRARY} + ${Gloo_NATIVE_LIBRARY} + ) +endif() + if(DISTRIBUTED_NCCL_FOUND) list(APPEND C10D_SRCS ProcessGroupNCCL.cpp) @@ -108,7 +129,11 @@ target_compile_options(c10d PUBLIC # c10d links to Caffe2/ATen, but the targets don't add TH/THC to the include path target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/TH) -target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/THC) + +if(C10D_USE_CUDA) + target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/THC) +endif() + # For target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..) # For torch/csrc/utils/hash.h and torch/csrc/utils/functional.h @@ -116,7 +141,10 @@ target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
# For target_include_directories(c10d PUBLIC ${GLOO_INCLUDE_DIR}) -copy_header(CUDAUtils.hpp) +if(C10D_USE_CUDA) + copy_header(CUDAUtils.hpp) +endif() + copy_header(FileStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index b7bf001be9f891..124b4d4a9d0cc4 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -3,15 +3,20 @@ #include #include #include + +#ifdef USE_CUDA #include #include #include +#endif + #include #include +#ifdef USE_CUDA #include - #include +#endif #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ @@ -94,6 +99,7 @@ const ::gloo::ReductionFunction* reductionFunction(const ReduceOp& r) { throw std::runtime_error("Unhandled ReduceOp"); } +#ifdef USE_CUDA std::vector getStreamVector(AlgorithmEntry& entry) { std::vector streams(entry.streams.size()); for (size_t i = 0; i < entry.streams.size(); i++) { @@ -124,10 +130,18 @@ void synchronizeStreams(THCState* thcState, AlgorithmEntry* entry) { C10D_CUDA_CHECK(cudaStreamWaitEvent(privateStream, event, 0)); } } +#endif } // namespace -ProcessGroupGloo::WorkGloo::WorkGloo() : completed_(false), cuda_(false) {} +ProcessGroupGloo::WorkGloo::WorkGloo() + : completed_(false) +#ifdef USE_CUDA + , + cuda_(false) +#endif +{ +} ProcessGroupGloo::WorkGloo::~WorkGloo() {} @@ -140,6 +154,7 @@ bool ProcessGroupGloo::WorkGloo::isSuccess() const { } void ProcessGroupGloo::WorkGloo::synchronize() { +#ifdef USE_CUDA if (cuda_) { auto thcState = ::at::globalContext().lazyInitCUDA(); for (size_t i = 0; i < devices_.size(); i++) { @@ -148,6 +163,7 @@ void ProcessGroupGloo::WorkGloo::synchronize() { C10D_CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); } } +#endif } bool ProcessGroupGloo::WorkGloo::wait() { @@ -170,8 +186,8 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { { std::unique_lock lock(m_); completed_ = true; +#ifdef USE_CUDA cuda_ = entry.key.type->is_cuda(); - // Populate devices and events so that we can later synchronize // with the operation associated with this work finishing. 
if (cuda_) { @@ -186,6 +202,7 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { C10D_CUDA_CHECK(cudaEventRecord(event, stream)); } } +#endif } cv_.notify_all(); } @@ -203,9 +220,7 @@ void ProcessGroupGloo::WorkGloo::finishWithException( ProcessGroupGloo::SendWork::SendWork( at::Tensor& tensor, std::unique_ptr<::gloo::transport::UnboundBuffer> buffer) - : tensor_(tensor), - buffer_(std::move(buffer)) { -} + : tensor_(tensor), buffer_(std::move(buffer)) {} bool ProcessGroupGloo::SendWork::isCompleted() const { // No way to poll for completion yet @@ -235,10 +250,7 @@ ProcessGroupGloo::RecvWork::RecvWork( at::Tensor& tensor, std::unique_ptr<::gloo::transport::UnboundBuffer> buffer, int* srcRank) - : tensor_(tensor), - buffer_(std::move(buffer)), - srcRank_(srcRank) { -} + : tensor_(tensor), buffer_(std::move(buffer)), srcRank_(srcRank) {} bool ProcessGroupGloo::RecvWork::isCompleted() const { // No way to poll for completion yet @@ -295,7 +307,9 @@ ProcessGroupGloo::ProcessGroupGloo( threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this); } +#ifdef USE_CUDA thcState_ = ::at::globalContext().lazyInitCUDA(); +#endif } ProcessGroupGloo::~ProcessGroupGloo() { @@ -405,6 +419,7 @@ void ProcessGroupGloo::createAllreduce(AlgorithmEntry& entry) { return; } +#ifdef USE_CUDA if (backend == at::Backend::CUDA) { if (getSize() < 16) { entry.algorithm = std::unique_ptr<::gloo::Algorithm>( @@ -423,6 +438,7 @@ void ProcessGroupGloo::createAllreduce(AlgorithmEntry& entry) { } return; } +#endif throw std::runtime_error( "Unhandled backend: " + std::string(at::toString(backend))); @@ -448,6 +464,7 @@ void ProcessGroupGloo::createBroadcast(AlgorithmEntry& entry) { return; } +#ifdef USE_CUDA if (backend == at::Backend::CUDA) { entry.algorithm = std::unique_ptr<::gloo::Algorithm>(new ::gloo::CudaBroadcastOneToAll( @@ -459,6 +476,7 @@ void ProcessGroupGloo::createBroadcast(AlgorithmEntry& entry) { getStreamVector(entry))); return; } +#endif throw std::runtime_error( "Unhandled backend: " + std::string(at::toString(backend))); @@ -483,10 +501,18 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { auto& srcSizes = key.srcSizes; entry->src.resize(srcSizes.size()); for (size_t i = 0; i < srcSizes.size(); i++) { +#ifdef USE_CUDA deviceGuard.set_index(key.type->is_cuda() ? key.devices[i] : -1); +#else + if (key.type->is_cuda()) { + throw std::runtime_error("ProcessGroupGloo is not built with CUDA"); + } + deviceGuard.set_index(-1); +#endif entry->src[i] = key.type->tensor(srcSizes[i]); } +#ifdef USE_CUDA // If these are CUDA tensors, create streams and events if (key.type->is_cuda()) { entry->streams.resize(key.devices.size()); @@ -497,6 +523,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { entry->events[i] = CUDAEvent::create(); } } +#endif return entry; } @@ -561,6 +588,7 @@ std::shared_ptr ProcessGroupGloo::broadcast( entry->src[opts.rootTensor].copy_(tensors[opts.rootTensor]); } +#ifdef USE_CUDA // In case of CUDA, ensure that operations that are queued after // this collective wait for the collective to complete. 
if (key.type->is_cuda()) { @@ -576,13 +604,16 @@ std::shared_ptr ProcessGroupGloo::broadcast( } }; } else { +#endif entry->run = [=]() mutable { entry->algorithm->run(); for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; +#ifdef USE_CUDA } +#endif return enqueue(entry); } @@ -607,6 +638,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( entry->src[i].copy_(tensors[i]); } +#ifdef USE_CUDA // In case of CUDA, ensure that operations that are queued after // this collective wait for the collective to complete. if (key.type->is_cuda()) { @@ -622,14 +654,16 @@ std::shared_ptr ProcessGroupGloo::allreduce( } }; } else { +#endif entry->run = [=]() mutable { entry->algorithm->run(); for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; +#ifdef USE_CUDA } - +#endif return enqueue(entry); } diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 9d5be5d200d3d1..0fe69250b31528 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -15,14 +15,19 @@ #include +#ifdef USE_CUDA #include +#endif + #include #include #include #include +#ifdef USE_CUDA // Forward declaration struct THCState; +#endif namespace c10d { @@ -95,6 +100,7 @@ struct AlgorithmEntry { std::vector dst; std::function run; +#ifdef USE_CUDA // For CUDA tensors, the following happens: // // - Input tensor A is copied to persistent tensor B on the stream @@ -120,6 +126,7 @@ struct AlgorithmEntry { // std::vector streams; std::vector events; +#endif // Used to synchronize between calling thread and worker threads. std::mutex m; @@ -189,6 +196,7 @@ class ProcessGroupGloo : public ProcessGroup { // is probably cheaper (this is highly speculative). std::unique_ptr<::gloo::Exception> ex_; +#ifdef USE_CUDA // List of devices and events so that we can synchronize the // streams of the caller with the kernels that were launched // asynchronously to finish this operation. @@ -208,6 +216,7 @@ class ProcessGroupGloo : public ProcessGroup { bool cuda_; std::vector devices_; std::vector events_; +#endif friend class ProcessGroupGloo; }; @@ -378,8 +387,10 @@ class ProcessGroupGloo : public ProcessGroup { std::condition_variable queueProduceCV_; std::condition_variable queueConsumeCV_; +#ifdef USE_CUDA // Store copy of pointer to THCState retrieved from ::at::globalContext(). 
THCState* thcState_; +#endif }; } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 6bcb2b5d625fb0..caa3d2d056c83c 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -1,5 +1,7 @@ -cuda_add_library(c10d_cuda_test CUDATest.cu) -target_link_libraries(c10d_cuda_test c10d) +if(C10D_USE_CUDA) + cuda_add_library(c10d_cuda_test CUDATest.cu) + target_link_libraries(c10d_cuda_test c10d) +endif() function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) @@ -11,12 +13,16 @@ endfunction() c10d_add_test(FileStoreTest.cpp c10d) c10d_add_test(TCPStoreTest.cpp c10d) -c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test) -c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test) + +if(C10D_USE_CUDA) + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test) + c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test) + if(DISTRIBUTED_NCCL_FOUND) + c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test) + endif() +endif() + if(MPI_FOUND) add_definitions(-DMPIEXEC=${MPIEXEC}) c10d_add_test(ProcessGroupMPITest.cpp c10d) endif() -if(DISTRIBUTED_NCCL_FOUND) - c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test) -endif() From 3e3d8caecd2986aafe46d131a0e25210e936a3ff Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 11 Sep 2018 23:03:46 -0700 Subject: [PATCH 071/237] Allow setting deletion constant Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11529 Differential Revision: D9775398 Pulled By: goldsborough fbshipit-source-id: 8593d1afcf8be3150dcc4a58433f53307e3ae665 --- torch/csrc/autograd/function.cpp | 8 ++++---- torch/csrc/autograd/function.h | 2 ++ torch/csrc/autograd/init.cpp | 6 ++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index e077fadcb8214d..ba33a6f116bac2 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -79,9 +79,9 @@ thread_local size_t deleteFunctionRecursionDepth = 0; * times have gotten the following numbers: ~8300, 3669 */ #ifdef _WIN32 -constexpr size_t kDeleteFunctionMaxRecursionDepth = 3000; +size_t deleteFunctionMaxRecursionDepth = 3000; #else -constexpr size_t kDeleteFunctionMaxRecursionDepth = 10000; +size_t deleteFunctionMaxRecursionDepth = 10000; #endif struct RecursionDepthCounter { @@ -107,7 +107,7 @@ struct RecursionDepthCounter { void deleteFunction(Function* function) { RecursionDepthCounter recursion_depth; - if (recursion_depth.value() > kDeleteFunctionMaxRecursionDepth) { + if (recursion_depth.value() > deleteFunctionMaxRecursionDepth) { deleteFunctionQueue.push_back(function); return; } @@ -117,7 +117,7 @@ void deleteFunction(Function* function) { if (deleteFunctionQueue.empty()) { return; } - if (recursion_depth.value() != kDeleteFunctionMaxRecursionDepth) { + if (recursion_depth.value() != deleteFunctionMaxRecursionDepth) { AT_ERROR("Only one deleter per thread should be able to process " "the delete queue. Please open an issue."); } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b4c90b1489a261..dfe8683197f8ee 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -33,6 +33,8 @@ using edge_list = std::vector; using saved_variable_list = std::vector; using IndexRange = std::pair; +TORCH_API extern size_t deleteFunctionMaxRecursionDepth; + // Custom deleter to prevent stack overflows. 
void deleteFunction(Function* function); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index ca1575699cf08f..b306426025e025 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -5,6 +5,7 @@ #include "torch/csrc/autograd/grad_mode.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/python_function.h" +#include "torch/csrc/autograd/function.h" PyObject * THPAutograd_initExtension(PyObject *_unused) { @@ -49,6 +50,11 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); + /// TODO: Replace this ASAP with a better solution for deep autograd graphs! + m.def("_unsafe_set_delete_function_max_recursion_depth", [](size_t value) { + torch::autograd::deleteFunctionMaxRecursionDepth = value; + }); + Py_RETURN_TRUE; } From 659777984783700c64fd73b60cbc6fa91a0a5f23 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Tue, 11 Sep 2018 23:32:44 -0700 Subject: [PATCH 072/237] Clean up some C++ cruftiness in the script lexer. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11408 Differential Revision: D9772843 Pulled By: resistor fbshipit-source-id: 07f16bf7eaf4f1d8700e46e91a485de4b2d9ed83 --- torch/csrc/jit/script/lexer.cpp | 42 +++++++++++++++ torch/csrc/jit/script/lexer.h | 90 ++++++++++----------------------- 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/torch/csrc/jit/script/lexer.cpp b/torch/csrc/jit/script/lexer.cpp index 55e63df128a4d8..d5410c39b84455 100644 --- a/torch/csrc/jit/script/lexer.cpp +++ b/torch/csrc/jit/script/lexer.cpp @@ -10,6 +10,48 @@ namespace torch { namespace jit { namespace script { +static const std::unordered_map binary_prec = { + {TK_IF, 1}, + {TK_AND, 2}, + {TK_OR, 2}, + // reserve a level for unary not + {'<', 4}, + {'>', 4}, + {TK_EQ, 4}, + {TK_LE, 4}, + {TK_GE, 4}, + {TK_NE, 4}, + {'+', 5}, + {'-', 5}, + {'*', 6}, + {'/', 6}, + {'@', 6}, + {TK_POW, 7}, +}; + +static const std::unordered_map unary_prec = { + {TK_NOT, 3}, + {'-', 8}, + {'*', 8}, +}; + +bool SharedParserData::isUnary(int kind, int* prec) { + auto it = unary_prec.find(kind); + if (it != unary_prec.end()) { + *prec = it->second; + return true; + } + return false; +} +bool SharedParserData::isBinary(int kind, int* prec) { + auto it = binary_prec.find(kind); + if (it != binary_prec.end()) { + *prec = it->second; + return true; + } + return false; +} + int stringToKind(std::string str) { static std::once_flag init_flag; static std::unordered_map str_to_kind; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 45b7061c3bf3d8..617addf486c736 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -8,7 +8,7 @@ #include #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/source_range.h" - +#include namespace torch { namespace jit { @@ -111,34 +111,28 @@ struct TokenTrie { kind = tok; return; } - auto& entry = children[*str]; - if (entry == nullptr) { - entry.reset(new TokenTrie()); + + for (size_t i = 0, e = child_chars.size(); i < e; ++i) { + if (child_chars[i] == *str) { + child_tries[i]->insert(str + 1, tok); + return; + } } - entry->insert(str + 1, tok); + + child_chars.emplace_back(*str); + child_tries.emplace_back(torch::make_unique()); + child_tries.back()->insert(str + 1, tok); } int kind; // 0 == invalid token - std::unordered_map children; + + std::vector child_chars; + std::vector child_tries; }; // stuff that is shared against all TC lexers/parsers 
and is initialized only // once. struct SharedParserData { SharedParserData() : head(new TokenTrie()) { - // listed in increasing order of precedence - std::vector> binary_ops = { - {TK_IF}, - {TK_AND, TK_OR}, - {}, // reserve a level for unary not - {'<', '>', TK_EQ, TK_LE, TK_GE, TK_NE}, - {'+', '-'}, - {'*', '/', '@'}, - {TK_POW}, - }; - std::vector> unary_ops = { - {'-', '*'}, - }; - std::stringstream ss; for (const char* c = valid_single_char_tokens; *c; c++) { std::string str(1, *c); @@ -151,26 +145,6 @@ struct SharedParserData { } TC_FORALL_TOKEN_KINDS(ADD_CASE) #undef ADD_CASE - - // precedence starts at 1 so that there is always a 0 precedence - // less than any other precedence - int prec = 1; - for (auto& group : binary_ops) { - for (auto& element : group) { - binary_prec[element] = prec; - } - prec++; - } - // unary ops - for (auto& group : unary_ops) { - for (auto& element : group) { - unary_prec[element] = prec; - } - prec++; - } - // add unary not separately because it slots into the precedence of - // binary operators - unary_prec[TK_NOT] = binary_prec[TK_AND] + 1; } // 1. skip whitespace // 2. handle comment or newline @@ -195,7 +169,7 @@ struct SharedParserData { return start + len <= str.size() && std::count(str.begin() + start, str.begin() + start + len, c) == len; } - // python conconcatenates all adjacent strings "a" "b" == "ab" + // python concatenates all adjacent strings "a" "b" == "ab" // strings can be enclosed with 1 or 3 single or double quotes // if enclosed with 3 quotes newlines are valid // as elsewhere, backslash and new line should be ignored @@ -314,8 +288,16 @@ struct SharedParserData { // rather the // identifier 'max' if (cur) { - auto it = cur->children.find(str[pos + i]); - cur = (it == cur->children.end()) ? nullptr : it->second.get(); + size_t child_offset = 0; + for (size_t e = cur->child_chars.size(); child_offset < e; ++child_offset) { + if (cur->child_chars[child_offset] == str[pos + i]) + break; + } + + cur = (child_offset == cur->child_chars.size()) + ? 
nullptr + : cur->child_tries[child_offset].get(); + if (cur && cur->kind != 0) { matched = true; *len = i + 1; @@ -325,22 +307,8 @@ struct SharedParserData { } return matched; } - bool isUnary(int kind, int* prec) { - auto it = unary_prec.find(kind); - if (it != unary_prec.end()) { - *prec = it->second; - return true; - } - return false; - } - bool isBinary(int kind, int* prec) { - auto it = binary_prec.find(kind); - if (it != binary_prec.end()) { - *prec = it->second; - return true; - } - return false; - } + bool isUnary(int kind, int* prec); + bool isBinary(int kind, int* prec); bool isRightAssociative(int kind) { switch (kind) { case '?': @@ -356,10 +324,6 @@ struct SharedParserData { return isalpha(n) || n == '_' || (i > 0 && isdigit(n)); } TokenTrieRef head; - std::unordered_map - unary_prec; // map from token to its unary precedence - std::unordered_map - binary_prec; // map from token to its binary precedence }; SharedParserData& sharedParserData(); From 76070fe73c5cce61cb9554990079594f83384629 Mon Sep 17 00:00:00 2001 From: Teng Li Date: Wed, 12 Sep 2018 01:27:18 -0700 Subject: [PATCH 073/237] Make c10d test work on CPU only build (#11567) Summary: Make test work with CPU only build, also fixed the test failures for a long time Pull Request resolved: https://github.com/pytorch/pytorch/pull/11567 Differential Revision: D9785740 Pulled By: teng-li fbshipit-source-id: 61c43b758c1ee53117e30de8074583e6faea863a --- torch/lib/c10d/test/CMakeLists.txt | 2 ++ torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp | 3 +++ torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 13 +++++++++++++ 3 files changed, 18 insertions(+) diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index caa3d2d056c83c..d7a8643e16db1a 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -20,6 +20,8 @@ if(C10D_USE_CUDA) if(DISTRIBUTED_NCCL_FOUND) c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test) endif() +else() + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d) endif() if(MPI_FOUND) diff --git a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp index 7e7062b9ce5401..74e96acc15152b 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp @@ -52,6 +52,8 @@ class AsyncTest { // Use tiny timeout to make this test run fast ::c10d::ProcessGroupGloo::Options options; options.timeout = std::chrono::milliseconds(50); + ::gloo::transport::tcp::attr attr; + options.devices.push_back(::gloo::transport::tcp::CreateDevice(attr)); pg_ = std::unique_ptr<::c10d::ProcessGroupGloo>( new ::c10d::ProcessGroupGloo(store, rank, size, options)); @@ -262,4 +264,5 @@ int main(int argc, char** argv) { TemporaryFile file; runAsyncBroadcastTest(file.path, 4, 1); } + std::cout << "Test successful" << std::endl; } diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 89f3377dce1a26..a9b272cab8f5c5 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -11,7 +11,10 @@ #include +#ifdef USE_CUDA #include +#endif + #include #include #include @@ -43,6 +46,8 @@ class SignalTest { // Use tiny timeout to make this test run fast ::c10d::ProcessGroupGloo::Options options; options.timeout = std::chrono::milliseconds(50); + ::gloo::transport::tcp::attr attr; + options.devices.push_back(::gloo::transport::tcp::CreateDevice(attr)); ::c10d::ProcessGroupGloo pg(store, 
rank, size, options); @@ -125,6 +130,9 @@ class CollectiveTest { ::c10d::ProcessGroupGloo::Options options; options.timeout = std::chrono::milliseconds(50); + ::gloo::transport::tcp::attr attr; + options.devices.push_back(::gloo::transport::tcp::CreateDevice(attr)); + pg_ = std::unique_ptr<::c10d::ProcessGroupGloo>( new ::c10d::ProcessGroupGloo(store, rank, size, options)); } @@ -264,20 +272,25 @@ int main(int argc, char** argv) { testAllreduce(file.path, at::Backend::CPU); } +#ifdef USE_CUDA { TemporaryFile file; testAllreduce(file.path, at::Backend::CUDA); } +#endif { TemporaryFile file; testBroadcast(file.path, at::Backend::CPU); } +#ifdef USE_CUDA { TemporaryFile file; testBroadcast(file.path, at::Backend::CUDA); } +#endif + std::cout << "Test successful" << std::endl; return 0; } From efc0f6784aa94d75f2b68e2f42a54253c9729d72 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Wed, 12 Sep 2018 07:05:06 -0700 Subject: [PATCH 074/237] Move some bmm/baddbmm to ATen (#11292) Summary: - Incorporates MKL addition by mingfeima Thank you! (but all errors are my own) - Native CPU implementation: defer to matrix multiplication for small batches and parallelize over batch dimension for large batches. - Add bmm test for CUDA just to be sure. This is a partial fix for #10661, getting down to a factor ~5. Considerable overhead is incurred for the setup in einsum. It might be more efficient to eventually define an optimized contraction functions for arbitrary and several dimensions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11292 Differential Revision: D9784941 Pulled By: ezyang fbshipit-source-id: f6dded2c6f5e8f0461fb38f31f9a824992a58358 --- aten/src/ATen/Declarations.cwrap | 27 +--- aten/src/ATen/native/LinearAlgebra.cpp | 150 +++++++++++++++++++++ aten/src/ATen/native/cuda/LinearAlgebra.cu | 25 ++++ aten/src/ATen/native/mkl/LinearAlgebra.cpp | 95 +++++++++++++ aten/src/ATen/native/native_functions.yaml | 33 +++++ test/test_cuda.py | 1 + 6 files changed, 311 insertions(+), 20 deletions(-) create mode 100644 aten/src/ATen/native/cuda/LinearAlgebra.cu create mode 100644 aten/src/ATen/native/mkl/LinearAlgebra.cpp diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 9d67537ccdedd8..5a6687aaa53762 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -2471,11 +2471,12 @@ - THTensor* mat2 ]] [[ - name: bmm + name: _th_bmm cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2525,10 +2526,12 @@ - THTensor* batch2 ]] [[ - name: baddbmm + name: _th_baddbmm + cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2544,22 +2547,6 @@ - THTensor* batch1 - THTensor* batch2 ]] -[[ - name: baddbmm_ - cname: baddbmm - return: argument 0 - arguments: - - THTensor* self - - arg: real beta - default: AS_REAL(1) - kwarg_only: True - - THTensor* self - - arg: real alpha - default: AS_REAL(1) - kwarg_only: True - - THTensor* batch1 - - THTensor* batch2 -]] [[ name: addcmul variants: diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index d596a7cb99a265..50726cb99b81b9 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1,7 +1,10 @@ #include "ATen/ATen.h" #include "ATen/ExpandUtils.h" +#include "ATen/Dispatch.h" #include "ATen/NativeFunctions.h" #include "ATen/native/LinearAlgebraUtils.h" +#include 
"ATen/TensorUtils.h" +#include "ATen/Parallel.h" #include #include #include @@ -222,6 +225,153 @@ Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const T return at::_addr_out(result, self, vec1, vec2, beta, alpha); } +template +inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + int64_t bs = result.size(0); + int64_t is = result.size(1); + int64_t js = result.size(2); + int64_t ks = self.size(2); + + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + auto r0 = result.accessor(); + auto s0 = self.accessor(); + auto m0 = mat2.accessor(); + + int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); + parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) { + for (int64_t b = b_begin; b < b_end; b++) { + auto r1 = r0[b]; + auto s1 = s0[b]; + auto m1 = m0[b]; + for (int64_t i = 0; i < is; i++) { + auto r2 = r1[i]; + auto s2 = s1[i]; + for (int64_t j = 0; j < js; j++) { + scalar_t &r = r2[j]; + if (is_bmm) { + r = 0; + for (int64_t k = 0; k < ks; k++) { + r += s2[k] * m1[k][j]; + } + } else { + r *= beta; + for (int64_t k = 0; k < ks; k++) { + r += alpha * s2[k] * m1[k][j]; + } + } + } + } + } + }); +} + +// This tries to apply some optimizations to bmm/baddbmm: +// - When the operand size is small, computation are parallelized over the batch +// dimension using OMP and naive matrix multiplication is applied. +// - When the operand size is larger than the threshold, if compiled with MKL, MKL's batch gemm is used. +// - Otherwise, we use a series of matrix multiplications. +// The threshold of 400 for the first has not been thoroughly benchmarked yet and may have room for further +// optimization, it likely depends on the characteristics of the CPU, MKL will be different from non-MKL etc., +// but this seems to be a first starting point. + +static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha, bool is_bmm_out) { + // is_bmm_out: true for bmm_out, false for baddbmm_ + // self_or_result is "self" for baddbmm_ and "result" for bmm_out + CheckedFrom c = (is_bmm_out ? "bmm" : "baddbmm"); + TensorArg self_arg(self_or_result, is_bmm_out ? 
"self" : "result", 0); + TensorArg b1_arg(batch1, "batch1", 1); + TensorArg b2_arg(batch2, "batch2", 2); + checkDim(c, b1_arg, 3); + checkDim(c, b2_arg, 3); + + int64_t bs = batch1.size(0); + checkSize(c, b2_arg, 0, bs); + int64_t contraction_size = batch1.size(2); + int64_t res_rows = batch1.size(1); + int64_t res_cols = batch2.size(2); + checkSize(c, b2_arg, 1, contraction_size); + + if (is_bmm_out) { + self_or_result.resize_({bs, res_rows, res_cols}); + } else { + checkSize(c, self_arg, 0, bs); + checkSize(c, self_arg, 1, res_rows); + checkSize(c, self_arg, 2, res_cols); + } + + // handle pathological cases that blas may not like + if (self_or_result.numel() == 0) { + return self_or_result; + } else if (contraction_size == 0) { + return self_or_result.zero_(); + } + + auto batch_items_contiguous_or_transposed = [&](const Tensor& t) { + return (t.stride(2) == 1 && t.stride(1) == t.size(2)) + || (t.stride(1) == 1 && t.stride(2) == t.size(1)); + }; + + if (contraction_size * res_rows * res_cols < 400) { + if (is_bmm_out) { + AT_DISPATCH_ALL_TYPES(batch1.type(), "bmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } else { + AT_DISPATCH_ALL_TYPES(batch1.type(), "baddbmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } + } else if (at::hasMKL() && at::native::is_floating_point(self_or_result) + && batch_items_contiguous_or_transposed(batch1) + && batch_items_contiguous_or_transposed(batch2) + && self_or_result.is_contiguous()) { + at::native::_baddbmm_mkl_(self_or_result, batch1, batch2, beta, alpha); + } else { // split along batch dimension + if (is_bmm_out) { + for (int64_t b = 0; b < bs; b++) { + auto r = self_or_result.select(0, b); + at::native::mm_out(r, batch1.select(0, b), batch2.select(0, b)); + } + } else { + for (int64_t b = 0; b < bs; b++) { + self_or_result.select(0, b).addmm_(batch1.select(0, b), batch2.select(0, b), beta, alpha); + } + } + } + return self_or_result; +} + + +Tensor baddbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor result = self.type().tensor(); + return at::native::baddbmm_out_cpu(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cpu(Tensor &result, const Tensor& self_, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor self; + std::tie(self) = expand_size(self_, {batch1.size(0), batch1.size(1), batch2.size(2)}, "baddbmm"); + result.resize_(self.sizes()); + result.copy_(self); + return at::native::baddbmm__cpu(result, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return bmm_out_or_baddbmm_(self, batch1, batch2, beta, alpha, false); +} + +Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { + Tensor result = self.type().tensor(); + return at::native::bmm_out_cpu(result, self, mat2); +} + +Tensor& bmm_out_cpu(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + Scalar beta(0.0); + Scalar alpha(1.0); + return bmm_out_or_baddbmm_(result, batch1, batch2, beta, alpha, true); +} + Tensor dot(const Tensor& self, const Tensor& tensor) { check_1d(self, "self", "dot"); check_1d(tensor, "tensor", "dot"); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu new file mode 100644 index 00000000000000..1c3609f50b201c --- /dev/null +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -0,0 +1,25 @@ +#include "ATen/ATen.h" + +namespace 
at { namespace native { + +Tensor baddbmm_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm(self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cuda(Tensor &result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cuda(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(self, self, batch1, batch2, beta, alpha); +} + +Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { + return _th_bmm(self, mat2); +} + +Tensor& bmm_out_cuda(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + return _th_bmm_out(result, batch1, batch2); +} + +} } diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp new file mode 100644 index 00000000000000..062dd56d2ca300 --- /dev/null +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -0,0 +1,95 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Config.h" + +#if !AT_MKL_ENABLED() + +namespace at { namespace native { + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + AT_ERROR("bmm: ATen not compiled with MKL support"); +} + +}} + +#else // AT_MKL_ENABLED + +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/Dispatch.h" +#include "ATen/Utils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const float alpha, + const float** A, const float** B, const float beta, float** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_sgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const double alpha, + const double** A, const double** B, const double beta, double** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_dgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +template +static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + auto is_transposed = [&](const Tensor& t) { + return t.stride(0) == 1 && t.stride(1) == t.size(0); + }; + const CBLAS_TRANSPOSE trans_A = is_transposed(mat1[0]) ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_B = is_transposed(mat2[0]) ? 
CblasTrans : CblasNoTrans; + + const int batch_size = mat1.size(0); + const int M = mat1.size(1); + const int N = mat2.size(2); + const int K = mat1.size(2); + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + std::vector A(batch_size); + std::vector B(batch_size); + std::vector C(batch_size); + for (int64_t batch = 0; batch < batch_size; batch++) { + A[batch] = mat1[batch].data(); + B[batch] = mat2[batch].data(); + C[batch] = res[batch].data(); + } + + gemm_batched(trans_A, trans_B, batch_size, M, N, K, alpha, A.data(), B.data(), beta, C.data()); +} + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + // checks are done in native/LinearAlgebra.cpp + AT_DISPATCH_FLOATING_TYPES(self.type(), "baddbmm__mkl", [&] { + baddbmm_mkl_template(self, batch1, batch2, beta, alpha); + }); + + return self; +} + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 024c33a19c31dc..cdd7a2655473f1 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -245,6 +245,27 @@ CPU: _atan_out_cpu CUDA: _atan_out_cuda +- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: baddbmm_cpu + CUDA: baddbmm_cuda + +- func: baddbmm_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + dispatch: + CPU: baddbmm__cpu + CUDA: baddbmm__cuda + +- func: _baddbmm_mkl_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: baddbmm_out(Tensor result, Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: baddbmm_out_cpu + CUDA: baddbmm_out_cuda + - func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor - func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor @@ -281,6 +302,18 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor +- func: bmm(Tensor self, Tensor mat2) -> Tensor + variants: function, method + dispatch: + CPU: bmm_cpu + CUDA: bmm_cuda + +- func: bmm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor + variants: function + dispatch: + CPU: bmm_out_cpu + CUDA: bmm_out_cuda + - func: broadcast_tensors(TensorList tensors) -> TensorList - func: cat(TensorList tensors, int64_t dim=0) -> Tensor diff --git a/test/test_cuda.py b/test/test_cuda.py index 71e4a8220c6e68..d75f667281cf99 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -280,6 +280,7 @@ def tmp(t): types, False, "skipIfRocm:HalfTensor"), ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars', types, False, "skipIfRocm:HalfTensor"), + ('bmm', small_3d, lambda t: [small_3d(t)], '', float_types_no_half, False, "skipIfRocm:HalfTensor"), ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], '', types, False, "skipIfRocm:HalfTensor"), ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], From 6fc18a75417264f1a43b874dd16948f158d80c2a Mon Sep 17 00:00:00 2001 From: Rasmus Diederichsen Date: Wed, 12 Sep 2018 08:18:52 -0700 Subject: [PATCH 075/237] Typo fix in randomness.rst (#11571) Summary: "need to 
be" -> "need not be" Pull Request resolved: https://github.com/pytorch/pytorch/pull/11571 Differential Revision: D9786001 Pulled By: soumith fbshipit-source-id: 7cc408f5c8bfcc56d4b5c153646f30e1cec37539 --- docs/source/notes/randomness.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/notes/randomness.rst b/docs/source/notes/randomness.rst index 5ba2c87a8eecf3..c8c7b710d8e5cf 100644 --- a/docs/source/notes/randomness.rst +++ b/docs/source/notes/randomness.rst @@ -3,7 +3,7 @@ Reproducibility =============== Completely reproducible results are not guaranteed across PyTorch releases, -individual commits or different platforms. Furthermore, results need to be +individual commits or different platforms. Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds. However, in order to make computations deterministic on your specific problem on From f0a284502ad07c3a2088454f3d282db7d81f231c Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 09:06:39 -0700 Subject: [PATCH 076/237] Document BatchNorm and update default behavior (#11484) Summary: This PR: 1. Documents `BatchNorm`, 2. Makes a number of API changes after reconsidering some quirks: 1. The default value for the `stateful` parameter used to be `false`, but the most common usage of `BatchNorm` out of the wild is certainly stateful, and the default in Python is also statefulness. So we change the default to stateful. 2. The `pure_forward` function used to use the internal running mean and variance variables instead of the ones supplied to that function call when `stateful` was true, which certainly seems odd. When you call `pure_forward` you would certainly expect the values you pass explicitly to be used. This is now fixed. 3. Adds tests for `BatchNorm`, finally. 
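As a point of reference for the new defaults, here is a short Python sketch (illustrative only, not part of this patch) of the Python-side behavior the C++ module is being aligned with: the Python BatchNorm modules track running statistics by default, and `torch.nn.functional.batch_norm` normalizes with exactly the statistics it is given, which mirrors what `pure_forward` now does.

```python
# Sketch of the Python behavior the C++ defaults now match; not part of this patch.
import torch
import torch.nn as nn
import torch.nn.functional as F

bn = nn.BatchNorm1d(5)          # stateful by default: track_running_stats=True, affine=True
x = torch.randn(8, 5)
bn.train()
bn(x)                           # a training-mode forward updates bn.running_mean / bn.running_var

# Explicitly supplied statistics are the ones used, analogous to pure_forward:
mean, var = torch.randn(5), torch.rand(5)
out = F.batch_norm(x, mean, var, training=False, eps=1e-5)
expected = (x - mean) / torch.sqrt(var + 1e-5)
print(torch.allclose(out, expected))   # True
```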
ebetica apaszke ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11484 Reviewed By: pjh5 Differential Revision: D9779618 Pulled By: goldsborough fbshipit-source-id: 59ba760e085c01454b75644b24b22317b688e459 --- test/cpp/api/integration.cpp | 10 ++- test/cpp/api/modules.cpp | 61 ++++++++++++++++++- .../api/include/torch/nn/modules/batchnorm.h | 56 ++++++++++++++++- .../api/include/torch/nn/modules/linear.h | 1 + torch/csrc/api/src/nn/modules/batchnorm.cpp | 15 ++--- 5 files changed, 128 insertions(+), 15 deletions(-) diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index a8656a16f5d658..8d753197754c19 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -281,8 +281,8 @@ TEST_CASE("integration/cartpole") { for (auto i = 0U; i < saved_log_probs.size(); i++) { auto r = rewards[i] - saved_values[i].toCFloat(); policy_loss.push_back(-r * saved_log_probs[i]); - value_loss.push_back(torch::smooth_l1_loss( - saved_values[i], torch::ones(1) * rewards[i])); + value_loss.push_back( + torch::smooth_l1_loss(saved_values[i], torch::ones(1) * rewards[i])); } auto loss = @@ -370,12 +370,10 @@ TEST_CASE("integration/mnist/batchnorm", "[cuda]") { torch::manual_seed(0); auto model = std::make_shared(); auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); - auto batchnorm2d = - model->add(BatchNorm(BatchNormOptions(10).stateful(true)), "batchnorm2d"); + auto batchnorm2d = model->add(BatchNorm(10), "batchnorm2d"); auto conv2 = model->add(Conv2d(10, 20, 5), "conv2"); auto linear1 = model->add(Linear(320, 50), "linear1"); - auto batchnorm1 = - model->add(BatchNorm(BatchNormOptions(50).stateful(true)), "batchnorm1"); + auto batchnorm1 = model->add(BatchNorm(50), "batchnorm1"); auto linear2 = model->add(Linear(50, 10), "linear2"); auto forward = [&](torch::Tensor x) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 2da1e1c45bf6f6..928a39f7d8b44a 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -12,6 +12,8 @@ #include +using Catch::StartsWith; + using namespace torch::nn; using namespace torch::test; @@ -238,10 +240,67 @@ TEST_CASE("modules") { REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { - auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); + auto functional = + Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } + + SECTION("batchnorm") { + { + BatchNorm bn(5); + + // Is stateful by default. + REQUIRE(bn->options.stateful()); + + REQUIRE(bn->running_mean.defined()); + REQUIRE(bn->running_mean.dim() == 1); + REQUIRE(bn->running_mean.size(0) == 5); + + REQUIRE(bn->running_variance.defined()); + REQUIRE(bn->running_variance.dim() == 1); + REQUIRE(bn->running_variance.size(0) == 5); + + // Is affine by default. + REQUIRE(bn->options.affine()); + + REQUIRE(bn->weight.defined()); + REQUIRE(bn->weight.dim() == 1); + REQUIRE(bn->weight.size(0) == 5); + + REQUIRE(bn->bias.defined()); + REQUIRE(bn->bias.dim() == 1); + REQUIRE(bn->bias.size(0) == 5); + } + { + BatchNorm bn(BatchNormOptions(5).stateful(false).affine(false)); + + REQUIRE(!bn->running_mean.defined()); + REQUIRE(!bn->running_variance.defined()); + REQUIRE(!bn->weight.defined()); + REQUIRE(!bn->bias.defined()); + + REQUIRE_THROWS_WITH( + bn->forward(torch::ones({2, 5})), + StartsWith("Calling BatchNorm::forward is only permitted " + "when the 'stateful' option is true (was false). 
" + "Use BatchNorm::pure_forward instead.")); + } + { + BatchNorm bn(BatchNormOptions(5).affine(false)); + bn->eval(); + + // Want to make sure we use the supplied values in `pure_forward` even if + // we are stateful. + auto input = torch::randn({2, 5}); + auto mean = torch::randn(5); + auto variance = torch::rand(5); + auto output = bn->pure_forward(input, mean, variance); + auto expected = + (input - mean) / torch::sqrt(variance + bn->options.eps()); + REQUIRE(output.allclose(expected)); + } + } } TEST_CASE("modules_cuda", "[cuda]") { diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h index d4dd669a286fef..ff5cebc7ed6b19 100644 --- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h @@ -8,15 +8,43 @@ namespace torch { namespace nn { + +/// Options for the `BatchNorm` module. struct BatchNormOptions { /* implicit */ BatchNormOptions(int64_t features); + /// The number of features of the input tensor. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(int64_t, features); + /// Whether to learn a scale and bias that are applied in an affine + /// transformation on the input. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(bool, affine) = true; - TORCH_ARG(bool, stateful) = false; + /// Whether to store and update batch statistics (mean and variance) in the + /// module. If `false`, you should call `pure_forward` and supply those batch + /// statistics yourself. + /// Changing this parameter after construction __has no effect__. + TORCH_ARG(bool, stateful) = true; + /// The epsilon value added for numerical stability. + /// Changing this parameter after construction __is effective__. TORCH_ARG(double, eps) = 1e-5; + /// A momentum multiplier for the mean and variance. + /// Changing this parameter after construction __is effective__. TORCH_ARG(double, momentum) = 0.1; }; +/// Applies [Batch Normalization](https://arxiv.org/abs/1502.03167) to an input. +/// +/// Refer to the documentation for +/// [`BatchNorm1d`](https://pytorch.org/docs/stable/nn.html#torch.nn.BatchNorm1d) +/// in PyTorch to learn more about the exact semantics of this module, __but see +/// the note below regarding differences between the Python and C++ API__. +/// +/// \rst +/// .. attention:: +/// In the Python API, there are separate implementations for 1-D, 2-D and 3-D +/// BatchNorm. In C++, there is only one `BatchNorm` module, which works for +/// any of these dimensions. +/// \endrst class BatchNormImpl : public torch::nn::Cloneable { public: explicit BatchNormImpl(int64_t features) @@ -25,16 +53,42 @@ class BatchNormImpl : public torch::nn::Cloneable { void reset() override; + /// Applies batch normalization on the `input` using the stored mean and + /// variance. + /// + /// The module must be constructed with `stateful = true` when calling this + /// method, as the module will otherwise not store running statistics. If you + /// want to supply the mean and variance yourself, use `pure_forward`. Tensor forward(Tensor input); + + /// Applies batch normalization on the `input` using the given `mean` and + /// `variance` statistics. Tensor pure_forward(Tensor input, Tensor mean, Tensor variance); + /// The options with which this module was constructed. BatchNormOptions options; + + /// The learned weight. + /// Only defined if the `affine` option was `true` upon construction. Tensor weight; + + /// The learned bias. 
+ /// Only defined if the `affine` option was `true` upon construction. Tensor bias; + + /// The running mean. + /// Only defined if the `stateful` option was `true` upon construction. Tensor running_mean; + + /// The running variance. + /// Only defined if the `stateful` option was `true` upon construction. Tensor running_variance; }; +/// A `ModuleHolder` subclass for `BatchNormImpl`. +/// See the documentation for `BatchNormImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(BatchNorm); } // namespace nn diff --git a/torch/csrc/api/include/torch/nn/modules/linear.h b/torch/csrc/api/include/torch/nn/modules/linear.h index 1d1a25da36163b..3e8044f5715826 100644 --- a/torch/csrc/api/include/torch/nn/modules/linear.h +++ b/torch/csrc/api/include/torch/nn/modules/linear.h @@ -10,6 +10,7 @@ namespace torch { namespace nn { +/// Options for the `Linear` module. struct LinearOptions { LinearOptions(int64_t in, int64_t out); /// The number of input features (columns of the input matrix). diff --git a/torch/csrc/api/src/nn/modules/batchnorm.cpp b/torch/csrc/api/src/nn/modules/batchnorm.cpp index 1a9ae761c9f240..6a3920fe56a66d 100644 --- a/torch/csrc/api/src/nn/modules/batchnorm.cpp +++ b/torch/csrc/api/src/nn/modules/batchnorm.cpp @@ -34,14 +34,15 @@ void BatchNormImpl::reset() { } Tensor BatchNormImpl::forward(Tensor input) { - return pure_forward(input, Tensor(), Tensor()); + AT_CHECK( + options.stateful_, + "Calling BatchNorm::forward is only permitted when " + "the 'stateful' option is true (was false). " + "Use BatchNorm::pure_forward instead."); + return pure_forward(input, running_mean, running_variance); } Tensor BatchNormImpl::pure_forward(Tensor input, Tensor mean, Tensor variance) { - auto& running_mean = options.stateful_ ? this->running_mean : mean; - auto& running_variance = - options.stateful_ ? this->running_variance : variance; - if (is_training()) { const auto num_channels = input.dim() > 1 ? input.size(1) : 1; AT_CHECK( @@ -53,8 +54,8 @@ Tensor BatchNormImpl::pure_forward(Tensor input, Tensor mean, Tensor variance) { input, weight, bias, - running_mean, - running_variance, + mean, + variance, is_training(), options.momentum_, options.eps_, From e5dd77c7ad362298bcc035ccedc447d161169721 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 09:09:42 -0700 Subject: [PATCH 077/237] Sync all libnccl soversions, not just libnccl.so.1 (#11575) Summary: Fixes: ``` /bin/ld: warning: libnccl.so.1, needed by /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so, not found (try using -rp ath or -rpath-link) /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclAllReduce' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclBcast' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclCommInitAll' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclGetErrorString' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclReduceScatter' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclAllGather' /data/users/ezyang/pytorch-tmp/build/lib/libcaffe2_gpu.so: undefined reference to `ncclReduce' ``` Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11575 Differential Revision: D9789956 Pulled By: ezyang fbshipit-source-id: 63e48763cc233be9d137cec721b239159b511a24 --- tools/build_pytorch_libs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index e0593badd97614..66c8e6d65cf0b0 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -313,7 +313,7 @@ function build_caffe2() { # This is needed by the aten tests built with caffe2 if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl - $SYNC_COMMAND "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + find "${INSTALL_DIR}/lib" -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "lib/" fi ${CMAKE_INSTALL} -j"$MAX_JOBS" From 12f4c46eea1177d64a3c91fb23744d806e21c612 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 12 Sep 2018 09:33:47 -0700 Subject: [PATCH 078/237] caffe2::StorageImpl use at::DataPtr (#11282) Summary: See title Pull Request resolved: https://github.com/pytorch/pytorch/pull/11282 Reviewed By: ezyang Differential Revision: D9658503 Pulled By: cpuhrsch fbshipit-source-id: 42fa73c979692cb1069c0345744a85d12150745c --- aten/src/ATen/core/TensorImpl.h | 6 + caffe2/contrib/aten/aten_op_template.h | 20 ++- caffe2/core/storage.h | 234 ++++++++++++++----------- caffe2/core/tensor.h | 107 +++++++---- caffe2/python/pybind_state_dlpack.h | 21 ++- 5 files changed, 238 insertions(+), 150 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index e0a649a49b6ccd..3f41c1163c563b 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -100,6 +100,12 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { return storage_.data() + storage_offset_; } + inline void* data() const { + return static_cast( + static_cast(storage_.data()) + + at::elementSize(scalar_type_) * storage_offset_); + } + template inline T * unsafe_data() const { return storage_.unsafe_data() + storage_offset_; diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index c573891f6f04ac..8b12cc9dad16d8 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -91,14 +91,22 @@ class ATenOp : public Operator { void assignTo(Tensor* dst, const at::Tensor& src_) { at::Tensor src = src_.contiguous(); auto at_sizes = src.sizes(); - std::vector dims(at_sizes.begin(),at_sizes.end()); + caffe2::TypeMeta type_meta = typeMetaFor(src); + at::Device device = src.device(); + at::TensorImpl* src_impl = src.unsafeReleaseTensorImpl(); + std::vector dims(at_sizes.begin(), at_sizes.end()); dst->Resize(dims); dst->ShareExternalPointer( - src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable { - // return a closure that holds a handle to t until it is called - // to keep the aten memory alive - return src.reset(); - }); + at::DataPtr( + src_impl->data(), + static_cast(src_impl), + [](void* t_ptr) -> void { + at::TensorImpl* local_impl = static_cast(t_ptr); + c10::raw::intrusive_ptr::decref(local_impl); + }, + device), + type_meta, + 0); } void assignListStartingAt( size_t offset, diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 973b07ee630642..974985625df6b1 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -16,52 +16,55 @@ #include "caffe2/core/logging.h" #include 
"caffe2/core/typeid.h" +#include #include #include #include namespace caffe2 { -using DataType = TypeMeta; -using DataPtr = std::shared_ptr; -using at::DeviceType; - class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl() = delete; StorageImpl(const StorageImpl&) = delete; StorageImpl& operator=(const StorageImpl&) = delete; - explicit StorageImpl(DeviceType device_type) : device_type_(device_type) {} - StorageImpl(DeviceType device_type, TypeMeta data_type) - : data_type_(data_type), device_type_(device_type) {} - template + // Rule of Five + StorageImpl(StorageImpl&&) = default; + ~StorageImpl() = default; + StorageImpl& operator=(StorageImpl&&) = default; + StorageImpl( - DeviceType device_type, TypeMeta data_type, - void* src, - size_t capacity, - Deleter d = nullptr) - : data_type_(data_type), device_type_(device_type) { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To create storage with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(static_cast(&d))[0] == - nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. - data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) + : data_type_(data_type), + data_ptr_(std::move(data_ptr)), + numel_(numel), + resizable_(resizable), + allocator_(allocator) { + if (numel > 0) { + CAFFE_ENFORCE( + data_type.id() != TypeIdentifier::uninitialized(), + "Constructing a storage with meta of unknown type and non-zero numel"); } - capacity_ = capacity; } + explicit StorageImpl(at::DeviceType device_type) + : StorageImpl(device_type, TypeMeta()) {} + StorageImpl(at::DeviceType device_type, TypeMeta data_type) + : StorageImpl( + data_type, + 0, + at::DataPtr(nullptr, at::Device(device_type)), + nullptr, + true) {} + void reset() { - data_ptr_.reset(); - capacity_ = 0; + data_ptr_.clear(); + numel_ = 0; } template @@ -77,117 +80,135 @@ class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { return data_ptr_.get(); } - DataPtr& data_ptr() { + at::DataPtr& data_ptr() { return data_ptr_; } - const DataPtr& data_ptr() const { + const at::DataPtr& data_ptr() const { return data_ptr_; } - void set_dtype(const DataType& data_type) { + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + std::swap(data_ptr_, data_ptr); + return std::move(data_ptr); + }; + + void set_dtype(const TypeMeta& data_type) { + int64_t capacity = numel_ * data_type_.itemsize(); data_type_ = data_type; + numel_ = capacity / data_type_.itemsize(); } - const DataType& dtype() const { + const TypeMeta& dtype() const { return data_type_; } + const at::Allocator* allocator() const { + return allocator_; + }; + // You generally shouldn't use this method, but it is occasionally + // useful if you want to override how a tensor will be reallocated, + // after it was already allocated (and its initial allocator was + // set) + void set_allocator(at::Allocator* allocator) { + allocator_ = allocator; + } + size_t capacity() const { - return capacity_; + return numel_ * itemsize(); } int64_t numel() const { - return capacity_ / itemsize(); + return numel_; } // TODO: remove later void set_numel(int64_t numel) { - capacity_ = numel * itemsize(); + numel_ = numel; } - inline DeviceType 
device_type() const { - return device_type_; + at::DeviceType device_type() const { + return data_ptr_.device().type(); } inline size_t itemsize() const { return data_type_.itemsize(); } - // Rule of Five - StorageImpl(StorageImpl&&) = default; - ~StorageImpl() = default; - StorageImpl& operator=(StorageImpl&&) = default; + bool resizable() const { + return resizable_; + }; + + void set_resizable(bool resizable) { + resizable_ = resizable; + } /** * Can only be called when use_count is 1 */ - template void UniqueStorageShareExternalPointer( void* src, - const DataType& data_type, + const TypeMeta& data_type, size_t capacity, - Deleter d = nullptr) { + MemoryDeleter d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { data_type_ = data_type; CAFFE_ENFORCE_WITH_CALLER( data_type_.id() != TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to have meta " "already set."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(&d)[0] == nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. - data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); - } - capacity_ = capacity; + data_ptr_ = std::move(data_ptr); + // NOTE: data_type might change and so it's also possible that capacity + // might not be divisible by itemsize. There is no way for us to keep track + // of the exact capacity if we're not explicity storing is. More conrectely + // capacity() might not return the value that was set here, if itemsize does + // not evenly divide it. 
+ numel_ = capacity / data_type_.itemsize(); } private: - int64_t capacity_ = 0; - DataType data_type_; - DataPtr data_ptr_; + TypeMeta data_type_; + at::DataPtr data_ptr_; + int64_t numel_; + bool resizable_; + at::Allocator* allocator_; // allocator_ takes precedence over StaticContext from device_type_ // Allocator* allocator_; - DeviceType device_type_ = CPU; + // at::DeviceType device_type_ = CPU; }; class CAFFE2_API Storage { public: Storage() {} - Storage(DeviceType device_type) + Storage(at::DeviceType device_type) : storage_impl_(c10::make_intrusive(device_type)) {} - Storage(DeviceType device_type, TypeMeta data_type) + Storage(at::DeviceType device_type, TypeMeta data_type) : storage_impl_( c10::make_intrusive(device_type, data_type)) {} - template Storage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) - : storage_impl_(c10::make_intrusive( - src, - device_type, - TypeMeta::Make(), - capacity, - d)) {} - - template - Storage( - void* src, - DeviceType device_type, TypeMeta data_type, - size_t capacity, - Deleter d = nullptr) + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) : storage_impl_(c10::make_intrusive( - device_type, data_type, - src, - capacity, - d)) {} + numel, + std::move(data_ptr), + allocator, + resizable)) {} void reset() { storage_impl_->reset(); @@ -206,19 +227,23 @@ class CAFFE2_API Storage { return storage_impl_->data(); } - DataPtr& data_ptr() { + at::DataPtr& data_ptr() { return storage_impl_->data_ptr(); } - const DataPtr& data_ptr() const { + const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + }; - void set_dtype(const DataType& data_type) { + void set_dtype(const TypeMeta& data_type) { storage_impl_->set_dtype(data_type); } - const DataType& dtype() const { + const TypeMeta& dtype() const { return storage_impl_->dtype(); } size_t capacity() const { @@ -234,10 +259,14 @@ class CAFFE2_API Storage { storage_impl_->set_numel(numel); } - DeviceType device_type() const { + at::DeviceType device_type() const { return storage_impl_->device_type(); } + const at::Allocator* allocator() const { + return storage_impl_->allocator(); + } + inline size_t itemsize() const { return storage_impl_->itemsize(); } @@ -250,38 +279,35 @@ class CAFFE2_API Storage { return storage_impl_.unique(); } - template void UniqueStorageShareExternalPointer( void* src, - const DataType& data_type, + const TypeMeta& data_type, size_t capacity, - Deleter d = nullptr) { + MemoryDeleter d = nullptr) { CAFFE_ENFORCE_WITH_CALLER( storage_impl_.unique(), "UniqueStorageShareExternalPointer can only be called when \ use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( + storage_impl_->UniqueStorageShareExternalPointer( src, data_type, capacity, d); } + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + storage_impl_.unique(), + "UniqueStorageShareExternalPointer can only be called when \ + use_count == 1"); + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } + protected: c10::intrusive_ptr storage_impl_; }; -/** - * Create a Storage given an external pointer `src`. 
- * `device_type`: the device type of the storage - * `capacity`: the capacity of the Tensor - */ -template -Storage CreateStorage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) { - return CreateStorage(src, device_type, TypeMeta::Make(), capacity, d); -} - } // namespace caffe2 #endif // CAFFE2_CORE_STORAGE_H_ diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 392cb523c21b44..70bdbf83189ac5 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -349,7 +349,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return; } // Old data is discarded - storage_.data_ptr().reset(); + storage_.data_ptr().clear(); auto oldSize = numel_; auto oldDims = dims_; Resize(newCapacity); @@ -509,17 +509,34 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * using it. If a Deleter object is passed in, when this tensor is reallocated * or freed, the deleter function is going to be called. */ - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) { - ShareExternalPointer(src, TypeMeta::Make(), capacity, d); + template + void + ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); } - template void ShareExternalPointer( void* src, const TypeMeta& data_type, size_t capacity = 0, - Deleter d = nullptr) { + MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { CAFFE_ENFORCE_WITH_CALLER( data_type.id() != TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to pass in an " @@ -531,10 +548,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_WITH_CALLER( numel_ >= 0, "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer(src, data_type, capacity, d); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); } else { + int64_t numel = capacity / data_type.itemsize(); // Create a new Storage - storage_ = Storage(src, GetDeviceType(), data_type, capacity, d); + storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); } } @@ -585,50 +604,61 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { return storage_.data(); } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); bool had_special_dtor = storage_.dtype().dtor() != nullptr; if (storage_.unique()) { storage_.set_dtype(meta); - // TODO: recalcuate numel when we store numel instead of capacity in - // Storage } else { if (storage_.dtype() != meta) { storage_ = Storage(storage_.device_type(), meta); } } - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. 
You probably need to call Resize() " - "before calling mutable_data()"); // We can reuse the existing buffer if the current data does not have // a special destructor and the new data doesn't have a special // constructor. if (numel_ == 0 || (meta.ctor() == nullptr && !had_special_dtor && - storage_.capacity() >= numel_ * storage_.itemsize())) { + storage_.numel() >= numel_)) { return storage_.data(); } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); if (meta.ctor()) { // For types that need placement new, we will call it, as well as // making sure that when the data is freed, it calls the right // destruction procedure. auto size = numel_; auto dtor = storage_.dtype().dtor(); - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - auto deleter = ptr_and_deleter.second; - storage_.data_ptr().reset( - ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void { - dtor(ptr, size); - deleter(ptr); - }); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); storage_.dtype().ctor()(storage_.data(), numel_); } else { // For fundamental type, new and delete is easier. auto ptr_and_deleter = GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.data_ptr().reset( - ptr_and_deleter.first, ptr_and_deleter.second); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); } storage_.set_numel(numel_); return storage_.data(); @@ -1026,19 +1056,32 @@ class CAFFE2_API Tensor final { impl_.get()->ShareData(*src.impl_.get()); } - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) - const { - impl_.get()->ShareExternalPointer(src, capacity, d); + template + void ShareExternalPointer( + T* src, + size_t capacity = 0, + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const { + impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity); } - template void ShareExternalPointer( void* src, const TypeMeta& meta, size_t capacity = 0, - Deleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, meta, capacity, d); + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, meta, capacity, d); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity); } inline const void* raw_data() const { diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 8b6f5e1a5c3033..e0122fdcc9983a 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -108,16 +108,21 @@ class DLPackWrapper { } tensor->Resize(dims); - const auto& meta = DLTypeToCaffe(dlTensor->dtype); + caffe2::TypeMeta meta = 
DLTypeToCaffe(dlTensor->dtype); + at::Device device = at::Device(tensor->GetDeviceType()); tensor->ShareExternalPointer( - ((int8_t*)dlTensor->data) + dlTensor->byte_offset, + at::DataPtr( + (void*)(((int8_t*)dlTensor->data) + dlTensor->byte_offset), + static_cast(dlMTensor), + [](void* t_ptr) -> void { + DLManagedTensor* mt_ptr = static_cast(t_ptr); + if (mt_ptr->destructor) { + mt_ptr->destructor(mt_ptr); + } + }, + device), meta, - 0, - [dlMTensor](void*) { - if (dlMTensor->destructor) { - dlMTensor->destructor(dlMTensor); - } - }); + 0); } Tensor* tensor; From 6398d626f4b191dc437cb1ef0f12fc579ffaf0ee Mon Sep 17 00:00:00 2001 From: zou3519 Date: Wed, 12 Sep 2018 10:54:32 -0700 Subject: [PATCH 079/237] Warn that export+import module always load onto the CPU (#11485) Summary: Test Plan `cd docs && make html` ![image](https://user-images.githubusercontent.com/5652049/45325074-ed04e480-b51d-11e8-9d2d-685dbe8a08e9.png) cc zdevito apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11485 Differential Revision: D9772119 Pulled By: zou3519 fbshipit-source-id: 3dcb16c9edc2e8deebef17accf91a1c7d4dc9063 --- docs/source/jit.rst | 4 ++++ torch/jit/__init__.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 35e62f52ecefde..8bc4ce6e336fc9 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -33,6 +33,10 @@ Creating Torch Script Code To be able to save a module, it must not make any calls to native python functions. This means that all submodules must be subclasses of ScriptModules as well. + .. DANGER:: + All modules, no matter their device, are always loaded onto the CPU during loading. + This is different from :func:`torch.load`'s semantics and may change in the future. + .. autofunction:: load diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 1397f0cd18af24..f40cade7fec486 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -62,6 +62,10 @@ def load(filename): r""" Load a ``ScriptModule`` previously saved with :func:`save ` + .. DANGER:: + All previously saved modules, no matter their device, are always loaded onto the CPU. + This is different from :func:`torch.load`'s semantics and may change in the future. + Arguments: filename (string): the file to load From 23d55883c0ae1362f0acdd7b59075566ab4147e2 Mon Sep 17 00:00:00 2001 From: Yan Zhu Date: Wed, 12 Sep 2018 11:17:52 -0700 Subject: [PATCH 080/237] minor formatting error log (#11528) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11528 as title Reviewed By: chocjy Differential Revision: D9773214 fbshipit-source-id: b7dd4c19ab83a18f344de8e71ce5b3bf74d1af72 --- caffe2/core/tensor.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 70bdbf83189ac5..6fd35133811cd9 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -583,8 +583,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { IsType(), "Tensor type mismatch, caller expects elements to be ", TypeMeta::TypeName(), - " while tensor contains ", - storage_.dtype().name()); + ", while tensor contains ", + storage_.dtype().name(), + ". 
"); return static_cast(storage_.data()); } From 13b05c8c78dc03c41e51cb7077910ac88f9b580b Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 12 Sep 2018 11:18:42 -0700 Subject: [PATCH 081/237] Add EndToEndHybridModel CUDA tests (#11544) Summary: Also adds two additional tests that check for memory leaks while the relevant graph executors are alive: - (minimal test): Create a ScriptModule, keep it alive, and test that it does not leak memory while it is alive - (large test) Do MNIST training with a traced MNIST module and test that no memory is leaked while the traced module (with graph executor) is alive cc apaszke zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11544 Reviewed By: apaszke Differential Revision: D9778479 Pulled By: zou3519 fbshipit-source-id: 2d6cdea81dd1264f2c0396b662f70fdafecb3647 --- test/test_jit.py | 183 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 148 insertions(+), 35 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 69347e2d67b111..0295ebaf70afe9 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2620,6 +2620,29 @@ def _check_code(self, code_str, fn_name, inputs): cu = torch.jit.CompilationUnit(code_str) self.assertEqual(cu.func(*inputs), scope[fn_name](*inputs)) + @unittest.skipIf(not RUN_CUDA, 'no CUDA') + def test_scriptmodule_releases_tensors_cuda(self): + @torch.jit.script + def fn(x, y): + return x.sigmoid() * y.tanh() + + def test(backward=False): + x = torch.randn(3, 3, dtype=torch.double, device='cuda', requires_grad=True) + y = torch.randn(3, 3, dtype=torch.double, device='cuda', requires_grad=True) + out = fn(x, y) + if backward: + out.sum().backward() + + with self.assertLeaksNoCudaTensors(): + test() + test() + test() + + with self.assertLeaksNoCudaTensors(): + test(backward=True) + test(backward=True) + test(backward=True) + def test_index(self): def consec(size, start=0): numel = torch.tensor(size).prod().item() @@ -6760,9 +6783,29 @@ def foo(x): self.assertEqual(foo(input), input) +class MnistNet(nn.Module): + def __init__(self): + super(MnistNet, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + class TestEndToEndHybridFrontendModels(JitTestCase): - def test_dcgan_models(self): + @staticmethod + def _test_dcgan_models(self, device, check_export_import=True): class DCGANGenerator(nn.Module): def __init__(self, nz, ngf, nc): super(DCGANGenerator, self).__init__() @@ -6820,10 +6863,22 @@ def forward(self, input): return self.main(input).view(-1, 1).squeeze(1) bs, nz, ngf, nc, ndf = 5, 6, 9, 3, 10 - self.checkTrace(DCGANGenerator(nz, ngf, nc), (torch.rand(bs, nz, 1, 1),)) - example_input = DCGANGenerator(nz, ngf, nc)(torch.rand(bs, nz, 1, 1)) - self.checkTrace(DCGANDiscriminator(nc, ndf), (example_input,)) + self.checkTrace(DCGANGenerator(nz, ngf, nc).to(device), + (torch.rand(bs, nz, 1, 1, device=device),), + export_import=check_export_import) + example_input = DCGANGenerator(nz, ngf, nc).to(device)(torch.rand(bs, nz, 1, 1, device=device)) + self.checkTrace(DCGANDiscriminator(nc, ndf).to(device), (example_input,), + export_import=check_export_import) + + 
def test_dcgan_models(self): + self._test_dcgan_models(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_dcgan_models_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_dcgan_models(self, device='cuda', check_export_import=False) + # XXX: When this is fixed, write a CUDA test for this. @unittest.skip('https://github.com/pytorch/pytorch/issues/8439 InstanceNormalization bug') def test_neural_style(self): class TransformerNet(torch.nn.Module): @@ -6924,29 +6979,45 @@ def forward(self, x): self.checkTrace(TransformerNet(), (torch.rand(5, 3, 224, 224),)) + @staticmethod + def _test_mnist(self, device, check_export_import=True): + # eval() is present because dropout makes this nondeterministic + self.checkTrace(MnistNet().to(device).eval(), (torch.rand(5, 1, 28, 28, device=device),), + export_import=check_export_import) + def test_mnist(self): - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) + self._test_mnist(self, device='cpu') - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_mnist_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_mnist(self, device='cuda', check_export_import=False) - # eval() is present because dropout makes this nondeterministic - self.checkTrace(Net().eval(), (torch.rand(5, 1, 28, 28),)) + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_mnist_training_leaks_no_memory_cuda(self): + net = MnistNet().cuda() + traced_net = torch.jit.trace(net, [torch.randn(5, 1, 28, 28, device='cuda')]) - def test_reinforcement_learning(self): + def train(iters): + for _ in range(iters): + # Get some fake data + inp = torch.randn(5, 1, 28, 28, device='cuda') + out = traced_net(inp) + + # Here's some fake loss + out.sum().backward() + + # Zero out grads + traced_net.zero_grad() + + # Set it up so the params have .grad fields so they are not reported as leaks + train(1) + + with self.assertLeaksNoCudaTensors(): + train(5) + + @staticmethod + def _test_reinforcement_learning(self, device, test_export_import=True): class Policy(nn.Module): def __init__(self): super(Policy, self).__init__() @@ -6958,10 +7029,19 @@ def forward(self, x): action_scores = self.affine2(x) return F.softmax(action_scores, dim=1) - self.checkTrace(Policy(), (torch.rand(1, 4),)) + self.checkTrace(Policy().to(device), (torch.rand(1, 4, device=device),), + export_import=test_export_import) - @skipIfRocm - def test_snli(self): + def test_reinforcement_learning(self): + self._test_reinforcement_learning(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_reinforcement_learning_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_reinforcement_learning(self, device='cuda', test_export_import=False) + + @staticmethod + def _test_snli(self, device, check_export_import=True): class Bottle(nn.Module): def forward(self, input): @@ -7045,13 +7125,24 @@ class Config: n_layers = 2 n_cells = 4 # 2 * n_layers because birnn = True - premise = torch.LongTensor(48, 128).random_(0, 100) - 
hypothesis = torch.LongTensor(24, 128).random_(0, 100) + premise = torch.LongTensor(48, 128).random_(0, 100).to(device) + hypothesis = torch.LongTensor(24, 128).random_(0, 100).to(device) - self.checkTrace(SNLIClassifier(Config()), (premise, hypothesis), inputs_require_grads=False) + self.checkTrace(SNLIClassifier(Config()).to(device), (premise, hypothesis), + inputs_require_grads=False, export_import=check_export_import) @skipIfRocm - def test_super_resolution(self): + def test_snli(self): + self._test_snli(self, device='cpu') + + @skipIfRocm + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_snli_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_snli(self, device='cuda', check_export_import=False) + + @staticmethod + def _test_super_resolution(self, device, check_export_import=True): import torch.nn.init as init class Net(nn.Module): @@ -7073,8 +7164,19 @@ def forward(self, x): x = self.pixel_shuffle(self.conv4(x)) return x - net = Net(upscale_factor=4) - self.checkTrace(net, (torch.rand(5, 1, 64, 64),)) + net = Net(upscale_factor=4).to(device) + self.checkTrace(net, (torch.rand(5, 1, 64, 64, device=device),), + export_import=check_export_import) + + @skipIfRocm + def test_super_resolution(self): + self._test_super_resolution(self, device='cpu') + + @skipIfRocm + @unittest.skipIf(not RUN_CUDA, 'no CUDA') + def test_super_resolution_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_super_resolution(self, device='cuda', check_export_import=False) @suppress_warnings def test_time_sequence_prediction(self): @@ -7131,9 +7233,11 @@ def forward(self, input): return outputs # TODO: toggle export_import once above issues are fixed - self.checkTrace(Sequence(), (torch.rand(3, 4),), export_import=False) + self.checkTrace(Sequence(), (torch.rand(3, 4),), + export_import=False) - def test_vae(self): + @staticmethod + def _test_vae(self, device, check_export_import=True): class VAE(nn.Module): def __init__(self): super(VAE, self).__init__() @@ -7166,7 +7270,16 @@ def forward(self, x): return self.decode(z), mu, logvar # eval() is present because randn_like makes this nondeterministic - self.checkTrace(VAE().eval(), (torch.rand(128, 1, 28, 28),)) + self.checkTrace(VAE().to(device).eval(), (torch.rand(128, 1, 28, 28, device=device),), + export_import=check_export_import) + + def test_vae(self): + self._test_vae(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_vae_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_vae(self, device='cuda', check_export_import=False) # Smoke tests for export methods From 17e76e26c84d61ab9251660ccbcb1cf7494936a9 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Wed, 12 Sep 2018 12:05:19 -0700 Subject: [PATCH 082/237] Add trigonometry functions to docs/source/onnx.rst Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11581 Differential Revision: D9794449 Pulled By: soumith fbshipit-source-id: 1218fcf8969a10ffbfefd3ced7fee9fe7df296f1 --- docs/source/onnx.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index e461cef4be68c9..90bf2878e7b730 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -169,6 +169,12 @@ The following operators are supported: * ge * le * exp +* sin +* cos +* tan +* asin +* acos +* atan * permute * Conv * BatchNorm From ad7936e10839edb5622a3c0391ae1dadfc79597c Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Wed, 12 Sep 2018 
12:21:20 -0700 Subject: [PATCH 083/237] Fix reloading modules back into python (#11552) Summary: This changes the way module import works so that when a module is reloaded in python it becomes a ScriptModule and not a _C.ScriptModule Pull Request resolved: https://github.com/pytorch/pytorch/pull/11552 Differential Revision: D9782751 Pulled By: zdevito fbshipit-source-id: 9576850b75494b228ce3def94c0d371a4a44b11d --- test/test_jit.py | 26 +++++++++++++++++++++ torch/csrc/jit/import.cpp | 42 +++++++++++++++++++--------------- torch/csrc/jit/import.h | 7 ++++-- torch/csrc/jit/script/init.cpp | 8 +++---- torch/jit/__init__.py | 11 ++++++++- 5 files changed, 68 insertions(+), 26 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 0295ebaf70afe9..e4797c57dda3b5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1734,6 +1734,32 @@ def test_export_dropout(self): x = torch.randn(3, 4) self.assertEqual(traced(x), imported(x)) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + def test_cuda_export_restore(self): + class Sub(torch.jit.ScriptModule): + def __init__(self): + super(Sub, self).__init__() + self.weight = nn.Parameter(torch.randn(3, 4)) + + @torch.jit.script_method + def forward(self, thing): + return self.weight + thing + + class M(torch.jit.ScriptModule): + def __init__(self): + super(M, self).__init__() + self.mod = Sub() + + @torch.jit.script_method + def forward(self, v): + return self.mod(v) + m = M() + m.cuda() + m2 = self.getExportImportCopy(m) + m2.cuda() + input = torch.rand(3, 4).cuda() + self.assertEqual(m(input), m2(input)) + def test_export_batchnorm(self): for mode in ['eval', 'train']: for clazz in [ diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index f868f1ff555ad5..d2f828a2517ef8 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -180,7 +180,7 @@ void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, class ModuleDecoder : DecoderBase { public: - ModuleDecoder(std::shared_ptr root_module, + ModuleDecoder(ModuleLookup module_lookup, const std::string& filename); private: @@ -202,7 +202,7 @@ class ModuleDecoder : DecoderBase { const std::vector& strides); std::pair, std::string> parseFullName( - std::shared_ptr root_module, + ModuleLookup module_lookup, const std::string fullname); PyTorchFileReader file_reader_; @@ -330,7 +330,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( // Given a full name of a parameter or method, // return the parent submodule and local name std::pair, std::string> ModuleDecoder::parseFullName( - std::shared_ptr root_module, + ModuleLookup module_lookup, const std::string fullname) { std::vector vec; std::stringstream ss(fullname); @@ -339,18 +339,13 @@ std::pair, std::string> ModuleDecoder::parseFull vec.push_back(name); } - std::shared_ptr curr = root_module; - for (size_t i = 0; i < vec.size() - 1; i++) { - if (curr->find_module(vec[i]) == nullptr) { - curr->register_module(vec[i], std::make_shared()); - } - curr = curr->get_module(vec[i]); - } - return std::make_pair(curr, vec.back()); + std::string last = vec.back(); + vec.pop_back(); + return std::make_pair(module_lookup(vec), std::move(last)); } ModuleDecoder::ModuleDecoder( - const std::shared_ptr root_module, + ModuleLookup module_lookup, const std::string &filename) : file_reader_(filename) { auto model_proto = onnx::ModelProto(); @@ -363,7 +358,7 @@ ModuleDecoder::ModuleDecoder( for (auto &tensor_proto : graph_proto.initializer()) { std::shared_ptr parent_module; std::string name; - 
std::tie(parent_module, name) = parseFullName(root_module, tensor_proto.name()); + std::tie(parent_module, name) = parseFullName(module_lookup, tensor_proto.name()); auto param = buildParameter(tensor_proto); parent_module->register_parameter(name, param, /* is_buffer = */ tensor_proto.int64_data(0)); @@ -373,7 +368,7 @@ ModuleDecoder::ModuleDecoder( for (auto &node_proto : graph_proto.node()) { std::shared_ptr parent_module; std::string name; - std::tie(parent_module, name) = parseFullName(root_module, node_proto.name()); + std::tie(parent_module, name) = parseFullName(module_lookup, node_proto.name()); std::vector member_inputs; for (auto ¶m_name : node_proto.input()) { @@ -393,15 +388,26 @@ ModuleDecoder::ModuleDecoder( } // namespace -void ImportIRModule( - const std::shared_ptr module, +void import_ir_module( + ModuleLookup module_lookup, const std::string& filename) { - ModuleDecoder(module, filename); + ModuleDecoder(module_lookup, filename); } std::shared_ptr load(const std::string& filename) { auto module = std::make_shared(); - ModuleDecoder(module, filename); + + auto module_lookup = [&](const std::vector& qualified_name) { + std::shared_ptr curr = module; + for (const auto& name : qualified_name) { + if (curr->find_module(name) == nullptr) { + curr->register_module(name, std::make_shared()); + } + curr = curr->get_module(name); + } + return curr; + }; + ModuleDecoder(module_lookup, filename); return module; } diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index ba466f736d3811..21f9d605ec59b3 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -5,8 +5,11 @@ namespace torch { namespace jit { -TORCH_API void ImportIRModule( - const std::shared_ptr module, +using ModuleLookup = std::function( + const std::vector&)>; + +TORCH_API void import_ir_module( + ModuleLookup module_lookup, const std::string& filename); TORCH_API std::shared_ptr load(const std::string& filename); diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 9f8185621a1c0e..c09caf4c3702f9 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -24,6 +24,8 @@ #include #include #include +#include + namespace torch { namespace jit { @@ -370,10 +372,6 @@ void initJitScriptBindings(PyObject* module) { py::class_>(m, "ScriptModule") .def(py::init<>()) .def("save", &Module::save) - .def("_load", [](const std::shared_ptr module, - const std::string& filename) { - ImportIRModule(module, filename); - }) .def("_set_optimized", &Module::set_optimized) .def( "_define", @@ -529,7 +527,7 @@ void initJitScriptBindings(PyObject* module) { }); m.def("merge_type_from_type_comment", &mergeTypesFromTypeComment); - + m.def("import_ir_module", import_ir_module); } } // namespace script diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f40cade7fec486..b32edf715a6c00 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -73,7 +73,16 @@ def load(filename): A ``ScriptModule`` object. 
""" m = ScriptModule() - m._load(filename) + + def module_lookup(names): + curr = m + for name in names: + if not hasattr(curr, name): + setattr(curr, name, ScriptModule()) + curr = getattr(curr, name) + return curr + + torch._C.import_ir_module(module_lookup, filename) return m From 739e6af86940f558f384a4a8922df8dc660d5660 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 12 Sep 2018 12:28:31 -0700 Subject: [PATCH 084/237] Add reminder % to the jit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11557 Reviewed By: apaszke Differential Revision: D9784642 Pulled By: wanchaol fbshipit-source-id: b7c60c3e9534555c9d7db83769965b3f2f277cdf --- test/test_jit.py | 39 +++++++++++++++++++++------- torch/csrc/jit/register_prim_ops.cpp | 15 +++++++---- torch/csrc/jit/script/compiler.cpp | 3 +++ torch/csrc/jit/script/lexer.cpp | 1 + torch/csrc/jit/script/lexer.h | 2 +- torch/csrc/jit/script/parser.h | 2 +- torch/csrc/jit/script/tree_views.h | 3 +++ torch/jit/frontend.py | 1 + 8 files changed, 50 insertions(+), 16 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index e4797c57dda3b5..4aa43b1c8596a5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3791,19 +3791,32 @@ def test_number_math(self): template = dedent(''' # int, int -> int def func1(): - return 8 {op} 2 + return 7 {op} 2 def func2(): - return 2 {op} 2 + return 3 {op} 2 - # float, float -> float def func3(): - return 3.14 {op} 0.125 + return -7 {op} 3 def func4(): + return 7 {op} -3 + + # float, float -> float + def func5(): + return 3.14 {op} 0.125 + + def func6(): return 3.14 {op} 3.14 + + def func7(): + return -0.5 {op} 2.0 + + def func8(): + return 3.5 {op} -2.0 + ''') - ops = ['+', '-', '*', '<', '<=', '>', '>=', '==', '!='] + ops = ['+', '-', '*', '%', '<', '<=', '>', '>=', '==', '!='] for op in ops: code = template.format(op=op) @@ -3815,6 +3828,10 @@ def func4(): self.assertEqual(cu.func2(), scope['func2']()) self.assertEqual(cu.func3(), scope['func3']()) self.assertEqual(cu.func4(), scope['func4']()) + self.assertEqual(cu.func5(), scope['func5']()) + self.assertEqual(cu.func6(), scope['func6']()) + self.assertEqual(cu.func7(), scope['func7']()) + self.assertEqual(cu.func8(), scope['func8']()) def test_number_div(self): self.checkScript(div_int_future, (), optimize=True) @@ -3866,10 +3883,10 @@ def test(op, const, swap_args): cu = torch.jit.CompilationUnit(code) self.assertEqual(cu.func(tensor), scope['func'](tensor)) - var_int = 2 - var_float = 1.4321 + var_int = [2, -2] + var_float = [1.4321, -1.2] - ops = ['+', '-', '*', '<', '<=', '>', '>=', '==', '!='] + ops = ['+', '-', '*', '%', '<', '<=', '>', '>=', '==', '!='] # TODO: turn this on for py3 (and add PY3 division semantics) ops_py2_only = ['/'] if PY2: @@ -3881,7 +3898,7 @@ def test(op, const, swap_args): long_tensor[long_tensor == 0] = 2 tensors = [float_tensor, double_tensor, long_tensor] - consts = [var_int, var_float] + consts = var_int + var_float for op, tensor, const, swap_args in product(ops, tensors, consts, [True, False]): # FIXME: things like 2 / long_tensor are not implemented correctly @@ -3889,6 +3906,10 @@ def test(op, const, swap_args): if op == '/' and tensor.data_ptr() == long_tensor.data_ptr(): continue + # % operator does not take: const % tensor + if op == '%' and swap_args is True: + continue + test(op, const, swap_args) def test_tensor_number_math(self): diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 67274f8a9192d6..13f5f6dbbad275 100644 --- 
a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -390,14 +390,14 @@ RegisterOperators reg({ }); // define implementations for primitive number ops -#define DEFINE_GENERIC_OP(aten_op, op, float_result) \ +#define DEFINE_GENERIC_OP(aten_op, int_op, float_op, float_result) \ Operator( \ #aten_op "(int a, int b) -> int", \ [](Node* node) { \ return [=](Stack& stack) { \ int64_t a, b; \ pop(stack, a, b); \ - push(stack, op); \ + push(stack, int_op); \ return 0; \ }; \ }), \ @@ -406,7 +406,7 @@ RegisterOperators reg({ return [=](Stack& stack) { \ double a, b; \ pop(stack, a, b); \ - push(stack, op); \ + push(stack, float_op); \ return 0; \ }; \ }), @@ -421,8 +421,8 @@ RegisterOperators reg({ }; \ }), -#define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, float) -#define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, int) +#define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, float) +#define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, int) // define helpers for where aten is missing scalar overloads // note: it would be better to define these in a standard library as @@ -618,6 +618,11 @@ RegisterOperators reg2({ DEFINE_BINARY_OP(aten::mul, a * b) DEFINE_BINARY_OP(aten::pow, static_cast(pow(a, b))) + // Pass in two ops for handling int and float separately as % in C++ only works for int + // The modulus calculation is different between C++ and Python (on negative), we preserve + // the python behavior as it's more common and match python syntax, hence the conversion. + DEFINE_GENERIC_OP(aten::remainder, (b + (a % b)) % b, fmod((b + fmod(a, b)), b), float) + // TODO: Support python floordiv (//) // Right now aten::floordiv is only used by loop unrolling DEFINE_INT_OP(aten::floordiv, a / b) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index a04e4ae0fc3f03..5efa0668d91296 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1338,6 +1338,8 @@ struct to_ir { return prim::Starred; case '/': return aten::div; + case '%': + return aten::remainder; case TK_NE: return aten::ne; case TK_EQ: @@ -1479,6 +1481,7 @@ struct to_ir { case '/': case '+': case '-': + case '%': case TK_UNARY_MINUS: { const auto& inputs = tree->trees(); auto kind = getNodeKind(tree->kind(), inputs.size()); diff --git a/torch/csrc/jit/script/lexer.cpp b/torch/csrc/jit/script/lexer.cpp index d5410c39b84455..969a79ebb635bc 100644 --- a/torch/csrc/jit/script/lexer.cpp +++ b/torch/csrc/jit/script/lexer.cpp @@ -25,6 +25,7 @@ static const std::unordered_map binary_prec = { {'-', 5}, {'*', 6}, {'/', 6}, + {'%', 6}, {'@', 6}, {TK_POW, 7}, }; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 617addf486c736..7d29ea5944ad10 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -85,7 +85,7 @@ namespace script { _(TK_SLICE_EXPR, "slice expr", "") \ _(TK_TYPE_COMMENT, "type comment", "# type:") -static const char* valid_single_char_tokens = "+-*/@()[]:,={}><.?"; +static const char* valid_single_char_tokens = "+-*/%@()[]:,={}><.?"; enum TokenKind { // we use characters to represent themselves so skip all valid characters diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index e808e70ccd4006..53f4dc368714f4 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -177,7 +177,7 @@ struct Parser { int binary_prec; while (shared.isBinary(L.cur().kind, 
&binary_prec)) { if (binary_prec <= precedence) // not allowed to parse something which is - // not greater than 'precedenc' + // not greater than 'precedence' break; int kind = L.cur().kind; diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index b5d38f80fde390..162c33e6838666 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -233,6 +233,7 @@ struct Expr : public TreeView { case '*': case TK_STARRED: case '/': + case '%': case TK_NOT: case TK_CONST: case TK_STRINGLITERAL: @@ -421,6 +422,7 @@ struct AssignKind : public TreeView { case '-': case '*': case '/': + case '%': return; default: throw ErrorReport(tree) << "is not a valid AssignKind"; @@ -496,6 +498,7 @@ struct BinOp : public Expr { case '-': case '@': case TK_POW: + case '%': if (tree->trees().size() != 2) throw ErrorReport(tree) << "BinOp expected 2 subtrees, found " << tree->trees().size(); return; diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 659c3bc4cdd70f..99d767797e1b7e 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -303,6 +303,7 @@ class ExprBuilder(Builder): ast.Mult: '*', ast.Div: '/', ast.Pow: '**', + ast.Mod: '%', } if not PY2: From 9a7c196040f55d921a4deee42cbb1dd294b8f23b Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Wed, 12 Sep 2018 12:57:32 -0700 Subject: [PATCH 085/237] Move Type, Tensor, TensorMethods to core. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11519 Reviewed By: yf225 Differential Revision: D9771684 Pulled By: gchanan fbshipit-source-id: a57ee2072af99ce856f895c688b09d750a8606e0 --- aten/src/ATen/ATen.h | 2 +- aten/src/ATen/Tensor.h | 2 + aten/src/ATen/Type.h | 2 + aten/src/ATen/core/ATenCoreTest.cpp | 2 + aten/src/ATen/core/Tensor.h | 686 ++++++++++++ aten/src/ATen/core/TensorMethods.h | 1242 +++++++++++++++++++++ aten/src/ATen/core/Type.h | 647 +++++++++++ aten/src/ATen/gen.py | 47 +- aten/src/ATen/templates/NativeFunctions.h | 2 +- aten/src/ATen/templates/Tensor.h | 2 - aten/src/ATen/templates/TensorMethods.h | 6 +- aten/src/ATen/templates/Type.h | 4 +- cmake/Codegen.cmake | 10 +- 13 files changed, 2633 insertions(+), 21 deletions(-) create mode 100644 aten/src/ATen/Tensor.h create mode 100644 aten/src/ATen/Type.h create mode 100644 aten/src/ATen/core/Tensor.h create mode 100644 aten/src/ATen/core/TensorMethods.h create mode 100644 aten/src/ATen/core/Type.h diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index cf074730bf072b..29812852e24fdb 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -19,7 +19,7 @@ #include "ATen/core/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorGeometry.h" -#include "ATen/TensorMethods.h" +#include "ATen/core/TensorMethods.h" #include "ATen/TensorOperators.h" #include "ATen/core/TensorOptions.h" #include "ATen/Type.h" diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h new file mode 100644 index 00000000000000..cef05f5341cb31 --- /dev/null +++ b/aten/src/ATen/Tensor.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/Type.h b/aten/src/ATen/Type.h new file mode 100644 index 00000000000000..0c95f43e0482e9 --- /dev/null +++ b/aten/src/ATen/Type.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp index 5bb595a0bce5de..bb670b315f16c9 100644 --- a/aten/src/ATen/core/ATenCoreTest.cpp +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -1,9 +1,11 @@ #include +#include namespace at { static int CoreTestGlobal = 0; 
int CoreTest() { + Tensor x; return CoreTestGlobal++; } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h new file mode 100644 index 00000000000000..4b1f18655547d3 --- /dev/null +++ b/aten/src/ATen/core/Tensor.h @@ -0,0 +1,686 @@ +#pragma once + +#include "ATen/core/Device.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Storage.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/core/TensorImpl.h" +#include "ATen/core/optional.h" +#include "ATen/core/UndefinedTensorImpl.h" +#include "ATen/core/Error.h" + +namespace at { +struct Generator; +struct Type; +struct Tensor; +struct TensorOptions; +} // namespace at + +namespace at { +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +struct AT_API Tensor { + Tensor(){}; + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + } + + Tensor(const Tensor&) = default; + Tensor(Tensor&&) = default; + + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + + // The following overloads are very intruiging. Consider the following + // program: + // + // x[1] = 3; + // + // We would expect that the first entry of x is written to 3. But how can we + // actually achieve this? x[1] evaluates to a tensor... + // + // The answer is, using a ref-qualifier. x[1] is an rvalue, which cannot be + // (profitably) assigned to in the traditional sense, so we overload + // assignment to mean, "Actually, copy 3 into the tensor data." This is done + // with an rvalue-reference ref-qualified overload (the methods with && at the + // end of their type.) + // + // There's one more fly in the ointment: We also want + // + // Tensor x = y; + // + // to work, and we want it NOT to copy. So we need a traditional operator= + // overload. But we MUST specify a mutable lvalue ref-qualifier, to + // disambiguate the traditional overload from the rvalue-reference + // ref-qualified overload. Otherwise, it will be ambiguous, because + // a non ref-qualified method is eligible for all situations. 
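  // [Editor's note] The block below is an illustrative sketch of the
  // ref-qualifier technique described in the comment above; it is not part
  // of the original diff, and the names `Cell` and `slot` are hypothetical:
  //
  //   struct Cell {
  //     int* slot = nullptr;
  //     // lvalue assignment (`a = b;`): rebind the handle, no data copy
  //     Cell& operator=(const Cell& other) & { slot = other.slot; return *this; }
  //     // rvalue assignment (`container[1] = 3;`): write through to the data
  //     Cell& operator=(int v) && { *slot = v; return *this; }
  //   };
  //
  // Because an indexing operator would return a temporary `Cell` (an rvalue),
  // an expression like `container[1] = 3` selects the `&&`-qualified overload
  // and stores 3 into the referenced slot, while plain `a = b;` on named
  // objects still picks the `&`-qualified overload and only rebinds the
  // handle, exactly the disambiguation the comment above motivates.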
+ + // Unfortunately, we have to write these constructors out manually + // to work around an MSVC bug: + // error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &': + // multiple versions of a defaulted special member functions are not allowed + // Tensor& operator=(const Tensor&) & = default; + // Tensor& operator=(Tensor&&) & = default; + Tensor& operator=(const Tensor& x) & { + tensor_impl_ = x.tensor_impl_; + return *this; + } + Tensor& operator=(Tensor&& x) & { + tensor_impl_ = std::move(x.tensor_impl_); + return *this; + } + + Tensor& operator=(Scalar v) &&; + Tensor& operator=(const Tensor&) &&; + Tensor& operator=(Tensor&&) &&; + + bool is_same(const Tensor& other) const noexcept { + return tensor_impl_ == other.tensor_impl_; + } + size_t use_count() const noexcept { + return tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return tensor_impl_.weak_use_count(); + } + + const char * toString() const; + + IntList sizes() const { + return tensor_impl_->sizes(); + } + IntList strides() const { + return tensor_impl_->strides(); + } + int64_t ndimension() const { + return dim(); + } + Type & type() const { + return tensor_impl_->type(); + } + TensorTypeId type_id() const { + return tensor_impl_->type_id(); + } + ScalarType scalar_type() const { + return tensor_impl_->scalar_type(); + } + const Storage& storage() const { + return tensor_impl_->storage(); + } + Tensor toType(const Type & t, bool non_blocking=false) const; + Tensor & copy_(const Tensor & src, bool non_blocking=false); + Tensor toType(ScalarType t) const; + Tensor toBackend(Backend b) const; + + /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. + /// Defined in Type.h because of include order issues. + bool is_variable() const noexcept; + + /// Returns a `Tensor`'s layout. Defined in Type.h + Layout layout() const noexcept; + + /// Returns a `Tensor`'s dtype (`ScalarType`). Defined in Type.h + ScalarType dtype() const noexcept; + + /// Returns a `Tensor`'s device. + Device device() const; + + /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in + /// TensorOptions.h. + TensorOptions options() const; + + template + T * data() const; + + // Purposely not defined here to avoid inlining + void print() const; + + //toLongData(), toFloatData() etc. + #define TO_TYPE_DATA(T,name,_) \ + T * to##name##Data() const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) + #undef TO_TYPE_DATA + + #define TO_C_TYPE(T,name,_) \ + T toC##name () const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) + #undef TO_C_TYPE + + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. + template + TensorAccessor accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return TensorAccessor(data(),sizes().data(),strides().data()); + } + template + TensorAccessor accessor() && = delete; + + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. 
+ template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + + Tensor operator-() const; + Tensor& operator+=(const Tensor & other); + Tensor& operator+=(Scalar other); + Tensor& operator-=(const Tensor & other); + Tensor& operator-=(Scalar other); + Tensor& operator*=(const Tensor & other); + Tensor& operator*=(Scalar other); + Tensor& operator/=(const Tensor & other); + Tensor& operator/=(Scalar other); + Tensor operator[](Scalar index) const; + Tensor operator[](Tensor index) const; + Tensor operator[](int64_t index) const; + + Tensor cpu() const; + Tensor cuda() const; + + // ~~~~~ Autograd API ~~~~~ + + Tensor& set_requires_grad(bool requires_grad) { + tensor_impl_->set_requires_grad(requires_grad); + return *this; + } + bool requires_grad() const { + return tensor_impl_->requires_grad(); + } + + Tensor& grad() { + return tensor_impl_->grad(); + } + const Tensor& grad() const { + return tensor_impl_->grad(); + } + + void set_data(Tensor new_data); + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward( + at::optional gradient = at::nullopt, + bool keep_graph = false, + bool create_graph = false); + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. + + //example + //Tensor * add(Tensor & b); + int64_t storage_offset() const; + Tensor & resize_(IntList size); + Tensor & set_(Storage source); + Tensor & set_(Storage source, int64_t storage_offset, IntList size, IntList stride={}); + Tensor & set_(const Tensor & source); + Tensor & set_(); + bool is_contiguous() const; + bool is_set_to(const Tensor & tensor) const; + Tensor & masked_fill_(const Tensor & mask, Scalar value); + Tensor & masked_fill_(const Tensor & mask, const Tensor & value); + Tensor & masked_scatter_(const Tensor & mask, const Tensor & source); + Tensor masked_select(const Tensor & mask) const; + Tensor nonzero() const; + Tensor contiguous() const; + Tensor view(IntList size) const; + Tensor index_select(int64_t dim, const Tensor & index) const; + Tensor take(const Tensor & index) const; + Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false); + Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value); + Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value); + Tensor unfold(int64_t dimension, int64_t size, int64_t step) const; + Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value); + Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor gather(int64_t dim, const Tensor & index) const; + void* data_ptr() const; + bool equal(const Tensor & other) const; + Tensor __and__(Scalar other) const; + Tensor __and__(const Tensor & other) const; + Tensor & __iand__(Scalar other); + Tensor & __iand__(const Tensor & other); + Tensor __or__(Scalar other) const; + Tensor __or__(const Tensor & other) const; + Tensor & __ior__(Scalar other); + Tensor & __ior__(const Tensor & 
other); + Tensor __xor__(Scalar other) const; + Tensor __xor__(const Tensor & other) const; + Tensor & __ixor__(Scalar other); + Tensor & __ixor__(const Tensor & other); + Tensor __lshift__(Scalar other) const; + Tensor __lshift__(const Tensor & other) const; + Tensor & __ilshift__(Scalar other); + Tensor & __ilshift__(const Tensor & other); + Tensor __rshift__(Scalar other) const; + Tensor __rshift__(const Tensor & other) const; + Tensor & __irshift__(Scalar other); + Tensor & __irshift__(const Tensor & other); + Tensor lt(Scalar other) const; + Tensor lt(const Tensor & other) const; + Tensor & lt_(Scalar other); + Tensor & lt_(const Tensor & other); + Tensor gt(Scalar other) const; + Tensor gt(const Tensor & other) const; + Tensor & gt_(Scalar other); + Tensor & gt_(const Tensor & other); + Tensor le(Scalar other) const; + Tensor le(const Tensor & other) const; + Tensor & le_(Scalar other); + Tensor & le_(const Tensor & other); + Tensor ge(Scalar other) const; + Tensor ge(const Tensor & other) const; + Tensor & ge_(Scalar other); + Tensor & ge_(const Tensor & other); + Tensor eq(Scalar other) const; + Tensor eq(const Tensor & other) const; + Tensor & eq_(Scalar other); + Tensor & eq_(const Tensor & other); + Tensor ne(Scalar other) const; + Tensor ne(const Tensor & other) const; + Tensor & ne_(Scalar other); + Tensor & ne_(const Tensor & other); + Tensor min(const Tensor & other) const; + Tensor min() const; + Tensor max(const Tensor & other) const; + Tensor max() const; + Tensor median() const; + std::tuple sort(int64_t dim=-1, bool descending=false) const; + std::tuple topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + Tensor all() const; + Tensor any() const; + Tensor lgamma() const; + Tensor & lgamma_(); + Tensor digamma() const; + Tensor & digamma_(); + Tensor polygamma(int64_t n) const; + Tensor & polygamma_(int64_t n); + Tensor & erfinv_(); + Tensor erfinv() const; + Tensor & frac_(); + Tensor frac() const; + Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm); + Tensor dist(const Tensor & other, Scalar p=2) const; + Tensor reciprocal() const; + Tensor & reciprocal_(); + Tensor neg() const; + Tensor & neg_(); + Tensor atan2(const Tensor & other) const; + Tensor & atan2_(const Tensor & other); + Tensor pow(const Tensor & exponent) const; + Tensor & pow_(Scalar exponent); + Tensor & pow_(const Tensor & exponent); + Tensor lerp(const Tensor & end, Scalar weight) const; + Tensor & lerp_(const Tensor & end, Scalar weight); + Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const; + Tensor sign() const; + Tensor & sign_(); + Tensor trace() const; + Tensor fmod(Scalar other) const; + Tensor fmod(const Tensor & other) const; + Tensor & fmod_(Scalar other); + Tensor & fmod_(const Tensor & other); + Tensor remainder(Scalar other) const; + Tensor remainder(const Tensor & other) const; + Tensor & remainder_(Scalar other); + Tensor & remainder_(const Tensor & other); + Tensor tril(int64_t diagonal=0) const; + Tensor & tril_(int64_t diagonal=0); + Tensor triu(int64_t diagonal=0) const; + Tensor & triu_(int64_t diagonal=0); + Tensor cross(const Tensor & other, int64_t dim=-1) const; + Tensor diag(int64_t diagonal=0) const; + Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar 
value=1) const; + Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + std::tuple gels(const Tensor & A) const; + std::tuple trtrs(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + std::tuple symeig(bool eigenvectors=false, bool upper=true) const; + std::tuple eig(bool eigenvectors=false) const; + std::tuple svd(bool some=true) const; + Tensor potrf(bool upper=true) const; + Tensor potrs(const Tensor & input2, bool upper=true) const; + Tensor potri(bool upper=true) const; + std::tuple pstrf(bool upper=true, Scalar tol=-1) const; + std::tuple qr() const; + std::tuple geqrf() const; + Tensor orgqr(const Tensor & input2) const; + Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + std::tuple btrifact(bool pivot=true) const; + std::tuple btrifact_with_info(bool pivot=true) const; + Tensor btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const; + Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr); + Tensor & random_(int64_t to, Generator * generator=nullptr); + Tensor & random_(Generator * generator=nullptr); + Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr); + Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr); + Tensor & cauchy_(double median=0, double sigma=1, Generator * generator=nullptr); + Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); + Tensor & exponential_(double lambd=1, Generator * generator=nullptr); + Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor abs() const; + Tensor & abs_(); + Tensor acos() const; + Tensor & acos_(); + Tensor add(const Tensor & other, Scalar alpha=1) const; + Tensor & add_(const Tensor & other, Scalar alpha=1); + Tensor add(Scalar other, Scalar alpha=1) const; + Tensor & add_(Scalar other, Scalar alpha=1); + Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor all(int64_t dim, bool keepdim=false) const; + bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + Tensor any(int64_t dim, bool keepdim=false) const; + Tensor argmax(int64_t dim, bool keepdim=false) const; + Tensor argmax() const; + Tensor argmin(int64_t dim, bool keepdim=false) const; + Tensor argmin() const; + Tensor as_strided(IntList size, IntList stride) const; + Tensor & as_strided_(IntList size, IntList stride); + Tensor as_strided(IntList size, IntList stride, int64_t storage_offset) const; + Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset); + Tensor asin() const; + Tensor & asin_(); + Tensor atan() const; + Tensor & atan_(); + Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor bernoulli(const Tensor & p, 
Generator * generator=nullptr) const; + Tensor bernoulli(double p, Generator * generator=nullptr) const; + Tensor bernoulli() const; + Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr); + Tensor & bernoulli_(double p, Generator * generator=nullptr); + Tensor & bernoulli_(); + Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const; + Tensor bmm(const Tensor & mat2) const; + Tensor ceil() const; + Tensor & ceil_(); + std::vector chunk(int64_t chunks, int64_t dim=0) const; + Tensor clamp(Scalar min, Scalar max) const; + Tensor & clamp_(Scalar min, Scalar max); + Tensor clamp_max(Scalar max) const; + Tensor & clamp_max_(Scalar max); + Tensor clamp_min(Scalar min) const; + Tensor & clamp_min_(Scalar min); + Tensor cos() const; + Tensor & cos_(); + Tensor cosh() const; + Tensor & cosh_(); + Tensor cumsum(int64_t dim, ScalarType dtype) const; + Tensor cumsum(int64_t dim) const; + Tensor cumprod(int64_t dim, ScalarType dtype) const; + Tensor cumprod(int64_t dim) const; + Tensor det() const; + Tensor diagflat(int64_t offset=0) const; + Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const; + Tensor div(const Tensor & other) const; + Tensor & div_(const Tensor & other); + Tensor div(Scalar other) const; + Tensor & div_(Scalar other); + Tensor dot(const Tensor & tensor) const; + Tensor erf() const; + Tensor & erf_(); + Tensor erfc() const; + Tensor & erfc_(); + Tensor exp() const; + Tensor & exp_(); + Tensor expm1() const; + Tensor & expm1_(); + Tensor expand(IntList size, bool implicit=false) const; + Tensor expand_as(const Tensor & other) const; + Tensor flatten(int64_t start_dim=0, int64_t end_dim=-1) const; + Tensor & fill_(Scalar value); + Tensor & fill_(const Tensor & value); + Tensor floor() const; + Tensor & floor_(); + Tensor ger(const Tensor & vec2) const; + std::tuple gesv(const Tensor & A) const; + Tensor fft(int64_t signal_ndim, bool normalized=false) const; + Tensor ifft(int64_t signal_ndim, bool normalized=false) const; + Tensor rfft(int64_t signal_ndim, bool normalized=false, bool onesided=true) const; + Tensor irfft(int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const; + Tensor index(TensorList indices) const; + Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor index_put(TensorList indices, const Tensor & values) const; + Tensor & index_put_(TensorList indices, const Tensor & values); + Tensor inverse() const; + Tensor isclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + bool is_cuda() const; + bool is_distributed() const; + bool is_floating_point() const; + bool is_complex() const; + bool is_nonzero() const; + bool is_same_size(const Tensor & other) const; + bool is_signed() const; + bool is_sparse() const; + std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const; + Tensor log() const; + Tensor & log_(); + Tensor log10() const; + Tensor & log10_(); + Tensor log1p() const; + Tensor & log1p_(); + Tensor log2() const; + Tensor & log2_(); + Tensor logdet() const; + Tensor log_softmax(int64_t dim) const; + Tensor logsumexp(int64_t dim, bool keepdim=false) const; + Tensor matmul(const Tensor & other) const; + Tensor matrix_power(int64_t n) const; + std::tuple max(int64_t dim, bool keepdim=false) const; + Tensor max_values(int64_t dim, bool keepdim=false) const; + Tensor mean(ScalarType dtype) const; + Tensor mean() const; + Tensor mean(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor 
mean(int64_t dim, bool keepdim=false) const; + Tensor mean(int64_t dim, ScalarType dtype) const; + std::tuple median(int64_t dim, bool keepdim=false) const; + std::tuple min(int64_t dim, bool keepdim=false) const; + Tensor min_values(int64_t dim, bool keepdim=false) const; + Tensor mm(const Tensor & mat2) const; + std::tuple mode(int64_t dim=-1, bool keepdim=false) const; + Tensor mul(const Tensor & other) const; + Tensor & mul_(const Tensor & other); + Tensor mul(Scalar other) const; + Tensor & mul_(Scalar other); + Tensor mv(const Tensor & vec) const; + Tensor mvlgamma(int64_t p) const; + Tensor & mvlgamma_(int64_t p); + Tensor narrow(int64_t dim, int64_t start, int64_t length) const; + Tensor permute(IntList dims) const; + Tensor pin_memory() const; + Tensor pinverse(double rcond=1e-15) const; + Tensor repeat(IntList repeats) const; + Tensor reshape(IntList shape) const; + Tensor reshape_as(const Tensor & other) const; + Tensor round() const; + Tensor & round_(); + Tensor relu() const; + Tensor & relu_(); + Tensor hardshrink(Scalar lambd=0.5) const; + Tensor hardshrink_backward(const Tensor & grad_out, Scalar lambd) const; + Tensor rsqrt() const; + Tensor & rsqrt_(); + Tensor select(int64_t dim, int64_t index) const; + Tensor sigmoid() const; + Tensor & sigmoid_(); + Tensor sin() const; + Tensor & sin_(); + Tensor sinh() const; + Tensor & sinh_(); + Tensor detach() const; + Tensor & detach_(); + int64_t size(int64_t dim) const; + Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + std::tuple slogdet() const; + Tensor smm(const Tensor & mat2) const; + Tensor softmax(int64_t dim) const; + std::vector split(int64_t split_size, int64_t dim=0) const; + std::vector split_with_sizes(IntList split_sizes, int64_t dim=0) const; + Tensor squeeze() const; + Tensor squeeze(int64_t dim) const; + Tensor & squeeze_(); + Tensor & squeeze_(int64_t dim); + Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const; + int64_t stride(int64_t dim) const; + Tensor sum(ScalarType dtype) const; + Tensor sum() const; + Tensor sum(IntList dim, bool keepdim, ScalarType dtype) const; + Tensor sum(IntList dim, bool keepdim=false) const; + Tensor sum(IntList dim, ScalarType dtype) const; + Tensor sqrt() const; + Tensor & sqrt_(); + Tensor std(bool unbiased=true) const; + Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor prod(ScalarType dtype) const; + Tensor prod() const; + Tensor prod(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor prod(int64_t dim, bool keepdim=false) const; + Tensor prod(int64_t dim, ScalarType dtype) const; + Tensor t() const; + Tensor & t_(); + Tensor tan() const; + Tensor & tan_(); + Tensor tanh() const; + Tensor & tanh_(); + Tensor transpose(int64_t dim0, int64_t dim1) const; + Tensor & transpose_(int64_t dim0, int64_t dim1); + Tensor flip(IntList dims) const; + Tensor rot90(int64_t k=1, IntList dims={0,1}) const; + Tensor trunc() const; + Tensor & trunc_(); + Tensor type_as(const Tensor & other) const; + Tensor unsqueeze(int64_t dim) const; + Tensor & unsqueeze_(int64_t dim); + Tensor var(bool unbiased=true) const; + Tensor var(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor view_as(const Tensor & other) const; + Tensor where(const Tensor & condition, const Tensor & other) const; + Tensor norm(Scalar 
p=2) const; + Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const; + Tensor clone() const; + Tensor & resize_as_(const Tensor & the_template); + Tensor pow(Scalar exponent) const; + Tensor & zero_(); + Tensor sub(const Tensor & other, Scalar alpha=1) const; + Tensor & sub_(const Tensor & other, Scalar alpha=1); + Tensor sub(Scalar other, Scalar alpha=1) const; + Tensor & sub_(Scalar other, Scalar alpha=1); + Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor & sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor & sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor sparse_mask(SparseTensorRef mask) const; + Tensor to_dense() const; + int64_t _sparseDims() const; + int64_t _denseDims() const; + int64_t _nnz() const; + Tensor coalesce() const; + bool is_coalesced() const; + Tensor _indices() const; + Tensor _values() const; + int64_t numel() const; + std::vector unbind(int64_t dim=0) const; + int64_t get_device() const; + Tensor to(Device device, ScalarType dtype, bool non_blocking=false) const; + Tensor to(ScalarType dtype, bool non_blocking=false) const; + Tensor to(Device device, bool non_blocking=false) const; + Tensor to(const Tensor & other, bool non_blocking=false) const; + Scalar _local_scalar() const; + + template + auto m(F func, Args&&... params) const -> decltype(func(*this, std::forward(params)...)) { + return func(*this, std::forward(params)...); + } + + friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; +}; + +struct AT_API WeakTensor { + WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} + + // XXX: this can return undefined tensors + // Ideally it would be at::optional, but MSVC is too cool for that + Tensor lock() const { + return Tensor(weak_tensor_impl_.lock()); + } + + bool is_same(const WeakTensor& other) const noexcept { + return weak_tensor_impl_ == other.weak_tensor_impl_; + } + + size_t use_count() const noexcept { + return weak_tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return weak_tensor_impl_.weak_use_count(); + } + + TensorImpl* unsafeGetTensorImpl() const { + return weak_tensor_impl_._unsafe_get_target(); + } + +private: + c10::weak_intrusive_ptr weak_tensor_impl_; +}; +} // namespace at diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h new file mode 100644 index 00000000000000..aba169279e6d2b --- /dev/null +++ b/aten/src/ATen/core/TensorMethods.h @@ -0,0 +1,1242 @@ +#pragma once + +#include "ATen/core/Tensor.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Type.h" +#include "ATen/core/TensorOptions.h" + +namespace at { + +inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { + if(type() == t) + return *this; + return t.copy(*this, non_blocking); +} + +inline Tensor Tensor::cpu() const { + return toType(type().cpu()); +} + +inline Tensor Tensor::cuda() const { + return toType(type().cuda()); +} + +inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { + return type().copy_(*this, src, non_blocking); +} + +inline Tensor Tensor::toType(ScalarType t) const { + return toType(type().toScalarType(t)); +} + +inline Tensor Tensor::toBackend(Backend b) const { + return toType(type().toBackend(b)); +} + +inline TensorOptions Tensor::options() const { + return 
TensorOptions().dtype(dtype()) + .device(device()) + .layout(layout()) + .is_variable(is_variable()); +} + +inline void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + type().backward(*this, std::move(gradient), keep_graph, create_graph); +} + +inline void Tensor::set_data(Tensor new_data) { + type().set_data(*this, new_data); +} + +// all static inline to allow for inlining of the non-dynamic part of dispatch +inline int64_t Tensor::storage_offset() const { + return type().storage_offset(*this); +} +inline Tensor & Tensor::resize_(IntList size) { + return type().resize_(*this, size); +} +inline Tensor & Tensor::set_(Storage source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_(Storage source, int64_t storage_offset, IntList size, IntList stride) { + return type().set_(*this, source, storage_offset, size, stride); +} +inline Tensor & Tensor::set_(const Tensor & source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_() { + return type().set_(*this); +} +inline bool Tensor::is_contiguous() const { + return type().is_contiguous(*this); +} +inline bool Tensor::is_set_to(const Tensor & tensor) const { + return type().is_set_to(*this, tensor); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, Scalar value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, const Tensor & value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_scatter_(const Tensor & mask, const Tensor & source) { + return type().masked_scatter_(*this, mask, source); +} +inline Tensor Tensor::masked_select(const Tensor & mask) const { + return type().masked_select(*this, mask); +} +inline Tensor Tensor::nonzero() const { + return type().nonzero(*this); +} +inline Tensor Tensor::contiguous() const { + return type().contiguous(*this); +} +inline Tensor Tensor::view(IntList size) const { + return type().view(*this, size); +} +inline Tensor Tensor::index_select(int64_t dim, const Tensor & index) const { + return type().index_select(*this, dim, index); +} +inline Tensor Tensor::take(const Tensor & index) const { + return type().take(*this, index); +} +inline Tensor & Tensor::put_(const Tensor & index, const Tensor & source, bool accumulate) { + return type().put_(*this, index, source, accumulate); +} +inline Tensor & Tensor::index_add_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_add_(*this, dim, index, source); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, Scalar value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, const Tensor & value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor Tensor::unfold(int64_t dimension, int64_t size, int64_t step) const { + return type().unfold(*this, dimension, size, step); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_(*this, dim, index, src); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, Scalar value) { + return type().scatter_(*this, dim, index, value); +} +inline Tensor & Tensor::scatter_add_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_add_(*this, dim, index, src); +} +inline Tensor Tensor::gather(int64_t dim, const Tensor & index) const { + return type().gather(*this, dim, index); +} +inline 
void* Tensor::data_ptr() const { + return type().data_ptr(*this); +} +inline bool Tensor::equal(const Tensor & other) const { + return type().equal(*this, other); +} +inline Tensor Tensor::__and__(Scalar other) const { + return type().__and__(*this, other); +} +inline Tensor Tensor::__and__(const Tensor & other) const { + return type().__and__(*this, other); +} +inline Tensor & Tensor::__iand__(Scalar other) { + return type().__iand__(*this, other); +} +inline Tensor & Tensor::__iand__(const Tensor & other) { + return type().__iand__(*this, other); +} +inline Tensor Tensor::__or__(Scalar other) const { + return type().__or__(*this, other); +} +inline Tensor Tensor::__or__(const Tensor & other) const { + return type().__or__(*this, other); +} +inline Tensor & Tensor::__ior__(Scalar other) { + return type().__ior__(*this, other); +} +inline Tensor & Tensor::__ior__(const Tensor & other) { + return type().__ior__(*this, other); +} +inline Tensor Tensor::__xor__(Scalar other) const { + return type().__xor__(*this, other); +} +inline Tensor Tensor::__xor__(const Tensor & other) const { + return type().__xor__(*this, other); +} +inline Tensor & Tensor::__ixor__(Scalar other) { + return type().__ixor__(*this, other); +} +inline Tensor & Tensor::__ixor__(const Tensor & other) { + return type().__ixor__(*this, other); +} +inline Tensor Tensor::__lshift__(Scalar other) const { + return type().__lshift__(*this, other); +} +inline Tensor Tensor::__lshift__(const Tensor & other) const { + return type().__lshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(Scalar other) { + return type().__ilshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(const Tensor & other) { + return type().__ilshift__(*this, other); +} +inline Tensor Tensor::__rshift__(Scalar other) const { + return type().__rshift__(*this, other); +} +inline Tensor Tensor::__rshift__(const Tensor & other) const { + return type().__rshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(Scalar other) { + return type().__irshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(const Tensor & other) { + return type().__irshift__(*this, other); +} +inline Tensor Tensor::lt(Scalar other) const { + return type().lt(*this, other); +} +inline Tensor Tensor::lt(const Tensor & other) const { + return type().lt(*this, other); +} +inline Tensor & Tensor::lt_(Scalar other) { + return type().lt_(*this, other); +} +inline Tensor & Tensor::lt_(const Tensor & other) { + return type().lt_(*this, other); +} +inline Tensor Tensor::gt(Scalar other) const { + return type().gt(*this, other); +} +inline Tensor Tensor::gt(const Tensor & other) const { + return type().gt(*this, other); +} +inline Tensor & Tensor::gt_(Scalar other) { + return type().gt_(*this, other); +} +inline Tensor & Tensor::gt_(const Tensor & other) { + return type().gt_(*this, other); +} +inline Tensor Tensor::le(Scalar other) const { + return type().le(*this, other); +} +inline Tensor Tensor::le(const Tensor & other) const { + return type().le(*this, other); +} +inline Tensor & Tensor::le_(Scalar other) { + return type().le_(*this, other); +} +inline Tensor & Tensor::le_(const Tensor & other) { + return type().le_(*this, other); +} +inline Tensor Tensor::ge(Scalar other) const { + return type().ge(*this, other); +} +inline Tensor Tensor::ge(const Tensor & other) const { + return type().ge(*this, other); +} +inline Tensor & Tensor::ge_(Scalar other) { + return type().ge_(*this, other); +} +inline Tensor & Tensor::ge_(const Tensor & other) { + return 
type().ge_(*this, other); +} +inline Tensor Tensor::eq(Scalar other) const { + return type().eq(*this, other); +} +inline Tensor Tensor::eq(const Tensor & other) const { + return type().eq(*this, other); +} +inline Tensor & Tensor::eq_(Scalar other) { + return type().eq_(*this, other); +} +inline Tensor & Tensor::eq_(const Tensor & other) { + return type().eq_(*this, other); +} +inline Tensor Tensor::ne(Scalar other) const { + return type().ne(*this, other); +} +inline Tensor Tensor::ne(const Tensor & other) const { + return type().ne(*this, other); +} +inline Tensor & Tensor::ne_(Scalar other) { + return type().ne_(*this, other); +} +inline Tensor & Tensor::ne_(const Tensor & other) { + return type().ne_(*this, other); +} +inline Tensor Tensor::min(const Tensor & other) const { + return type().min(*this, other); +} +inline Tensor Tensor::min() const { + return type().min(*this); +} +inline Tensor Tensor::max(const Tensor & other) const { + return type().max(*this, other); +} +inline Tensor Tensor::max() const { + return type().max(*this); +} +inline Tensor Tensor::median() const { + return type().median(*this); +} +inline std::tuple Tensor::sort(int64_t dim, bool descending) const { + return type().sort(*this, dim, descending); +} +inline std::tuple Tensor::topk(int64_t k, int64_t dim, bool largest, bool sorted) const { + return type().topk(*this, k, dim, largest, sorted); +} +inline Tensor Tensor::all() const { + return type().all(*this); +} +inline Tensor Tensor::any() const { + return type().any(*this); +} +inline Tensor Tensor::lgamma() const { + return type().lgamma(*this); +} +inline Tensor & Tensor::lgamma_() { + return type().lgamma_(*this); +} +inline Tensor Tensor::digamma() const { + return type().digamma(*this); +} +inline Tensor & Tensor::digamma_() { + return type().digamma_(*this); +} +inline Tensor Tensor::polygamma(int64_t n) const { + return type().polygamma(n, *this); +} +inline Tensor & Tensor::polygamma_(int64_t n) { + return type().polygamma_(*this, n); +} +inline Tensor & Tensor::erfinv_() { + return type().erfinv_(*this); +} +inline Tensor Tensor::erfinv() const { + return type().erfinv(*this); +} +inline Tensor & Tensor::frac_() { + return type().frac_(*this); +} +inline Tensor Tensor::frac() const { + return type().frac(*this); +} +inline Tensor Tensor::renorm(Scalar p, int64_t dim, Scalar maxnorm) const { + return type().renorm(*this, p, dim, maxnorm); +} +inline Tensor & Tensor::renorm_(Scalar p, int64_t dim, Scalar maxnorm) { + return type().renorm_(*this, p, dim, maxnorm); +} +inline Tensor Tensor::dist(const Tensor & other, Scalar p) const { + return type().dist(*this, other, p); +} +inline Tensor Tensor::reciprocal() const { + return type().reciprocal(*this); +} +inline Tensor & Tensor::reciprocal_() { + return type().reciprocal_(*this); +} +inline Tensor Tensor::neg() const { + return type().neg(*this); +} +inline Tensor & Tensor::neg_() { + return type().neg_(*this); +} +inline Tensor Tensor::atan2(const Tensor & other) const { + return type().atan2(*this, other); +} +inline Tensor & Tensor::atan2_(const Tensor & other) { + return type().atan2_(*this, other); +} +inline Tensor Tensor::pow(const Tensor & exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::pow_(Scalar exponent) { + return type().pow_(*this, exponent); +} +inline Tensor & Tensor::pow_(const Tensor & exponent) { + return type().pow_(*this, exponent); +} +inline Tensor Tensor::lerp(const Tensor & end, Scalar weight) const { + return type().lerp(*this, end, 
weight); +} +inline Tensor & Tensor::lerp_(const Tensor & end, Scalar weight) { + return type().lerp_(*this, end, weight); +} +inline Tensor Tensor::histc(int64_t bins, Scalar min, Scalar max) const { + return type().histc(*this, bins, min, max); +} +inline Tensor Tensor::sign() const { + return type().sign(*this); +} +inline Tensor & Tensor::sign_() { + return type().sign_(*this); +} +inline Tensor Tensor::trace() const { + return type().trace(*this); +} +inline Tensor Tensor::fmod(Scalar other) const { + return type().fmod(*this, other); +} +inline Tensor Tensor::fmod(const Tensor & other) const { + return type().fmod(*this, other); +} +inline Tensor & Tensor::fmod_(Scalar other) { + return type().fmod_(*this, other); +} +inline Tensor & Tensor::fmod_(const Tensor & other) { + return type().fmod_(*this, other); +} +inline Tensor Tensor::remainder(Scalar other) const { + return type().remainder(*this, other); +} +inline Tensor Tensor::remainder(const Tensor & other) const { + return type().remainder(*this, other); +} +inline Tensor & Tensor::remainder_(Scalar other) { + return type().remainder_(*this, other); +} +inline Tensor & Tensor::remainder_(const Tensor & other) { + return type().remainder_(*this, other); +} +inline Tensor Tensor::tril(int64_t diagonal) const { + return type().tril(*this, diagonal); +} +inline Tensor & Tensor::tril_(int64_t diagonal) { + return type().tril_(*this, diagonal); +} +inline Tensor Tensor::triu(int64_t diagonal) const { + return type().triu(*this, diagonal); +} +inline Tensor & Tensor::triu_(int64_t diagonal) { + return type().triu_(*this, diagonal); +} +inline Tensor Tensor::cross(const Tensor & other, int64_t dim) const { + return type().cross(*this, other, dim); +} +inline Tensor Tensor::diag(int64_t diagonal) const { + return type().diag(*this, diagonal); +} +inline Tensor Tensor::addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().addbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().addbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcmul(*this, tensor1, tensor2, value); +} +inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcmul_(*this, tensor1, tensor2, value); +} +inline Tensor Tensor::addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcdiv(*this, tensor1, tensor2, value); +} +inline Tensor & Tensor::addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcdiv_(*this, tensor1, tensor2, value); +} +inline std::tuple Tensor::gels(const Tensor & A) const { + return type().gels(*this, A); +} +inline std::tuple Tensor::trtrs(const Tensor & A, bool upper, bool transpose, bool unitriangular) const { + return type().trtrs(*this, A, upper, transpose, unitriangular); +} +inline std::tuple Tensor::symeig(bool eigenvectors, bool upper) const { + return type().symeig(*this, eigenvectors, upper); +} +inline std::tuple Tensor::eig(bool eigenvectors) const { + return type().eig(*this, eigenvectors); +} +inline std::tuple Tensor::svd(bool some) const { + return type().svd(*this, some); +} +inline Tensor Tensor::potrf(bool upper) const { + return type().potrf(*this, upper); +} +inline Tensor Tensor::potrs(const Tensor & input2, bool upper) 
const { + return type().potrs(*this, input2, upper); +} +inline Tensor Tensor::potri(bool upper) const { + return type().potri(*this, upper); +} +inline std::tuple Tensor::pstrf(bool upper, Scalar tol) const { + return type().pstrf(*this, upper, tol); +} +inline std::tuple Tensor::qr() const { + return type().qr(*this); +} +inline std::tuple Tensor::geqrf() const { + return type().geqrf(*this); +} +inline Tensor Tensor::orgqr(const Tensor & input2) const { + return type().orgqr(*this, input2); +} +inline Tensor Tensor::ormqr(const Tensor & input2, const Tensor & input3, bool left, bool transpose) const { + return type().ormqr(*this, input2, input3, left, transpose); +} +inline std::tuple Tensor::btrifact(bool pivot) const { + return type().btrifact(*this, pivot); +} +inline std::tuple Tensor::btrifact_with_info(bool pivot) const { + return type().btrifact_with_info(*this, pivot); +} +inline Tensor Tensor::btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const { + return type().btrisolve(*this, LU_data, LU_pivots); +} +inline Tensor & Tensor::random_(int64_t from, int64_t to, Generator * generator) { + return type().random_(*this, from, to, generator); +} +inline Tensor & Tensor::random_(int64_t to, Generator * generator) { + return type().random_(*this, to, generator); +} +inline Tensor & Tensor::random_(Generator * generator) { + return type().random_(*this, generator); +} +inline Tensor Tensor::multinomial(int64_t num_samples, bool replacement, Generator * generator) const { + return type().multinomial(*this, num_samples, replacement, generator); +} +inline Tensor & Tensor::uniform_(double from, double to, Generator * generator) { + return type().uniform_(*this, from, to, generator); +} +inline Tensor & Tensor::normal_(double mean, double std, Generator * generator) { + return type().normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::cauchy_(double median, double sigma, Generator * generator) { + return type().cauchy_(*this, median, sigma, generator); +} +inline Tensor & Tensor::log_normal_(double mean, double std, Generator * generator) { + return type().log_normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::exponential_(double lambd, Generator * generator) { + return type().exponential_(*this, lambd, generator); +} +inline Tensor & Tensor::geometric_(double p, Generator * generator) { + return type().geometric_(*this, p, generator); +} +inline Tensor Tensor::abs() const { + return type().abs(*this); +} +inline Tensor & Tensor::abs_() { + return type().abs_(*this); +} +inline Tensor Tensor::acos() const { + return type().acos(*this); +} +inline Tensor & Tensor::acos_() { + return type().acos_(*this); +} +inline Tensor Tensor::add(const Tensor & other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(const Tensor & other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::add(Scalar other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(Scalar other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::addmv(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const { + return type().addmv(*this, mat, vec, beta, alpha); +} +inline Tensor & Tensor::addmv_(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return type().addmv_(*this, mat, vec, beta, alpha); +} +inline Tensor Tensor::addr(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) 
const { + return type().addr(*this, vec1, vec2, beta, alpha); +} +inline Tensor & Tensor::addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return type().addr_(*this, vec1, vec2, beta, alpha); +} +inline Tensor Tensor::all(int64_t dim, bool keepdim) const { + return type().all(*this, dim, keepdim); +} +inline bool Tensor::allclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().allclose(*this, other, rtol, atol, equal_nan); +} +inline Tensor Tensor::any(int64_t dim, bool keepdim) const { + return type().any(*this, dim, keepdim); +} +inline Tensor Tensor::argmax(int64_t dim, bool keepdim) const { + return type().argmax(*this, dim, keepdim); +} +inline Tensor Tensor::argmax() const { + return type().argmax(*this); +} +inline Tensor Tensor::argmin(int64_t dim, bool keepdim) const { + return type().argmin(*this, dim, keepdim); +} +inline Tensor Tensor::argmin() const { + return type().argmin(*this); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride) const { + return type().as_strided(*this, size, stride); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride) { + return type().as_strided_(*this, size, stride); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride, int64_t storage_offset) const { + return type().as_strided(*this, size, stride, storage_offset); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride, int64_t storage_offset) { + return type().as_strided_(*this, size, stride, storage_offset); +} +inline Tensor Tensor::asin() const { + return type().asin(*this); +} +inline Tensor & Tensor::asin_() { + return type().asin_(*this); +} +inline Tensor Tensor::atan() const { + return type().atan(*this); +} +inline Tensor & Tensor::atan_() { + return type().atan_(*this); +} +inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().baddbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().baddbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::bernoulli(const Tensor & p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} +inline Tensor Tensor::bernoulli(double p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} +inline Tensor Tensor::bernoulli() const { + return type().bernoulli(*this); +} +inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_(double p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_() { + return type().bernoulli_(*this); +} +inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const { + return type().bincount(*this, weights, minlength); +} +inline Tensor Tensor::bmm(const Tensor & mat2) const { + return type().bmm(*this, mat2); +} +inline Tensor Tensor::ceil() const { + return type().ceil(*this); +} +inline Tensor & Tensor::ceil_() { + return type().ceil_(*this); +} +inline std::vector Tensor::chunk(int64_t chunks, int64_t dim) const { + return type().chunk(*this, chunks, dim); +} +inline Tensor Tensor::clamp(Scalar min, Scalar max) const { + return type().clamp(*this, min, max); +} +inline Tensor & Tensor::clamp_(Scalar min, Scalar max) { + return type().clamp_(*this, min, max); +} +inline 
Tensor Tensor::clamp_max(Scalar max) const { + return type().clamp_max(*this, max); +} +inline Tensor & Tensor::clamp_max_(Scalar max) { + return type().clamp_max_(*this, max); +} +inline Tensor Tensor::clamp_min(Scalar min) const { + return type().clamp_min(*this, min); +} +inline Tensor & Tensor::clamp_min_(Scalar min) { + return type().clamp_min_(*this, min); +} +inline Tensor Tensor::cos() const { + return type().cos(*this); +} +inline Tensor & Tensor::cos_() { + return type().cos_(*this); +} +inline Tensor Tensor::cosh() const { + return type().cosh(*this); +} +inline Tensor & Tensor::cosh_() { + return type().cosh_(*this); +} +inline Tensor Tensor::cumsum(int64_t dim, ScalarType dtype) const { + return type().cumsum(*this, dim, dtype); +} +inline Tensor Tensor::cumsum(int64_t dim) const { + return type().cumsum(*this, dim); +} +inline Tensor Tensor::cumprod(int64_t dim, ScalarType dtype) const { + return type().cumprod(*this, dim, dtype); +} +inline Tensor Tensor::cumprod(int64_t dim) const { + return type().cumprod(*this, dim); +} +inline Tensor Tensor::det() const { + return type().det(*this); +} +inline Tensor Tensor::diagflat(int64_t offset) const { + return type().diagflat(*this, offset); +} +inline Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const { + return type().diagonal(*this, offset, dim1, dim2); +} +inline Tensor Tensor::div(const Tensor & other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(const Tensor & other) { + return type().div_(*this, other); +} +inline Tensor Tensor::div(Scalar other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(Scalar other) { + return type().div_(*this, other); +} +inline Tensor Tensor::dot(const Tensor & tensor) const { + return type().dot(*this, tensor); +} +inline Tensor Tensor::erf() const { + return type().erf(*this); +} +inline Tensor & Tensor::erf_() { + return type().erf_(*this); +} +inline Tensor Tensor::erfc() const { + return type().erfc(*this); +} +inline Tensor & Tensor::erfc_() { + return type().erfc_(*this); +} +inline Tensor Tensor::exp() const { + return type().exp(*this); +} +inline Tensor & Tensor::exp_() { + return type().exp_(*this); +} +inline Tensor Tensor::expm1() const { + return type().expm1(*this); +} +inline Tensor & Tensor::expm1_() { + return type().expm1_(*this); +} +inline Tensor Tensor::expand(IntList size, bool implicit) const { + return type().expand(*this, size, implicit); +} +inline Tensor Tensor::expand_as(const Tensor & other) const { + return type().expand_as(*this, other); +} +inline Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim) const { + return type().flatten(*this, start_dim, end_dim); +} +inline Tensor & Tensor::fill_(Scalar value) { + return type().fill_(*this, value); +} +inline Tensor & Tensor::fill_(const Tensor & value) { + return type().fill_(*this, value); +} +inline Tensor Tensor::floor() const { + return type().floor(*this); +} +inline Tensor & Tensor::floor_() { + return type().floor_(*this); +} +inline Tensor Tensor::ger(const Tensor & vec2) const { + return type().ger(*this, vec2); +} +inline std::tuple Tensor::gesv(const Tensor & A) const { + return type().gesv(*this, A); +} +inline Tensor Tensor::fft(int64_t signal_ndim, bool normalized) const { + return type().fft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::ifft(int64_t signal_ndim, bool normalized) const { + return type().ifft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::rfft(int64_t signal_ndim, bool 
normalized, bool onesided) const { + return type().rfft(*this, signal_ndim, normalized, onesided); +} +inline Tensor Tensor::irfft(int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const { + return type().irfft(*this, signal_ndim, normalized, onesided, signal_sizes); +} +inline Tensor Tensor::index(TensorList indices) const { + return type().index(*this, indices); +} +inline Tensor & Tensor::index_copy_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_copy_(*this, dim, index, source); +} +inline Tensor Tensor::index_put(TensorList indices, const Tensor & values) const { + return type().index_put(*this, indices, values); +} +inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values) { + return type().index_put_(*this, indices, values); +} +inline Tensor Tensor::inverse() const { + return type().inverse(*this); +} +inline Tensor Tensor::isclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().isclose(*this, other, rtol, atol, equal_nan); +} +inline bool Tensor::is_cuda() const { + return type().is_cuda(*this); +} +inline bool Tensor::is_distributed() const { + return type().is_distributed(*this); +} +inline bool Tensor::is_floating_point() const { + return type().is_floating_point(*this); +} +inline bool Tensor::is_complex() const { + return type().is_complex(*this); +} +inline bool Tensor::is_nonzero() const { + return type().is_nonzero(*this); +} +inline bool Tensor::is_same_size(const Tensor & other) const { + return type().is_same_size(*this, other); +} +inline bool Tensor::is_signed() const { + return type().is_signed(*this); +} +inline bool Tensor::is_sparse() const { + return type().is_sparse(*this); +} +inline std::tuple Tensor::kthvalue(int64_t k, int64_t dim, bool keepdim) const { + return type().kthvalue(*this, k, dim, keepdim); +} +inline Tensor Tensor::log() const { + return type().log(*this); +} +inline Tensor & Tensor::log_() { + return type().log_(*this); +} +inline Tensor Tensor::log10() const { + return type().log10(*this); +} +inline Tensor & Tensor::log10_() { + return type().log10_(*this); +} +inline Tensor Tensor::log1p() const { + return type().log1p(*this); +} +inline Tensor & Tensor::log1p_() { + return type().log1p_(*this); +} +inline Tensor Tensor::log2() const { + return type().log2(*this); +} +inline Tensor & Tensor::log2_() { + return type().log2_(*this); +} +inline Tensor Tensor::logdet() const { + return type().logdet(*this); +} +inline Tensor Tensor::log_softmax(int64_t dim) const { + return type().log_softmax(*this, dim); +} +inline Tensor Tensor::logsumexp(int64_t dim, bool keepdim) const { + return type().logsumexp(*this, dim, keepdim); +} +inline Tensor Tensor::matmul(const Tensor & other) const { + return type().matmul(*this, other); +} +inline Tensor Tensor::matrix_power(int64_t n) const { + return type().matrix_power(*this, n); +} +inline std::tuple Tensor::max(int64_t dim, bool keepdim) const { + return type().max(*this, dim, keepdim); +} +inline Tensor Tensor::max_values(int64_t dim, bool keepdim) const { + return type().max_values(*this, dim, keepdim); +} +inline Tensor Tensor::mean(ScalarType dtype) const { + return type().mean(*this, dtype); +} +inline Tensor Tensor::mean() const { + return type().mean(*this); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().mean(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim) const { + return type().mean(*this, 
dim, keepdim); +} +inline Tensor Tensor::mean(int64_t dim, ScalarType dtype) const { + return type().mean(*this, dim, dtype); +} +inline std::tuple Tensor::median(int64_t dim, bool keepdim) const { + return type().median(*this, dim, keepdim); +} +inline std::tuple Tensor::min(int64_t dim, bool keepdim) const { + return type().min(*this, dim, keepdim); +} +inline Tensor Tensor::min_values(int64_t dim, bool keepdim) const { + return type().min_values(*this, dim, keepdim); +} +inline Tensor Tensor::mm(const Tensor & mat2) const { + return type().mm(*this, mat2); +} +inline std::tuple Tensor::mode(int64_t dim, bool keepdim) const { + return type().mode(*this, dim, keepdim); +} +inline Tensor Tensor::mul(const Tensor & other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(const Tensor & other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mul(Scalar other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(Scalar other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mv(const Tensor & vec) const { + return type().mv(*this, vec); +} +inline Tensor Tensor::mvlgamma(int64_t p) const { + return type().mvlgamma(*this, p); +} +inline Tensor & Tensor::mvlgamma_(int64_t p) { + return type().mvlgamma_(*this, p); +} +inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { + return type().narrow(*this, dim, start, length); +} +inline Tensor Tensor::permute(IntList dims) const { + return type().permute(*this, dims); +} +inline Tensor Tensor::pin_memory() const { + return type().pin_memory(*this); +} +inline Tensor Tensor::pinverse(double rcond) const { + return type().pinverse(*this, rcond); +} +inline Tensor Tensor::repeat(IntList repeats) const { + return type().repeat(*this, repeats); +} +inline Tensor Tensor::reshape(IntList shape) const { + return type().reshape(*this, shape); +} +inline Tensor Tensor::reshape_as(const Tensor & other) const { + return type().reshape_as(*this, other); +} +inline Tensor Tensor::round() const { + return type().round(*this); +} +inline Tensor & Tensor::round_() { + return type().round_(*this); +} +inline Tensor Tensor::relu() const { + return type().relu(*this); +} +inline Tensor & Tensor::relu_() { + return type().relu_(*this); +} +inline Tensor Tensor::hardshrink(Scalar lambd) const { + return type().hardshrink(*this, lambd); +} +inline Tensor Tensor::hardshrink_backward(const Tensor & grad_out, Scalar lambd) const { + return type().hardshrink_backward(grad_out, *this, lambd); +} +inline Tensor Tensor::rsqrt() const { + return type().rsqrt(*this); +} +inline Tensor & Tensor::rsqrt_() { + return type().rsqrt_(*this); +} +inline Tensor Tensor::select(int64_t dim, int64_t index) const { + return type().select(*this, dim, index); +} +inline Tensor Tensor::sigmoid() const { + return type().sigmoid(*this); +} +inline Tensor & Tensor::sigmoid_() { + return type().sigmoid_(*this); +} +inline Tensor Tensor::sin() const { + return type().sin(*this); +} +inline Tensor & Tensor::sin_() { + return type().sin_(*this); +} +inline Tensor Tensor::sinh() const { + return type().sinh(*this); +} +inline Tensor & Tensor::sinh_() { + return type().sinh_(*this); +} +inline Tensor Tensor::detach() const { + return type().detach(*this); +} +inline Tensor & Tensor::detach_() { + return type().detach_(*this); +} +inline int64_t Tensor::size(int64_t dim) const { + return type().size(*this, dim); +} +inline Tensor Tensor::slice(int64_t dim, int64_t start, int64_t end, int64_t step) const 
{ + return type().slice(*this, dim, start, end, step); +} +inline std::tuple Tensor::slogdet() const { + return type().slogdet(*this); +} +inline Tensor Tensor::smm(const Tensor & mat2) const { + return type().smm(*this, mat2); +} +inline Tensor Tensor::softmax(int64_t dim) const { + return type().softmax(*this, dim); +} +inline std::vector Tensor::split(int64_t split_size, int64_t dim) const { + return type().split(*this, split_size, dim); +} +inline std::vector Tensor::split_with_sizes(IntList split_sizes, int64_t dim) const { + return type().split_with_sizes(*this, split_sizes, dim); +} +inline Tensor Tensor::squeeze() const { + return type().squeeze(*this); +} +inline Tensor Tensor::squeeze(int64_t dim) const { + return type().squeeze(*this, dim); +} +inline Tensor & Tensor::squeeze_() { + return type().squeeze_(*this); +} +inline Tensor & Tensor::squeeze_(int64_t dim) { + return type().squeeze_(*this, dim); +} +inline Tensor Tensor::sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().sspaddmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor Tensor::stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const { + return type().stft(*this, n_fft, hop_length, win_length, window, normalized, onesided); +} +inline int64_t Tensor::stride(int64_t dim) const { + return type().stride(*this, dim); +} +inline Tensor Tensor::sum(ScalarType dtype) const { + return type().sum(*this, dtype); +} +inline Tensor Tensor::sum() const { + return type().sum(*this); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim, ScalarType dtype) const { + return type().sum(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim) const { + return type().sum(*this, dim, keepdim); +} +inline Tensor Tensor::sum(IntList dim, ScalarType dtype) const { + return type().sum(*this, dim, dtype); +} +inline Tensor Tensor::sqrt() const { + return type().sqrt(*this); +} +inline Tensor & Tensor::sqrt_() { + return type().sqrt_(*this); +} +inline Tensor Tensor::std(bool unbiased) const { + return type().std(*this, unbiased); +} +inline Tensor Tensor::std(int64_t dim, bool unbiased, bool keepdim) const { + return type().std(*this, dim, unbiased, keepdim); +} +inline Tensor Tensor::prod(ScalarType dtype) const { + return type().prod(*this, dtype); +} +inline Tensor Tensor::prod() const { + return type().prod(*this); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().prod(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim) const { + return type().prod(*this, dim, keepdim); +} +inline Tensor Tensor::prod(int64_t dim, ScalarType dtype) const { + return type().prod(*this, dim, dtype); +} +inline Tensor Tensor::t() const { + return type().t(*this); +} +inline Tensor & Tensor::t_() { + return type().t_(*this); +} +inline Tensor Tensor::tan() const { + return type().tan(*this); +} +inline Tensor & Tensor::tan_() { + return type().tan_(*this); +} +inline Tensor Tensor::tanh() const { + return type().tanh(*this); +} +inline Tensor & Tensor::tanh_() { + return type().tanh_(*this); +} +inline Tensor Tensor::transpose(int64_t dim0, int64_t dim1) const { + return type().transpose(*this, dim0, dim1); +} +inline Tensor & Tensor::transpose_(int64_t dim0, int64_t dim1) { + return type().transpose_(*this, dim0, dim1); +} +inline Tensor Tensor::flip(IntList dims) const { + return type().flip(*this, dims); +} +inline Tensor 
Tensor::rot90(int64_t k, IntList dims) const { + return type().rot90(*this, k, dims); +} +inline Tensor Tensor::trunc() const { + return type().trunc(*this); +} +inline Tensor & Tensor::trunc_() { + return type().trunc_(*this); +} +inline Tensor Tensor::type_as(const Tensor & other) const { + return type().type_as(*this, other); +} +inline Tensor Tensor::unsqueeze(int64_t dim) const { + return type().unsqueeze(*this, dim); +} +inline Tensor & Tensor::unsqueeze_(int64_t dim) { + return type().unsqueeze_(*this, dim); +} +inline Tensor Tensor::var(bool unbiased) const { + return type().var(*this, unbiased); +} +inline Tensor Tensor::var(int64_t dim, bool unbiased, bool keepdim) const { + return type().var(*this, dim, unbiased, keepdim); +} +inline Tensor Tensor::view_as(const Tensor & other) const { + return type().view_as(*this, other); +} +inline Tensor Tensor::where(const Tensor & condition, const Tensor & other) const { + return type().where(condition, *this, other); +} +inline Tensor Tensor::norm(Scalar p) const { + return type().norm(*this, p); +} +inline Tensor Tensor::norm(Scalar p, int64_t dim, bool keepdim) const { + return type().norm(*this, p, dim, keepdim); +} +inline Tensor Tensor::clone() const { + return type().clone(*this); +} +inline Tensor & Tensor::resize_as_(const Tensor & the_template) { + return type().resize_as_(*this, the_template); +} +inline Tensor Tensor::pow(Scalar exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::zero_() { + return type().zero_(*this); +} +inline Tensor Tensor::sub(const Tensor & other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(const Tensor & other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::sub(Scalar other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(Scalar other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().addmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor & Tensor::addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return type().addmm_(*this, mat1, mat2, beta, alpha); +} +inline Tensor & Tensor::sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_(*this, size, sparseDims, denseDims); +} +inline Tensor & Tensor::sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_and_clear_(*this, size, sparseDims, denseDims); +} +inline Tensor Tensor::sparse_mask(SparseTensorRef mask) const { + return type().sparse_mask(*this, mask); +} +inline Tensor Tensor::to_dense() const { + return type().to_dense(*this); +} +inline int64_t Tensor::_sparseDims() const { + return type()._sparseDims(*this); +} +inline int64_t Tensor::_denseDims() const { + return type()._denseDims(*this); +} +inline int64_t Tensor::_nnz() const { + return type()._nnz(*this); +} +inline Tensor Tensor::coalesce() const { + return type().coalesce(*this); +} +inline bool Tensor::is_coalesced() const { + return type().is_coalesced(*this); +} +inline Tensor Tensor::_indices() const { + return type()._indices(*this); +} +inline Tensor Tensor::_values() const { + return type()._values(*this); +} +inline int64_t Tensor::numel() const { + return type().numel(*this); +} +inline std::vector Tensor::unbind(int64_t dim) const { + return 
type().unbind(*this, dim); +} +inline int64_t Tensor::get_device() const { + return type().get_device(*this); +} +inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) const { + return type().to(*this, device, dtype, non_blocking); +} +inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { + return type().to(*this, dtype, non_blocking); +} +inline Tensor Tensor::to(Device device, bool non_blocking) const { + return type().to(*this, device, non_blocking); +} +inline Tensor Tensor::to(const Tensor & other, bool non_blocking) const { + return type().to(*this, other, non_blocking); +} +inline Scalar Tensor::_local_scalar() const { + return type()._local_scalar(*this); +} + +#define DEFINE_CAST(T, name, _) \ + template <> \ + inline T* Tensor::data() const { \ + AT_CHECK( \ + type().scalarType() == ScalarType::name, \ + "expected scalar type ", \ + #name, \ + " but found ", \ + at::toString(type().scalarType())); \ + return static_cast(this->data_ptr()); \ + } \ + inline T* Tensor::to##name##Data() const { \ + return data(); \ + } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) +#undef DEFINE_CAST + +#define DEFINE_TO_C_TYPE(T,name,_) \ +inline T Tensor::toC##name () const { return _local_scalar().to##name (); } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) +#undef DEFINE_TO_C_TYPE + +} //namespace at diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h new file mode 100644 index 00000000000000..6284e7634bb755 --- /dev/null +++ b/aten/src/ATen/core/Type.h @@ -0,0 +1,647 @@ +#pragma once + +#include "ATen/core/ATenGeneral.h" +#include "ATen/core/Allocator.h" +#include "ATen/core/Deprecated.h" +#include "ATen/core/Generator.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Tensor.h" +#include "ATen/core/ArrayRef.h" +#include "ATen/core/Half.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/Reduction.h" +#include "ATen/core/TensorOptions.h" + +#include +#include +#include +#include +#include + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Allocator; +struct Generator; +struct Storage; + +static inline void noop_deleter(void*) {} + +enum class TypeID { + CPUByte, + CPUChar, + CPUDouble, + CPUFloat, + CPUInt, + CPULong, + CPUShort, + CPUHalf, + SparseCPUByte, + SparseCPUChar, + SparseCPUDouble, + SparseCPUFloat, + SparseCPUInt, + SparseCPULong, + SparseCPUShort, + CUDAByte, + CUDAChar, + CUDADouble, + CUDAFloat, + CUDAInt, + CUDALong, + CUDAShort, + CUDAHalf, + SparseCUDAByte, + SparseCUDAChar, + SparseCUDADouble, + SparseCUDAFloat, + SparseCUDAInt, + SparseCUDALong, + SparseCUDAShort, + CPUComplexFloat, + CPUComplexDouble, + CUDAComplexFloat, + CUDAComplexDouble, + Undefined, + NumOptions +}; + +struct AT_API Type { + explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} + + virtual ~Type() {} + virtual ScalarType scalarType() const = 0; + virtual Backend backend() const = 0; + Layout layout() const noexcept { return layout_from_backend(backend()); } + virtual bool is_cuda() const = 0; + virtual bool is_sparse() const = 0; + virtual bool is_distributed() const = 0; + bool is_variable() const noexcept { return is_variable_; } + bool is_undefined() const 
noexcept { return is_undefined_; } + virtual Allocator * allocator() const = 0; + virtual Device getDeviceFromPtr(void * data) const = 0; + virtual Storage storage(bool resizable = false) const = 0; + virtual Storage storage(size_t size, bool resizable = false) const = 0; + virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; + virtual Storage storageWithAllocator(int64_t size, Allocator* allocator) const = 0; + virtual std::unique_ptr generator() const = 0; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0; + virtual Storage unsafeStorageFromTH(void * th_pointer, bool retain) const = 0; + virtual const char * toString() const = 0; + virtual size_t elementSizeInBytes() const = 0; + virtual Type & toBackend(Backend b) const = 0; + virtual Type & toScalarType(ScalarType s) const = 0; + Type & toSparse() const { + return this->toBackend(at::toSparse(this->backend())); + } + Type & toDense() const { + return this->toBackend(at::toDense(this->backend())); + } + Type & cpu() const { + return this->toBackend(at::backendToCPU(this->backend())); + } + Type & cuda() const { + return this->toBackend(at::backendToCUDA(this->backend())); + } + // contiguous IDs for all types in the system + // for external dispatch + virtual TypeID ID() const = 0; + + // New-style TensorTypeId that supports open registration. + TensorTypeId type_id() const { return type_id_; } + + // NB: This will return DeviceType::CPU for Backend::SparseCPU + DeviceType device_type() const { + return backendToDeviceType(backend()); + } + + virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const = 0; + virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0; + virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; + virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; + + virtual void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const = 0; + virtual void set_data(Tensor & self, Tensor new_data) const = 0; + + virtual Tensor tensorFromBlob(void * data, IntList sizes, const std::function & deleter=noop_deleter) const = 0; + virtual Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter=noop_deleter) const = 0; + virtual Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const = 0; + virtual Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const = 0; + virtual Tensor scalarTensor(Scalar s) const = 0; + + bool operator==(const Type& other) const { + return this == &other; + } + bool operator!=(const Type& other) const { + return this != &other; + } + + /// Constructs the `TensorOptions` from a type and a `device_index`. 
+ TensorOptions options(int32_t device_index = -1) const { + TensorOptions r; + r.dtype(scalarType()); + r.device({backendToDeviceType(backend()), device_index}); + r.layout(layout()); + r.is_variable(is_variable()); + return r; + } + + operator TensorOptions() const { + return options(); + } + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) = 0; + virtual int64_t storage_offset(const Tensor & self) const = 0; + virtual Tensor & resize_(Tensor & self, IntList size) const = 0; + virtual Tensor & set_(Tensor & self, Storage source) const = 0; + virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride={}) const = 0; + virtual Tensor & set_(Tensor & self, const Tensor & source) const = 0; + virtual Tensor & set_(Tensor & self) const = 0; + virtual bool is_contiguous(const Tensor & self) const = 0; + virtual bool is_set_to(const Tensor & self, const Tensor & tensor) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & s_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor s_masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor nonzero(const Tensor & self) const = 0; + virtual Tensor contiguous(const Tensor & self) const = 0; + virtual Tensor view(const Tensor & self, IntList size) const = 0; + virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual Tensor take(const Tensor & self, const Tensor & index) const = 0; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate=false) const = 0; + virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const = 0; + virtual Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & scatter_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual void* data_ptr(const Tensor & self) const = 0; + virtual bool equal(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __iand__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___iand__(Tensor & self, const Tensor & other) 
const = 0; + virtual Tensor & __iand__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ge_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ge_(Tensor & self, const Tensor & other) const = 0; + 
virtual Tensor & ge_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self) const = 0; + virtual Tensor s_max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self) const = 0; + virtual Tensor median(const Tensor & self) const = 0; + virtual std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false) const = 0; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const = 0; + virtual Tensor all(const Tensor & self) const = 0; + virtual Tensor any(const Tensor & self) const = 0; + virtual Tensor lgamma(const Tensor & self) const = 0; + virtual Tensor & lgamma_(Tensor & self) const = 0; + virtual Tensor digamma(const Tensor & self) const = 0; + virtual Tensor & digamma_(Tensor & self) const = 0; + virtual Tensor polygamma(int64_t n, const Tensor & self) const = 0; + virtual Tensor & polygamma_(Tensor & self, int64_t n) const = 0; + virtual Tensor & erfinv_(Tensor & self) const = 0; + virtual Tensor erfinv(const Tensor & self) const = 0; + virtual Tensor & frac_(Tensor & self) const = 0; + virtual Tensor frac(const Tensor & self) const = 0; + virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; + virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; + virtual Tensor reciprocal(const Tensor & self) const = 0; + virtual Tensor & reciprocal_(Tensor & self) const = 0; + virtual Tensor neg(const Tensor & self) const = 0; + virtual Tensor & neg_(Tensor & self) const = 0; + virtual Tensor s_atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & s_atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(Scalar base, const Tensor & self) const = 0; + virtual Tensor & pow_(Tensor & self, Scalar exponent) const = 0; + virtual Tensor & s_pow_(Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor & 
pow_(Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor s_lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const = 0; + virtual Tensor sign(const Tensor & self) const = 0; + virtual Tensor & sign_(Tensor & self) const = 0; + virtual Tensor trace(const Tensor & self) const = 0; + virtual Tensor fmod(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor tril(const Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor & tril_(Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor triu(const Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor & triu_(Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1) const = 0; + virtual Tensor diag(const Tensor & self, int64_t diagonal=0) const = 0; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual std::tuple gels(const Tensor & self, const Tensor & A) const = 0; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool 
transpose=false, bool unitriangular=false) const = 0; + virtual std::tuple symeig(const Tensor & self, bool eigenvectors=false, bool upper=true) const = 0; + virtual std::tuple eig(const Tensor & self, bool eigenvectors=false) const = 0; + virtual std::tuple svd(const Tensor & self, bool some=true) const = 0; + virtual Tensor potrf(const Tensor & self, bool upper=true) const = 0; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true) const = 0; + virtual Tensor potri(const Tensor & self, bool upper=true) const = 0; + virtual std::tuple pstrf(const Tensor & self, bool upper=true, Scalar tol=-1) const = 0; + virtual std::tuple qr(const Tensor & self) const = 0; + virtual std::tuple geqrf(const Tensor & self) const = 0; + virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const = 0; + virtual std::tuple btrifact(const Tensor & self, bool pivot=true) const = 0; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot=true) const = 0; + virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const = 0; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator=nullptr) const = 0; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator=nullptr) const = 0; + virtual Tensor & random_(Tensor & self, Generator * generator=nullptr) const = 0; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const = 0; + virtual Tensor & uniform_(Tensor & self, double from=0, double to=1, Generator * generator=nullptr) const = 0; + virtual Tensor & normal_(Tensor & self, double mean=0, double std=1, Generator * generator=nullptr) const = 0; + virtual Tensor & cauchy_(Tensor & self, double median=0, double sigma=1, Generator * generator=nullptr) const = 0; + virtual Tensor & log_normal_(Tensor & self, double mean=1, double std=2, Generator * generator=nullptr) const = 0; + virtual Tensor & exponential_(Tensor & self, double lambd=1, Generator * generator=nullptr) const = 0; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride={}) const = 0; + virtual Tensor tensor(IntList size, IntList stride) const = 0; + virtual Tensor abs(const Tensor & self) const = 0; + virtual Tensor & abs_(Tensor & self) const = 0; + virtual Tensor acos(const Tensor & self) const = 0; + virtual Tensor & acos_(Tensor & self) const = 0; + virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) 
const = 0; + virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; + virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step=1) const = 0); + AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); + virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmax(const Tensor & self) const = 0; + virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmin(const Tensor & self) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor asin(const Tensor & self) const = 0; + virtual Tensor & asin_(Tensor & self) const = 0; + virtual Tensor atan(const Tensor & self) const = 0; + virtual Tensor & atan_(Tensor & self) const = 0; + virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor bernoulli(const Tensor & self) const = 0; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; + virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor & bernoulli_(Tensor & self) const = 0; + virtual Tensor bincount(const Tensor & self, const Tensor & weights={}, int64_t minlength=0) const = 0; + virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor ceil(const Tensor & self) const = 0; + virtual Tensor & ceil_(Tensor & self) const = 0; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0) const = 0; + virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor clamp_max(const Tensor & self, Scalar max) const = 0; + virtual Tensor & clamp_max_(Tensor & self, Scalar max) const = 0; + virtual Tensor clamp_min(const Tensor & self, Scalar min) const = 0; + virtual Tensor & clamp_min_(Tensor & self, Scalar min) const = 0; + virtual Tensor cos(const Tensor & self) const = 0; + virtual Tensor & cos_(Tensor & self) const = 0; + virtual Tensor cosh(const Tensor & self) const = 0; + virtual Tensor & cosh_(Tensor & self) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim) const = 0; + virtual Tensor det(const Tensor & self) const = 0; + 
virtual Tensor diagflat(const Tensor & self, int64_t offset=0) const = 0; + virtual Tensor diagonal(const Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const = 0; + virtual Tensor div(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & div_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor div(const Tensor & self, Scalar other) const = 0; + virtual Tensor & div_(Tensor & self, Scalar other) const = 0; + virtual Tensor dot(const Tensor & self, const Tensor & tensor) const = 0; + AT_DEPRECATED(virtual Tensor empty(IntList size) const = 0); + virtual Tensor erf(const Tensor & self) const = 0; + virtual Tensor & erf_(Tensor & self) const = 0; + virtual Tensor erfc(const Tensor & self) const = 0; + virtual Tensor & erfc_(Tensor & self) const = 0; + virtual Tensor exp(const Tensor & self) const = 0; + virtual Tensor & exp_(Tensor & self) const = 0; + virtual Tensor expm1(const Tensor & self) const = 0; + virtual Tensor & expm1_(Tensor & self) const = 0; + virtual Tensor expand(const Tensor & self, IntList size, bool implicit=false) const = 0; + virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m=-1) const = 0); + virtual Tensor flatten(const Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) const = 0; + virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; + virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; + virtual Tensor floor(const Tensor & self) const = 0; + virtual Tensor & floor_(Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); + virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; + virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; + virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; + virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; + virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true) const = 0; + virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const = 0; + virtual Tensor index(const Tensor & self, TensorList indices) const = 0; + virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor inverse(const Tensor & self) const = 0; + virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; + virtual bool is_cuda(const Tensor & self) const = 0; + virtual bool is_distributed(const Tensor & self) const = 0; + virtual bool is_floating_point(const Tensor & self) const = 0; + virtual bool is_complex(const Tensor & self) const = 0; + virtual bool is_nonzero(const Tensor & self) const = 0; + virtual bool is_same_size(const Tensor & self, const Tensor & other) const = 0; + virtual bool is_signed(const Tensor & self) const = 0; + virtual bool is_sparse(const Tensor & self) const = 0; + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const = 0; + AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, 
int64_t steps=100) const = 0); + virtual Tensor log(const Tensor & self) const = 0; + virtual Tensor & log_(Tensor & self) const = 0; + virtual Tensor log10(const Tensor & self) const = 0; + virtual Tensor & log10_(Tensor & self) const = 0; + virtual Tensor log1p(const Tensor & self) const = 0; + virtual Tensor & log1p_(Tensor & self) const = 0; + virtual Tensor log2(const Tensor & self) const = 0; + virtual Tensor & log2_(Tensor & self) const = 0; + virtual Tensor logdet(const Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps=100) const = 0); + virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; + virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor matrix_power(const Tensor & self, int64_t n) const = 0; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mean(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mm(const Tensor & self, const Tensor & mat2) const = 0; + virtual std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false) const = 0; + virtual Tensor mul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & mul_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor mul(const Tensor & self, Scalar other) const = 0; + virtual Tensor & mul_(Tensor & self, Scalar other) const = 0; + virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; + virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; + virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; + AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); + virtual Tensor permute(const Tensor & self, IntList dims) const = 0; + virtual Tensor pin_memory(const Tensor & self) const = 0; + virtual Tensor pinverse(const Tensor & self, double rcond=1e-15) const = 0; + AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator=nullptr) const = 0); + AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step=1) const = 0); + virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0; + virtual Tensor reshape(const Tensor & self, IntList shape) const = 0; + 
virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor round(const Tensor & self) const = 0; + virtual Tensor & round_(Tensor & self) const = 0; + virtual Tensor relu(const Tensor & self) const = 0; + virtual Tensor & relu_(Tensor & self) const = 0; + virtual Tensor hardshrink(const Tensor & self, Scalar lambd=0.5) const = 0; + virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0; + virtual Tensor rsqrt(const Tensor & self) const = 0; + virtual Tensor & rsqrt_(Tensor & self) const = 0; + virtual Tensor select(const Tensor & self, int64_t dim, int64_t index) const = 0; + virtual Tensor sigmoid(const Tensor & self) const = 0; + virtual Tensor & sigmoid_(Tensor & self) const = 0; + virtual Tensor sin(const Tensor & self) const = 0; + virtual Tensor & sin_(Tensor & self) const = 0; + virtual Tensor sinh(const Tensor & self) const = 0; + virtual Tensor & sinh_(Tensor & self) const = 0; + virtual Tensor detach(const Tensor & self) const = 0; + virtual Tensor & detach_(Tensor & self) const = 0; + virtual int64_t size(const Tensor & self, int64_t dim) const = 0; + virtual Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const = 0; + virtual std::tuple slogdet(const Tensor & self) const = 0; + virtual Tensor smm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor softmax(const Tensor & self, int64_t dim) const = 0; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0) const = 0; + virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim=0) const = 0; + virtual Tensor squeeze(const Tensor & self) const = 0; + virtual Tensor squeeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & squeeze_(Tensor & self) const = 0; + virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const = 0; + virtual int64_t stride(const Tensor & self, int64_t dim) const = 0; + virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim=false) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0; + virtual Tensor sqrt(const Tensor & self) const = 0; + virtual Tensor & sqrt_(Tensor & self) const = 0; + virtual Tensor std(const Tensor & self, bool unbiased=true) const = 0; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor t(const Tensor & self) const = 0; + virtual Tensor & t_(Tensor & self) const = 0; + virtual Tensor tan(const Tensor & self) const 
= 0; + virtual Tensor & tan_(Tensor & self) const = 0; + virtual Tensor tanh(const Tensor & self) const = 0; + virtual Tensor & tanh_(Tensor & self) const = 0; + virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor flip(const Tensor & self, IntList dims) const = 0; + virtual Tensor rot90(const Tensor & self, int64_t k=1, IntList dims={0,1}) const = 0; + virtual Tensor trunc(const Tensor & self) const = 0; + virtual Tensor & trunc_(Tensor & self) const = 0; + virtual Tensor type_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & unsqueeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor var(const Tensor & self, bool unbiased=true) const = 0; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0); + virtual Tensor norm(const Tensor & self, Scalar p=2) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor clone(const Tensor & self) const = 0; + virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0; + virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0; + virtual Tensor & zero_(Tensor & self) const = 0; + virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; + virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor tensor() const = 0; + virtual Tensor tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor sparse_coo_tensor(IntList size) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor _native_sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; + virtual Tensor to_dense(const Tensor & self) const = 0; + virtual int64_t _sparseDims(const Tensor & self) const = 0; + virtual int64_t _denseDims(const Tensor & self) const = 0; + virtual int64_t _nnz(const Tensor & self) 
const = 0; + virtual Tensor coalesce(const Tensor & self) const = 0; + virtual bool is_coalesced(const Tensor & self) const = 0; + virtual Tensor _indices(const Tensor & self) const = 0; + virtual Tensor _values(const Tensor & self) const = 0; + virtual int64_t numel(const Tensor & self) const = 0; + virtual std::vector unbind(const Tensor & self, int64_t dim=0) const = 0; + virtual int64_t get_device(const Tensor & self) const = 0; + virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking=false) const = 0; + virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking=false) const = 0; + virtual Tensor to(const Tensor & self, Device device, bool non_blocking=false) const = 0; + virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking=false) const = 0; + virtual Scalar _local_scalar(const Tensor & self) const = 0; +protected: + TensorTypeId type_id_; + bool is_variable_; + bool is_undefined_; + +}; + +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); +} + +} // namespace at diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 5497a752dc4fb4..f0827077827a25 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -1,5 +1,6 @@ import argparse import os +import filecmp import yaml from collections import OrderedDict @@ -40,19 +41,23 @@ parser.add_argument( '-d', '--install_dir', help='output directory', default='ATen') options = parser.parse_args() +core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None if options.install_dir is not None and not os.path.exists(options.install_dir): os.makedirs(options.install_dir) +if core_install_dir is not None and not os.path.exists(core_install_dir): + os.makedirs(core_install_dir) class FileManager(object): - def __init__(self): + def __init__(self, prefix=None): + self.install_dir = '{}/{}'.format(options.install_dir, prefix) if prefix else options.install_dir self.filenames = set() self.outputs_written = False self.undeclared_files = [] def will_write(self, filename): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if self.outputs_written: raise Exception("'will_write' can only be called before " + "the call to write_outputs, refactor so outputs are registered " + @@ -78,7 +83,7 @@ def write_outputs(self, filename): self.outputs_written = True def write(self, filename, s, env=None): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if isinstance(s, CodeTemplate): assert env is not None env['generated_comment'] = "@" + "generated by aten/src/ATen/gen.py" @@ -128,6 +133,7 @@ def check_all_files_written(self): context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) +core_file_manager = FileManager('core_tmp') file_manager = FileManager() cuda_file_manager = FileManager() @@ -332,9 +338,11 @@ def iterate_types(): # so that the script runs quickly when we are just querying the # outputs def declare_outputs(): - files = ['Declarations.yaml', 'Type.h', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', - 
'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h', + core_files = ['Type.h', 'Tensor.h', 'TensorMethods.h'] + for f in core_files: + core_file_manager.will_write(f) + files = ['Declarations.yaml', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', + 'Functions.h', 'CPUCopy.cpp', 'NativeFunctions.h', 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: file_manager.will_write(f) @@ -401,7 +409,15 @@ def generate_outputs(): all_types.append(generate_storage_type_and_tensor( backend, density, scalar_type, declarations)) - file_manager.write('Type.h', TYPE_H, top_env) + core_files = { + 'Type.h': TYPE_H, + 'Tensor.h': TENSOR_H, + 'TensorMethods.h': TENSOR_METHODS_H + } + + for core_file, core_template_file in core_files.items(): + core_file_manager.write(core_file, core_template_file, top_env) + file_manager.write('TypeExtendedInterface.h', TYPE_EXTENDED_INTERFACE_H, top_env) file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) @@ -412,8 +428,6 @@ def generate_outputs(): cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) - file_manager.write('Tensor.h', TENSOR_H, top_env) - file_manager.write('TensorMethods.h', TENSOR_METHODS_H, top_env) file_manager.write('Functions.h', FUNCTIONS_H, top_env) file_manager.write('CPUCopy.cpp', copy_wrapper.create(all_types, 'CPU')) @@ -423,10 +437,25 @@ def generate_outputs(): file_manager.check_all_files_written() cuda_file_manager.check_all_files_written() + # check that generated files match source files + core_source_path = os.path.join(options.source_path, 'core') + match, mismatch, errors = filecmp.cmpfiles(core_install_dir, core_source_path, core_files.keys(), shallow=False) + if errors: + raise RuntimeError("Error while trying to compare source and generated files for {}. " + "Source directory: {}. Generated directory: {}." + .format(errors, core_source_path, core_install_dir)) + if mismatch: + file_component = '{}'.format(','.join(mismatch)) + if len(mismatch) > 1: + file_component = '{' + file_component + '}' + update_cmd = "cp {}/{} {}".format(core_install_dir, file_component, core_source_path) + raise RuntimeError("Source files: {} did not match generated files. 
To update the source files, " + "run \"{}\"".format(mismatch, update_cmd)) declare_outputs() if options.output_dependencies is not None: file_manager.write_outputs(options.output_dependencies) + core_file_manager.write_outputs(options.output_dependencies + "-core") cuda_file_manager.write_outputs(options.output_dependencies + "-cuda") else: generate_outputs() diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 82a2f00ff77bc7..c6355127734b1b 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index aab8acd86a6ba5..50f6bb83bb7edb 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/core/Scalar.h" diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index ec86db721b9f04..54fd80c5f13463 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -1,11 +1,9 @@ #pragma once -// ${generated_comment} - -#include "ATen/Tensor.h" +#include "ATen/core/Tensor.h" #include "ATen/core/Scalar.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Type.h" +#include "ATen/core/Type.h" #include "ATen/core/TensorOptions.h" namespace at { diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 183b2d4d498026..18fb393aeca759 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/ATenGeneral.h" #include "ATen/core/Allocator.h" #include "ATen/core/Deprecated.h" @@ -10,7 +8,7 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Tensor.h" +#include "ATen/core/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index ff838c58889e45..7d9a18eda18489 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -167,12 +167,20 @@ if (NOT BUILD_ATEN_MOBILE) file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*") + # these are files that are generated by the script and checked in -- the script checks + # that they are equivalent so it must be a dependency of the script + set(core_gen_checked_inputs + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Type.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Tensor.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorMethods.h) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core_tmp) add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} COMMAND ${GEN_COMMAND} --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen - DEPENDS ${all_python} ${all_templates} ${cwrap_files}) + DEPENDS ${all_python} ${all_templates} ${cwrap_files} ${core_gen_checked_inputs}) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. 
We make the libATen.so depend explicitly From a3036b3bb3b559c7f712b31d5f79b366e58cc003 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Wed, 12 Sep 2018 13:46:47 -0700 Subject: [PATCH 086/237] Fused weightnorm for ATen (#10842) Summary: This PR contains a C++ implementation of weight norm. The user-side exposure of weight norm through torch.nn.utils.weight_norm is unchanged. If running on the GPU, and the norm is requested over the first or last dimension of the weight tensor, the forward pass is carried out using the fused kernels I wrote for our Fairseq GTC hero run, which offer superior performance to primitive ops and superior numerical stability when running in FP16. In the common case that the backward pass is not itself constructing a graph (ie not attempting to set up double backward) the backward pass will be carried out using another fused kernel. If the backward pass is constructing a graph, an alternate code path is taken, which does the math using differentiable primitive ops. In this way, the implementation allows double backward, even if the fused kernel was used in forward (although in this case, you don't benefit from the performance and stability of the fused backward kernel). If running on the CPU, or if norming over an interior dim, the forward pass is carried out using double-differentiable primitive ops. Figuring out how to generate all the right plumbing for this was tricky, but it was a fun experience learning how the autogenerator works and how the graph is constructed. Thanks to colesbury for useful guidance on this front. I do have a few lingering questions: - Should I unify my return statements (ie by default-constructing Tensors outside if blocks and using operator= within)? - What is the significance of `non_blocking` when calling e.g. `auto norms = saved_norms.to(saved_g.type().scalarType(), non_blocking=True/False);`? I am currently omitting `non_blocking`, so it defaults to False, but I didn't see any associated synchronizes on the timeline, so I'm wondering what it means. - Is there an "official" mapping from at::ScalarTypes to corresponding accumulate types, as there are for the PODs + Half in [AccumulateType.h](https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/AccumulateType.h)? I looked for an equivalent mapping for ScalarTypes, didn't find one, and ended up rigging it myself (` at::ScalarType AccType = g.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float : g.type().scalarType();`). - Are sparse tensors a concern? Should I include another check for sparse tensors in the `_weight_norm` entry point, and send those along the fallback CPU path as well? 
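For reference (not part of this diff), the unchanged user-side API can be exercised as in the minimal sketch below; whether the fused CUDA kernels or the differentiable primitive-op fallback is used is decided internally from the device and the requested `dim`:

```python
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

# Reparameterizes `weight` into `weight_g` (magnitude) and `weight_v` (direction).
# On a CUDA tensor with dim=0 (or the last dim), forward should take the fused path.
layer = weight_norm(nn.Linear(20, 40), name='weight', dim=0)
if torch.cuda.is_available():
    layer = layer.cuda()

x = torch.randn(8, 20, device=layer.weight_v.device)
out = layer(x)
# A plain backward (create_graph=False) should use the fused backward kernel on GPU;
# backward with create_graph=True falls back to the differentiable primitive ops.
out.sum().backward()
```
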
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10842 Differential Revision: D9735531 Pulled By: ezyang fbshipit-source-id: 24431d46532cf5503876b3bd450d5ca775b3eaee --- aten/src/ATen/native/WeightNorm.cpp | 117 +++++ aten/src/ATen/native/cuda/WeightNorm.cu | 502 +++++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 21 + tools/autograd/derivatives.yaml | 6 + torch/nn/utils/weight_norm.py | 19 +- torch/onnx/symbolic.py | 5 + 6 files changed, 654 insertions(+), 16 deletions(-) create mode 100644 aten/src/ATen/native/WeightNorm.cpp create mode 100644 aten/src/ATen/native/cuda/WeightNorm.cu diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp new file mode 100644 index 00000000000000..1627b4c2596e12 --- /dev/null +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -0,0 +1,117 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace at { +namespace native { + +// Staying faithful to the Python for now for clarity, look for optimizations later +// (e.g., single return statement for RVO) +Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) +{ + // I assume tensor.contiguous(), view(), norm(), etc. here will dispatch through VariableType. + if (dim == -1) { + return v.norm(pow); + } else if (dim == 0) { + std::vector output_size(v.dim(), 1); + output_size[0] = v.size(0); + return v.contiguous().view({v.size(0), -1}).norm(pow, 1).view(output_size); + } else if (dim == v.dim() - 1) { + std::vector output_size(v.dim(), 1); + output_size[v.dim() - 1] = v.size(v.dim() - 1); + return v.contiguous().view({-1, v.size(v.dim() - 1)}).norm(pow, 0).view(output_size); + } else { + // To consider: at::native::norm_except_dim is probably fine as well, + // and would avoid an additional dynamic dispatch. + return at::norm_except_dim(v.transpose(0, dim), pow, 0).transpose(0, dim); // optimize? + } +} + +Tensor _weight_norm + (const Tensor & v_in, + const Tensor & g_in, + int64_t dim) +{ + + AT_CHECK( + v_in.device() == g_in.device(), + "weight_norm: expected v_in and g_in to be on the same device, but v_in is " + "on ", v_in.device(), " and g_in is on ", g_in.device()); + + auto v = v_in.contiguous(); + auto g = g_in.contiguous(); + + bool can_use_fused = v.type().is_cuda() && (dim == 0 || dim == v.dim() - 1); + + if (can_use_fused) { + // weight_norm does not have a derivative defined for it, so this will route back through + // VariableType.cpp, and construct a WeightNormFusedBackward object in the autograd graph. + return std::get<0>(at::_weight_norm_cuda_interface(v, g, dim)); + } else { + // Double-differentiable primitive ops + // at::native::norm_except_dim would probably be fine as well. + return v*(g/at::norm_except_dim(v, 2, dim)); + } +} + +// Differentiable backward path, an alternative to weight_norm_cuda_backward, to be used +// when backward is itself creating a graph. +// The GradMode::is_enabled() check must be performed within Functions.cpp; that's why we +// define a separate function here, instead of inlining it in weight_norm_cuda_backward. +std::tuple _weight_norm_differentiable_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // In Functions.cpp, the HardshrinkBackward object supplies "grad.contiguous()" + // as the first argument, so grad_w should be contiguous here. 
+ // All these checks should succeed: + AT_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + + int64_t last_dim = saved_v.dim() - 1; + int64_t last_size = saved_v.size(last_dim); + + // Like weight_norm_fused_backward, weight_norm_differentiable_backward should only ever be called + // through a WeightNormFusedBackward object, so we expect that dim == 0 || dim == saved_v.size(-1) + AT_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); + + // saved_g and saved_norms are already shaped to broadcast over the correct dimensions + + // ...but saved_norms might be Float when saved_g and saved_v are half. + // To consider: saved_norms.to(..., True /*non_blocking*/); + auto norms = saved_norms.to(saved_g.type().scalarType()); + + std::vector bcast_size(saved_v.dim(), 1); + + // Analytic backward path using differentiable primitive ops + if (dim == 0) { + bcast_size[0] = saved_v.size(0); + auto per_dim_sums = (grad_w*saved_v).view({saved_v.size(0), -1}).sum(1).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } else { // dim == last_dim + bcast_size[last_dim] = last_size; + auto per_dim_sums = (grad_w*saved_v).view({-1, last_size}).sum(0).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu new file mode 100644 index 00000000000000..67d8f39e2de71d --- /dev/null +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -0,0 +1,502 @@ +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/TensorUtils.h" +#include "ATen/core/Error.h" + +#include "ATen/cuda/CUDAContext.h" +#include +#include + +namespace at { +namespace native { +namespace { + +// Block size for weight_norm_*_first_dim_kernel. +// Currently, kernels are non-persistent. +// Dialing up the block size to, say 1024, can improve performance by +// increase the amount of cache available per block, which can improve cache hit rate. +// However, this is less efficient for short rows. 256 is pretty versatile. +// May be worth implementing heuristics later. +#define BLOCK 256 + +// Block size for weight_norm_*_last_dim_kernel. +// This is tricker than the first_dim case because we must make blocks +// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses. +// Since output-element parallelism is along the fast dimension, this reduces the number of +// blocks we can launch by 16X. +#define TILE_W 16 +// Somewhat versatile strategy: max out intra-block parallelism by extending +// blocks across the slow dimension up to the hardware-max block size of 1024. +#define TILE_H 64 + +template +__device__ __forceinline__ void reduce_block_into_lanes + (T *x, + T val, + int lanes, // lanes is intended to be <= 32. + ReduceOp reduceOp) +{ + int tid = threadIdx.x + threadIdx.y*blockDim.x; + int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32. 
+ + if(blockSize >= 64) + { + x[tid] = val; + __syncthreads(); + } + + #pragma unroll + for(int i = (blockSize >> 1); i >= 64; i >>= 1) + { + if(tid < i) + x[tid] = reduceOp(x[tid], x[tid+i]); + __syncthreads(); + } + + if(tid < 32) + { + T final; + if(blockSize >= 64) + final = reduceOp(x[tid], x[tid+32]); + else + final = val; + // __SYNCWARP(); + + #pragma unroll + for(int i = 16; i >= lanes; i >>= 1) + final = reduceOp(final, WARP_SHFL_DOWN(final, i)); + + if(tid < lanes) + x[tid] = final; // EpilogueOp + } + + // Make sure the smem result is visible to all warps. + __syncthreads(); +} + +template + +__global__ void weight_norm_fwd_first_dim_kernel + (scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int rowSize) +{ + // We are norming each slowest-dim row of the tensor separately. + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + result = sqrtf(result); + + if(tid == 0) + norms[row] = result; + + // Broadcast load, could use shared memory instead. + accscalar_t g_this_row = scalar_cast(g[row]); + + accscalar_t rnorm = 1.f/result; // for consistency with backward kernel + + // Write data to output + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + w[i+rowStart] = scalar_cast(g_this_row*val_f*rnorm); + } +} + +template + +__global__ void weight_norm_fwd_last_dim_kernel +( + scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int fast_dim_size, + const int slower_dims_size +) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* alloc = (accscalar_t*)buf; + accscalar_t* s = &alloc[0]; + accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y]; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + + // Better to pass an EpilogueOp to reduce_block_into_lanes? 
+ if(threadIdx.y == 0) + { + accscalar_t result = s[threadIdx.x]; + accscalar_t norm_this_col = sqrtf(result); + norms[fast_dim_location] = norm_this_col; + rnorms_this_block[threadIdx.x] = 1.f/norm_this_col; + } + + __syncthreads(); + + accscalar_t g_this_col = scalar_cast(g[fast_dim_location]); + accscalar_t rnorm = rnorms_this_block[threadIdx.x]; + + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + w[currentIdx] = scalar_cast(g_this_col*val_f*rnorm); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +template + +__global__ void weight_norm_bwd_first_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int rowSize) +{ + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t grad_wi = scalar_cast(grad_w[i+rowStart]); + accscalar_t saved_vi = scalar_cast(saved_v[i+rowStart]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + // Could choose to save reciprocal of norm instead I suppose, but norms is probably + // more handy to keep around. + // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[row]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(tid == 0) + grad_g[row] = scalar_cast(result*rnorm); + + // Broadcast load, could use shared memory instead. + accscalar_t g_this_row = scalar_cast(saved_g[row]); + + // Write v gradients. We are reusing values that were loaded earlier, so there + // is an optimization opportunity here (store values persistently). 
+ for(int j = tid; j < rowSize; j += stride ) + { + accscalar_t grad_wj = scalar_cast(grad_w[j+rowStart]); + accscalar_t saved_vj = scalar_cast(saved_v[j+rowStart]); + accscalar_t grad_vj = g_this_row*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[j+rowStart] = scalar_cast(grad_vj); + } +} + +template + +__global__ void weight_norm_bwd_last_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int fast_dim_size, + const int slower_dims_size) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wi = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vi = scalar_cast(saved_v[currentIdx]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + accscalar_t result = s[threadIdx.x]; + + // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[fast_dim_location]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(threadIdx.y == 0) + grad_g[fast_dim_location] = scalar_cast(result*rnorm); + + // Entire block pulls these values, could use shared memory instead. + accscalar_t g_this_col = scalar_cast(saved_g[fast_dim_location]); + + // Write v gradients. + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wj = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vj = scalar_cast(saved_v[currentIdx]); + accscalar_t grad_vj = g_this_col*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[currentIdx] = scalar_cast(grad_vj); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +} // anonymous namespace + +std::tuple weight_norm_cuda + (const Tensor & v, + const Tensor & g, + int64_t dim) +{ + auto w = at::empty_like(v); + + // weight_norm_fused does have a derivative defined in derivatives.yaml, therefore, VariableType.cpp + // sends the unpacked g.data() as the argument. In other words, we expect "g" is a bare Tensor here. + + // norms is only needed to stash for backward. + // g.type().scalarType() may be at::ScalarType::Double, Float, or Half. + // If Half, stash norms as float. + at::ScalarType AccType = g.type().scalarType() == at::ScalarType::Half ? + at::ScalarType::Float : g.type().scalarType(); + // Will this create norms on the same device as g, regardless of what the thread's default + // current device is? I believe so, because Type::* functions are DeviceGuard()ed. 
+ auto norms = g.type().toScalarType(AccType).tensor(g.sizes(), g.strides()); + + const int ndims = v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_first_dim_kernel + <<>> + (w.data(), + norms.data(), + v.data(), + g.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size + int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= v.size(i); + + int fast_dim_size = v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (w.data(), + norms.data(), + v.data(), + g.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{w, norms}; +} + +std::tuple weight_norm_cuda_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // These checks should always succeed, because weight_norm_fused_backward should only + // ever be recorded in the autograd graph via weight_norm, which passes contiguous v and g. + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + AT_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") + + auto grad_v = at::empty_like(saved_v); + auto grad_g = at::empty_like(saved_g); + + const int ndims = saved_v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= saved_v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_first_dim_kernel + <<>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array. 
+ int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= saved_v.size(i); + + int fast_dim_size = saved_v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{grad_v, grad_g}; +} + +#undef BLOCK +#undef TILE_W +#undef TILE_H + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index cdd7a2655473f1..af58e45ebe0ebd 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1746,6 +1746,27 @@ CPU: _s_where_cpu CUDA: _s_where_cuda +- func: norm_except_dim(Tensor v, int64_t pow=2, int64_t dim=0) -> Tensor + variants: function + +# VariableType::_weight_norm does not want to be given a gap in the autograd graph, +# so we don't define "dispatch" variants for it. +- func: _weight_norm(Tensor v, Tensor g, int64_t dim=0) -> Tensor + variants: function + +- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim=0) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda + +- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda_backward + +- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + - func: zeros(IntList size, TensorOptions options={}) -> Tensor - func: zeros_out(Tensor result, IntList size) -> Tensor diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 8accdb97427b27..caec9575ef9cca 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -762,6 +762,12 @@ self: where(condition, grad, zeros_like(grad)) other: where(condition, zeros_like(grad), grad) +# weight_norm_cuda_interface_backward does not have an explicitly defined derivative, so if we do happen +# to be running backward with create_graph=True, fall back to a backward function that uses +# differentiable ops. +- name: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim) + v, g: "GradMode::is_enabled() ? 
_weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_cuda_interface_backward(grad.contiguous(), v, g, result1, dim)" + - name: zero_(Tensor self) self: zeros_like(grad) diff --git a/torch/nn/utils/weight_norm.py b/torch/nn/utils/weight_norm.py index abc8b63cb105b9..ddefa3d6f5f544 100644 --- a/torch/nn/utils/weight_norm.py +++ b/torch/nn/utils/weight_norm.py @@ -2,20 +2,7 @@ Weight Normalization from https://arxiv.org/abs/1602.07868 """ from torch.nn.parameter import Parameter - - -def _norm(p, dim): - """Computes the norm over all dimensions except dim""" - if dim is None: - return p.norm() - elif dim == 0: - output_size = (p.size(0),) + (1,) * (p.dim() - 1) - return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) - elif dim == p.dim() - 1: - output_size = (1,) * (p.dim() - 1) + (p.size(-1),) - return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) - else: - return _norm(p.transpose(0, dim), 0).transpose(0, dim) +from torch import _weight_norm, norm_except_dim class WeightNorm(object): @@ -26,7 +13,7 @@ def __init__(self, name, dim): def compute_weight(self, module): g = getattr(module, self.name + '_g') v = getattr(module, self.name + '_v') - return v * (g / _norm(v, self.dim)) + return _weight_norm(v, g, self.dim) @staticmethod def apply(module, name, dim): @@ -38,7 +25,7 @@ def apply(module, name, dim): del module._parameters[name] # add g and v as new parameters and express w as g/||v|| * v - module.register_parameter(name + '_g', Parameter(_norm(weight, dim).data)) + module.register_parameter(name + '_g', Parameter(norm_except_dim(weight, 2, dim).data)) module.register_parameter(name + '_v', Parameter(weight.data)) setattr(module, name, fn.compute_weight(module)) diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 53d4f769a4e56f..d9c5964109a0c4 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -721,6 +721,11 @@ def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) +@parse_args('v', 'v', 'i') +def _weight_norm(graph, v, g, dim): + return graph.op("ATen", v, g, dim_i=dim, operator_s="_weight_norm") + + @parse_args('v', 't', 't', 't') def elu(g, input, alpha, scale, input_scale): if scale and scale != 1.: From 504126e7050d9c29f4d1ef5bac13399fe50f2730 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 12 Sep 2018 13:57:55 -0700 Subject: [PATCH 087/237] Documentation for debugging JIT Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11540 Differential Revision: D9798647 Pulled By: jamesr66a fbshipit-source-id: 968a4af22c735a848fa27cbadaed9b7023ba8276 --- docs/source/jit.rst | 248 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 244 insertions(+), 4 deletions(-) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 8bc4ce6e336fc9..005c09fa693624 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -535,13 +535,253 @@ Python-defined Constants Debugging ~~~~~~~~~ -Print things +Disable JIT for Debugging + If you want to disable all JIT modes (tracing and scripting) so you can + debug your program in raw Python, you can use the ``PYTORCH_JIT`` environment + variable. ``PYTORCH_JIT`` can be used to globally disable the + JIT by setting its value to ``0``. 
Given an example script:: -Use ``USE_PYTHON=0`` to debug in normal python mode + @torch.jit.script + def scripted_fn(x : torch.Tensor): + for i in range(12): + x = x + x + return x + + + def fn(x): + x = torch.neg(x) + import pdb; pdb.set_trace() + return scripted_fn(x) + + traced_fn = torch.jit.trace(fn, (torch.rand(4, 5),)) + + traced_fn(torch.rand(3, 4)) -Look at the graph + Debugging this script with PDB works except for when we invoke the @script + function. We can globally disable JIT, so that we can call the @script + function as a normal python function and not compile it. If the above script + is called ``disable_jit_example.py``, we can invoke it like so:: -Pay attention to tracer warnings + $ PYTORCH_JIT=0 python disable_jit_example.py + + and we will be able to step into the @script function as a normal Python + function. + + +Interpreting Graphs + TorchScript uses a static single assignment (SSA) intermediate representation + (IR) to represent computation. The instructions in this format consist of + ATen (the C++ backend of PyTorch) operators and other primitive operators, + including control flow operators for loops and conditionals. As an example:: + + @torch.jit.script + def foo(len): + # type: (int) -> torch.Tensor + rv = torch.zeros(3, 4) + for i in range(len): + if i < 10: + rv = rv - 1.0 + else: + rv = rv + 1.0 + return rv + + print(foo.graph) + + A ``ScriptModule`` with a single ``forward`` method will have an attribute + ``graph``, which you can use to inspect the IR representing the computation. + If the ScriptModule has more than one method, you will need to access + ``.graph`` on the method itself and not the module. We can inspect the + graph of a method named ``bar`` on a ScriptModule by accessing ``.bar.graph``. + + The example script above produces the graph:: + + graph(%len : int) { + %13 : float = prim::Constant[value=1]() + %10 : int = prim::Constant[value=10]() + %2 : int = prim::Constant[value=4]() + %1 : int = prim::Constant[value=3]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : int = prim::Constant[value=6]() + %5 : int = prim::Constant[value=0]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6) + %8 : int = prim::Constant[value=1]() + %rv : Dynamic = prim::Loop(%len, %8, %rv.1) + block0(%i : int, %12 : Dynamic) { + %11 : int = aten::lt(%i, %10) + %rv.4 : Dynamic = prim::If(%11) + block0() { + %14 : int = prim::Constant[value=1]() + %rv.2 : Dynamic = aten::sub(%12, %13, %14) + -> (%rv.2) + } + block1() { + %16 : int = prim::Constant[value=1]() + %rv.3 : Dynamic = aten::add(%12, %13, %16) + -> (%rv.3) + } + %19 : int = prim::Constant[value=1]() + -> (%19, %rv.4) + } + return (%rv); + } + + Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for + example. ``%rv.1 : Dynamic`` means we assign the output to a (unique) + value named ``rv.1``, and that value is of ``Dynamic`` type, i.e. we do + not know its concrete shape. ``aten::zeros`` is the operator (equivalent + to ``torch.zeros``) and the input list ``(%3, %4, %5, %6)`` specifies which + values in scope should be passed as inputs. The schema for built-in functions + like ``aten::zeros`` can be found at `Builtin Functions`_. + + Notice that operators can also have associated ``blocks``, namely the + ``prim::Loop`` and ``prim::If`` operators. In the graph print-out, these + operators are formatted to reflect their equivalent source code forms + to facilitate easy debugging. 
+ + Graphs can be inspected as shown to confirm that the computation described + by a ``ScriptModule`` is correct, in both automated and manual fashion, as + described below. + + +Tracing Edge Cases + There are some edge cases that exist where the trace of a given Python + function/module will not be representative of the underlying code. These + cases can include: + + * Tracing of control flow that is dependent on inputs (e.g. tensor shapes) + * Tracing of in-place operations of tensor views (e.g. indexing on the + left-hand side of an assignment) + + Note that these cases may in fact be traceable in the future. + + +Automatic Trace Checking + One way to automatically catch many errors in traces is by using ``check_inputs`` + on the ``torch.jit.trace()`` API. ``check_inputs`` takes a list of tuples + of inputs that will be used to re-trace the computation and verify the + results. For example:: + + def loop_in_traced_fn(x): + result = x[0] + for i in range(x.size(0)): + result = result * x[i] + return result + + inputs = (torch.rand(3, 4, 5),) + check_inputs = [(torch.rand(4, 5, 6),), (torch.rand(2, 3, 4),)] + + traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs) + + Gives us the following diagnostic information:: + + ERROR: Graphs differed across invocations! + Graph diff: + graph(%0 : Dynamic) { + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=0]() + %3 : Dynamic = aten::select(%0, %1, %2) + %4 : int = prim::Constant[value=0]() + %5 : int = prim::Constant[value=0]() + %6 : Dynamic = aten::select(%0, %4, %5) + %7 : Dynamic = aten::mul(%3, %6) + %8 : int = prim::Constant[value=0]() + %9 : int = prim::Constant[value=1]() + %10 : Dynamic = aten::select(%0, %8, %9) + %11 : Dynamic = aten::mul(%7, %10) + %12 : int = prim::Constant[value=0]() + %13 : int = prim::Constant[value=2]() + %14 : Dynamic = aten::select(%0, %12, %13) + %15 : Dynamic = aten::mul(%11, %14) + + %16 : int = prim::Constant[value=0]() + + %17 : int = prim::Constant[value=3]() + + %18 : Dynamic = aten::select(%0, %16, %17) + + %19 : Dynamic = aten::mul(%15, %18) + - return (%15); + ? ^ + + return (%19); + ? ^ + } + + + This message indicates to us that the computation differed between when + we first traced it and when we traced it with the ``check_inputs``. Indeed, + the loop within the body of ``loop_in_traced_fn`` depends on the shape + of the input ``x``, and thus when we try another ``x`` with a different + shape, the trace differs. 
+ + In this case, data-dependent control flow like this can be captured using + script instead:: + + def fn(x): + result = x[0] + for i in range(x.size(0)): + result = result * x[i] + return result + + inputs = (torch.rand(3, 4, 5),) + check_inputs = [(torch.rand(4, 5, 6),), (torch.rand(2, 3, 4),)] + + scripted_fn = torch.jit.script(fn) + print(scripted_fn.graph) + + for input_tuple in [inputs] + check_inputs: + torch.testing.assert_allclose(fn(*input_tuple), scripted_fn(*input_tuple)) + + + Which produces:: + + graph(%x : Dynamic) { + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=0]() + %result.1 : Dynamic = aten::select(%x, %2, %1) + %4 : int = aten::size(%x, %1) + %5 : int = prim::Constant[value=1]() + %result : Dynamic = prim::Loop(%4, %5, %result.1) + block0(%i : int, %7 : Dynamic) { + %9 : int = prim::Constant[value=0]() + %10 : Dynamic = aten::select(%x, %9, %i) + %result.2 : Dynamic = aten::mul(%7, %10) + %12 : int = prim::Constant[value=1]() + -> (%12, %result.2) + } + return (%result); + } + + +Tracer Warnings + The tracer produces warnings for several problematic patterns in traced + computation. As an example, take a trace of a function that contains an + in-place assignment on a slice (a view) of a Tensor:: + + def fill_row_zero(x): + x[0] = torch.rand(*x.shape[1:2]) + return x + + traced = torch.jit.trace(fill_row_zero, (torch.rand(3, 4),)) + print(traced.graph) + + + Produces several warnings and a graph which simply returns the input:: + + fill_row_zero.py:4: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator copy_ (possibly due to an assignment). This might cause the trace to be incorrect, because all other views that also reference this data will not not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe. + x[0] = torch.rand(*x.shape[1:2]) + fill_row_zero.py:6: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + Not within tolerance rtol=1e-05 atol=1e-05 at input[0, 1] (0.09115803241729736 vs. 0.6782537698745728) and 3 other locations (33.00%) + traced = torch.jit.trace(fill_row_zero, (torch.rand(3, 4),)) + graph(%0 : Float(3, 4)) { + return (%0); + } + + We can fix this by modifying the code to not use the in-place update, but + rather build up the result tensor out-of-place with `torch.cat`:: + + def fill_row_zero(x): + x = torch.cat((torch.rand(1, *x.shape[1:2]), x[1:2]), dim=0) + return x + + traced = torch.jit.trace(fill_row_zero, (torch.rand(3, 4),)) + print(traced.graph) Builtin Functions From f0a440007e9e671d7ccf3123f6f209e74586ff8e Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 13:59:50 -0700 Subject: [PATCH 088/237] Explicitly set locale on docs build. (#11595) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11595 Differential Revision: D9798567 Pulled By: ezyang fbshipit-source-id: ac05458347e181960a07cacae1dfc68d2837451f --- .jenkins/pytorch/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 7d491e0fccb7ef..2dc64157c5d00d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -118,7 +118,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here pip install -r requirements.txt || true - make html + LC_ALL=C make html popd fi From 958ba4e91383e00697e59862acf0f101b25a66c9 Mon Sep 17 00:00:00 2001 From: Lingyi Liu Date: Wed, 12 Sep 2018 14:24:16 -0700 Subject: [PATCH 089/237] Aibench for asr decoder Summary: as title Reviewed By: sf-wind Differential Revision: D9738021 fbshipit-source-id: 98f570484bca6486ad99207732efd534ec7e3251 --- modules/observers/perf_observer.cc | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index ed391a3e3ff5dc..93ae7868e7f622 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -1,11 +1,42 @@ #include "observers/perf_observer.h" #include "observers/observer_config.h" +#if !CAFFE2_MOBILE +#include "caffe2/core/flags.h" +#include "observers/net_observer_reporter_print.h" +#endif #include #include "caffe2/core/common.h" #include "caffe2/core/init.h" #include "caffe2/core/operator.h" +#if !CAFFE2_MOBILE +CAFFE2_DEFINE_int64( + aiBench_netInitSampleRate, + 0, + "One in N sampling rate for net delay"); + +CAFFE2_DEFINE_int64( + aiBench_netFollowupSampleRate, + 0, + "One in N sampling rate for net delay"); + +CAFFE2_DEFINE_int64( + aiBench_netFollowupSampleCount, + 0, + "control the following c logs"); + +CAFFE2_DEFINE_int64( + aiBench_operatorNetSampleRatio, + 0, + "One in N sampling rate for operator delay"); + +CAFFE2_DEFINE_int64( + aiBench_skipIters, + 0, + "skip the first N iterations of the net run"); +#endif + namespace caffe2 { namespace { @@ -13,6 +44,20 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) { AddGlobalNetObserverCreator([](NetBase* subject) { return caffe2::make_unique(subject); }); + +#if !CAFFE2_MOBILE + // for aibench usage + caffe2::ObserverConfig::setReporter( + caffe2::make_unique()); + + caffe2::ObserverConfig::initSampleRate( + FLAGS_aiBench_netInitSampleRate, + FLAGS_aiBench_netFollowupSampleRate, + FLAGS_aiBench_netFollowupSampleCount, + FLAGS_aiBench_operatorNetSampleRatio, + FLAGS_aiBench_skipIters); +#endif + return true; } } // namespace From d4e05f4e1e276055cd3d3e1a2a1e186e6c6405ee Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Wed, 12 Sep 2018 14:39:23 -0700 Subject: [PATCH 090/237] Move function deletion from the stack to the heap. (#11534) Summary: This eliminates the need for any heuristics regarding stack size limits. 
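The approach taken in the patch below can be summarized by this simplified sketch, written against a hypothetical `Node` type rather than the actual autograd `Function`/`Edge` classes: instead of letting `shared_ptr` destructors recurse through the graph, children for which we hold the last reference are collected into a heap-allocated worklist and freed from a loop.

```cpp
#include <memory>
#include <vector>

// Hypothetical stand-in for an autograd node that owns its successors.
struct Node {
  std::vector<std::shared_ptr<Node>> next;
};

// Iterative deletion: gather children whose only remaining owner is the node
// being destroyed, free the current node, then drain the worklist without
// growing the call stack.
void delete_node(Node* node) {
  std::vector<std::shared_ptr<Node>> stack;
  for (auto& child : node->next) {
    if (child.use_count() == 1) {
      stack.emplace_back(std::move(child));
    }
  }
  delete node;

  while (!stack.empty()) {
    auto curr = std::move(stack.back());
    stack.pop_back();
    for (auto& child : curr->next) {
      if (child.use_count() == 1) {
        stack.emplace_back(std::move(child));
      }
    }
    // `curr` is destroyed at the end of this iteration; children that were
    // moved into the worklist are freed later by the loop, so destruction
    // never recurses deeply no matter how long the chain is.
  }
}
```
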
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11534 Differential Revision: D9779866 Pulled By: resistor fbshipit-source-id: 96753eead7904bbdc2869fb01f7bd42141032347 --- torch/csrc/autograd/function.cpp | 122 ++++++++++--------------------- torch/csrc/autograd/function.h | 2 - torch/csrc/autograd/init.cpp | 5 -- 3 files changed, 40 insertions(+), 89 deletions(-) diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index ba33a6f116bac2..6090b520bd36ab 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -36,95 +36,53 @@ AnomalyMetadata* Function::metadata() noexcept { return anomaly_metadata_.get(); } -/* - * Fix for #5534: prevent stack overflow on deletion of deep computation graph - * - * Sometimes one can end up with a very big computation graph of Functions - * and Edges. Each std::shared_ptr contains a list of Edge, and - * each Edge contains a std::shared_ptr. Deleting a - * std::shared_ptr can trigger the recursive deletion of other - * std::shared_ptr's: this can stack overflow if the graph - * is deep enough. Here is an example of such a graph: - * - * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr - * - * The solution here is to use a custom deleter with each - * std::shared_ptr. The custom deleter keeps track of how many - * nested deleters it is in. When this number exceeds the maximum allowed - * depth, the Function* to be deleted are accumulated in a per-thread - * delete queue and handled by one of the deleters. - * - * Note that these custom deleters are NOT necessary for deleting PyFunction. - * This is because a THPFunction Python object owns a PyFunction that is in a - * computation graph. When Python objects get recursively destroyed, they - * are also queued into a delete list. This happens very early for them - * (at 50 deleters): https://github.com/python/cpython/blob/f320be77ffb73e3b9e7fc98c37b8df3975d84b40/Include/object.h#L1024-L1063 - * so we don't need to worry about them. - */ - -thread_local std::deque deleteFunctionQueue; -thread_local size_t deleteFunctionRecursionDepth = 0; - -/* - * If this number is set too high, a deep computation graph can still - * stack overflow. The procedure for setting this number was to - * 1) find the smallest value that would not guard against stack overflows - * on various machines - * 2) Take the minimum of all such values and subtract some leeway because - * the memory of these stack frames will probably grow as time passes. - * Testing on a few machines machines, the magic numbers were: - * - Mac OSX (Macbook Pro 15) : ~60000 - * - A beefy Ubuntu 16.04 box : ~15000 - * - Windows AWS instance (g3.4xlarge): variable. My two attempts at different - * times have gotten the following numbers: ~8300, 3669 - */ -#ifdef _WIN32 -size_t deleteFunctionMaxRecursionDepth = 3000; -#else -size_t deleteFunctionMaxRecursionDepth = 10000; -#endif - -struct RecursionDepthCounter { - public: - explicit RecursionDepthCounter() { - ++deleteFunctionRecursionDepth; - } - ~RecursionDepthCounter() { - --deleteFunctionRecursionDepth; - } - - size_t value() { - return deleteFunctionRecursionDepth; +static void gatherFunctions(Function* func, + std::vector>& stack) { + for (auto& edge : func->next_edges()) { + if (edge.function.use_count() == 1) { + stack.emplace_back(std::move(edge.function)); + } } -}; +} /* - * Note that the custom deleter deletes in BFS style. Without using - * the custom deleter, the computation graph is deleted in a DFS style. 
- * The BFS deletion is valid (and safe) because if a shared_ptr - * 's reference count hits 0, nothing else will access it. - */ + * Fix for #5534: prevent stack overflow on deletion of deep computation graph + * + * Sometimes one can end up with a very big computation graph of Functions + * and Edges. Each std::shared_ptr contains a list of Edge, and + * each Edge contains a std::shared_ptr. Deleting a + * std::shared_ptr can trigger the recursive deletion of other + * std::shared_ptr's: this can stack overflow if the graph + * is deep enough. Here is an example of such a graph: + * + * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr + * + * The solution here is to detect when we are decrementing away the last + * reference to a Function, and when doing so to buffer up the Function's + * that will be recursively decremented. We can then decrement (and free) + * the original Function without causing a recursive cascade, before + * draining the buffer applying the same behavior. This is, in effect, + * converting recursion to a loop, using a heap buffer in place of the + * recursive call stack. + */ void deleteFunction(Function* function) { - RecursionDepthCounter recursion_depth; + // To avoid stack overflow on large computational graphs, + // we need to track reference decrementing and freeing + // on the heap. + std::vector> stack; + gatherFunctions(function, stack); + delete function; - if (recursion_depth.value() > deleteFunctionMaxRecursionDepth) { - deleteFunctionQueue.push_back(function); - return; - } + while (!stack.empty()) { + auto& curr_func = stack.back(); - delete function; + if (curr_func.use_count() == 1) { + // If this is the last reference, gather function references + // that will be recursively decremented. + gatherFunctions(curr_func.get(), stack); + } - if (deleteFunctionQueue.empty()) { - return; - } - if (recursion_depth.value() != deleteFunctionMaxRecursionDepth) { - AT_ERROR("Only one deleter per thread should be able to process " - "the delete queue. Please open an issue."); - } - while (!deleteFunctionQueue.empty()) { - auto queued_function = deleteFunctionQueue.front(); - deleteFunctionQueue.pop_front(); - delete queued_function; + stack.pop_back(); } } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index dfe8683197f8ee..b4c90b1489a261 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -33,8 +33,6 @@ using edge_list = std::vector; using saved_variable_list = std::vector; using IndexRange = std::pair; -TORCH_API extern size_t deleteFunctionMaxRecursionDepth; - // Custom deleter to prevent stack overflows. void deleteFunction(Function* function); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index b306426025e025..0be1cc2600a2f0 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -50,11 +50,6 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); - /// TODO: Replace this ASAP with a better solution for deep autograd graphs! 
- m.def("_unsafe_set_delete_function_max_recursion_depth", [](size_t value) { - torch::autograd::deleteFunctionMaxRecursionDepth = value; - }); - Py_RETURN_TRUE; } From 02c4cd3c8a3a4234ffbddde8d1214f0cfb209086 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 12 Sep 2018 14:45:52 -0700 Subject: [PATCH 091/237] Skip flaky distributed tests (#11594) Summary: context: https://github.com/pytorch/pytorch/issues/11582 cc pietern The controller you requested could not be found. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11594 Differential Revision: D9798871 Pulled By: SsnL fbshipit-source-id: 9f9e1871c7fd9505ca898865eb8068fab4d3416d --- test/test_distributed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_distributed.py b/test/test_distributed.py index abf3663966700b..6dd0d984e842d3 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -408,6 +408,7 @@ def test_broadcast(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -642,6 +643,7 @@ def test_all_reduce_sum(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_all_reduce_sum_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -930,6 +932,7 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1190,6 +1193,7 @@ def test_DistributedDataParallelCPU(self): "Only Nccl & Gloo backend support DistributedDataParallel") @skip_if_no_cuda_distributed @skip_if_no_gpu + @unittest.skip("Flaky test, see pytorch#11582") def test_DistributedDataParallel(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() From b663b7ce7ede61e48cf1d985f534bf028f94f343 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 15:14:54 -0700 Subject: [PATCH 092/237] Update ROCm Docker image with latest AMD debians (#11507) Summary: Building at https://ci.pytorch.org/jenkins/job/caffe2-docker-trigger/194/ Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11507 Differential Revision: D9772474 Pulled By: ezyang fbshipit-source-id: ab00f05744547dc7ec9f97511e2c8495ac282fac --- docker/caffe2/jenkins/common/install_rocm.sh | 29 ++++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh index c69d857118b2d7..82692d0acdb9b6 100644 --- a/docker/caffe2/jenkins/common/install_rocm.sh +++ b/docker/caffe2/jenkins/common/install_rocm.sh @@ -60,25 +60,30 @@ install_rocrand() { # Install rocSPARSE/hipSPARSE that will be released soon - can co-exist w/ hcSPARSE which will be removed soon install_hipsparse() { mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.1.0.deb -o /opt/rocm/debians/rocsparse.deb - curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.1.0.deb -o /opt/rocm/debians/hipsparse.deb + curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.2.114-Linux.deb -o /opt/rocm/debians/rocsparse.deb + curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.2.55-Linux.deb -o /opt/rocm/debians/hipsparse.deb dpkg -i /opt/rocm/debians/rocsparse.deb dpkg -i /opt/rocm/debians/hipsparse.deb } # Install custom hcc containing two compiler fixes relevant to PyTorch install_customhcc() { + HIP_VERSION="1.5.18354" mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-1.2.18272-Linux.deb - curl https://s3.amazonaws.com/ossci-linux/hip_base-1.5.18276.deb -o /opt/rocm/debians/hip_base-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_doc-1.5.18276.deb -o /opt/rocm/debians/hip_doc-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_samples-1.5.18276.deb -o /opt/rocm/debians/hip_samples-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_hcc-1.5.18276.deb -o /opt/rocm/debians/hip_hcc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hcc-1.2.18272-Linux.deb - dpkg -i /opt/rocm/debians/hip_base-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_doc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_samples-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_hcc-1.5.18276.deb + curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-Linux.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_base-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_base.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_doc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_doc.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_samples-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_samples.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_hcc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_hcc.deb + dpkg -i /opt/rocm/debians/hcc-Linux.deb + dpkg -i /opt/rocm/debians/hip_base.deb + dpkg -i /opt/rocm/debians/hip_doc.deb + dpkg -i /opt/rocm/debians/hip_samples.deb + dpkg -i /opt/rocm/debians/hip_hcc.deb + + if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake + fi } # Install Python packages depending on the base OS From ac94889939573d00ba10394766f036dc90ae5a21 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Wed, 12 Sep 2018 15:16:05 -0700 Subject: [PATCH 093/237] Add jit doc entry to sidebar (#11598) Summary: cc zdevito apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11598 Differential Revision: D9801230 Pulled By: SsnL fbshipit-source-id: f0c8d2468b64a50c3c437667d462722dcd2682d1 
--- docs/source/index.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 7732366d38a61c..31c6c3f743da06 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,8 +32,8 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. optim torch.autograd torch.distributed - torch.distributed.deprecated torch.distributions + torch.jit torch.multiprocessing bottleneck checkpoint @@ -43,6 +43,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. ffi model_zoo onnx + torch.distributed.deprecated torch.legacy .. toctree:: From c81406c5148bbbf2b0a4e3ef92636fe70fa2d6a6 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 15:39:27 -0700 Subject: [PATCH 094/237] Document Any (#11580) Summary: Documents the `AnyModule` class in the C++ API. Also changed the API to be friendlier by default. Calling `AnyModule::forward` used to return an `AnyModule::Value` which you had to call `.get()` on to cast to a concrete type. I changed the name of that `forward` method to `any_forward` and instead made `forward` templated on a `ReturnType` template parameter which you can supply to do the `.get` cast for you automatically. I default this parameter to `torch::Tensor` so that it can often be omitted. So where you used to have to write ```cpp any_module.forward(...).get(); any_module.forward(...).get(); ``` you now write ```cpp any_module.forward(...); any_module.forward(...); ``` ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11580 Differential Revision: D9798626 Pulled By: goldsborough fbshipit-source-id: 060b4ea28facaffc417f53b80b846a9dff9acb73 --- test/cpp/api/any.cpp | 28 +++-- torch/csrc/api/include/torch/nn/modules/any.h | 105 ++++++++++++++++-- .../api/include/torch/nn/modules/sequential.h | 6 +- 3 files changed, 117 insertions(+), 22 deletions(-) diff --git a/test/cpp/api/any.cpp b/test/cpp/api/any.cpp index ab044b84dc056d..9368d4dd5a4983 100644 --- a/test/cpp/api/any.cpp +++ b/test/cpp/api/any.cpp @@ -22,8 +22,9 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE(any.forward().get() == 123); + REQUIRE(any.forward() == 123); } + SECTION("int(int)") { struct M : torch::nn::Module { int forward(int x) { @@ -31,8 +32,9 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE(any.forward(5).get() == 5); + REQUIRE(any.forward(5) == 5); } + SECTION("const char*(const char*)") { struct M : torch::nn::Module { const char* forward(const char* x) { @@ -40,7 +42,7 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE(any.forward("hello").get() == std::string("hello")); + REQUIRE(any.forward("hello") == std::string("hello")); } SECTION("string(int, const double)") { @@ -51,7 +53,7 @@ TEST_CASE("any-module") { }; AnyModule any(M{}); int x = 4; - REQUIRE(any.forward(x, 3.14).get() == std::string("7")); + REQUIRE(any.forward(x, 3.14) == std::string("7")); } SECTION("Tensor(string, const string&, string&&)") { @@ -66,8 +68,8 @@ TEST_CASE("any-module") { }; AnyModule any(M{}); REQUIRE( - any.forward(std::string("a"), std::string("ab"), std::string("abc")) - .get() + any.forward( + std::string("a"), std::string("ab"), std::string("abc")) .sum() .toCInt() == 6); } @@ -181,7 +183,7 @@ TEST_CASE("any-module") { any.forward(5), StartsWith("Cannot call forward() on an empty AnyModule")); } - SECTION("can move assign differentm modules") { + SECTION("can move assign different modules") { struct M : 
torch::nn::Module { std::string forward(int x) { return std::to_string(x); @@ -196,10 +198,10 @@ TEST_CASE("any-module") { REQUIRE(any.is_empty()); any = std::make_shared(); REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5).get() == "5"); + REQUIRE(any.forward(5) == "5"); any = std::make_shared(); REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5.0f).get() == 8); + REQUIRE(any.forward(5.0f) == 8); } SECTION("constructs from ModuleHolder") { struct MImpl : torch::nn::Module { @@ -218,6 +220,10 @@ TEST_CASE("any-module") { AnyModule any(M{5}); REQUIRE(any.get().value == 5); REQUIRE(any.get()->value == 5); + + AnyModule module(Linear(3, 4)); + std::shared_ptr ptr = module.ptr(); + Linear linear(module.get()); } SECTION("converts autograd::Variable to torch::Tensor correctly") { struct M : torch::nn::Module { @@ -232,12 +238,10 @@ TEST_CASE("any-module") { AnyModule any(M{}); REQUIRE( any.forward(torch::autograd::Variable(torch::ones(5))) - .get() .sum() .toCFloat() == 5); // at::Tensors that are not variables work too. - REQUIRE( - any.forward(at::ones(5)).get().sum().toCFloat() == 5); + REQUIRE(any.forward(at::ones(5)).sum().toCFloat() == 5); } } } diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 920aea6aab3a54..626149469b5035 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -21,10 +21,87 @@ namespace torch { namespace nn { -/// A class to store a type erased module, whose `forward()` method can be -/// invoked, with dynamic type checking. An `AnyModule` has an empty state, into -/// which it is default constructed. `is_empty()` can be used to query whether -/// the `AnyModule` is empty. +/// Stores a type erased `Module`. +/// +/// The PyTorch C++ API does not impose an interface on the signature of +/// `forward()` in `Module` subclasses. This gives you complete freedom to +/// design your `forward()` methods to your liking. However, this also means +/// there is no unified base type you could store in order to call `forward()` +/// polymorphically for any module. This is where the `AnyModule` comes in. +/// Instead of inheritance, it relies on type erasure for polymorphism. +/// +/// An `AnyModule` can store any `nn::Module` subclass that provides a +/// `forward()` method. This `forward()` may accept any types and return any +/// type. Once stored in an `AnyModule`, you can invoke the underlying module's +/// `forward()` by calling `AnyModule::forward()` with the arguments you would +/// supply to the stored module (though see one important limitation below). +/// Example: +/// +/// \rst +/// .. code-block:: +/// struct GenericTrainer { +/// torch::nn::AnyModule module; +/// +/// void train(torch::Tensor input) { +/// module.forward(input); +/// } +/// }; +/// +/// GenericTrainer trainer1{torch::nn::Linear(3, 4)}; +/// GenericTrainer trainer2{torch::nn::Conv2d(3, 4, 2)}; +/// \endrst +/// +/// As `AnyModule` erases the static type of the stored module (and its +/// `forward()` method) to achieve polymorphism, type checking of arguments is +/// moved to runtime. That is, passing an argument with an incorrect type to an +/// `AnyModule` will compile, but throw an exception at runtime: +/// +/// \rst +/// .. code-block:: +/// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); +/// // Linear takes a tensor as input, but we are passing an integer. +/// // This will compile, but throw a `torch::Error` exception at runtime. 
+/// module.forward(123); +/// \endrst +/// +/// \rst +/// .. attention:: +/// One noteworthy limitation of `AnyModule` is that its `forward()` method +/// does not support implicit conversion of argument types. For example, if +/// the stored module's `forward()` method accepts a `float` and you call +/// `any_module.forward(3.4)` (where `3.4` is a `double`), this will throw +/// an exception. +/// \endrst +/// +/// The return type of the `AnyModule`'s `forward()` method is controlled via +/// the first template argument to `AnyModule::forward()`. It defaults to +/// `torch::Tensor`. To change it, you can write `any_module.forward()`, +/// for example. +/// +/// \rst +/// .. code-block:: +/// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); +/// auto output = module.forward(torch::ones({2, 3})); +/// +/// struct IntModule { +/// int forward(int x) { return x; } +/// }; +/// torch::nn::AnyModule module(IntModule{}); +/// int output = module.forward(5); +/// \endrst +/// +/// The only other method an `AnyModule` provides access to on the stored +/// module is `clone()`. However, you may acquire a handle on the module via +/// `.ptr()`, which returns a `shared_ptr`. Further, if you know +/// the concrete type of the stored module, you can get a concrete handle to it +/// using `.get()` where `T` is the concrete module type. +/// +/// \rst +/// .. code-block:: +/// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); +/// std::shared_ptr ptr = module.ptr(); +/// torch::nn::Linear linear(module.get()); +/// \endrst class AnyModule { public: /// A type-erased value. @@ -69,7 +146,13 @@ class AnyModule { /// returns the return value as an `Value`. Use this method when chaining /// `AnyModule`s in a loop. template - Value forward(ArgumentTypes&&... arguments); + Value any_forward(ArgumentTypes&&... arguments); + + /// Invokes `forward()` on the contained module with the given arguments, and + /// casts the returned `Value` to the supplied `ReturnType` (which defaults to + /// `torch::Tensor`). + template + ReturnType forward(ArgumentTypes&&... arguments); /// Attempts to cast the underlying module to the given module type. Throws an /// exception if the types do not match. @@ -358,7 +441,7 @@ AnyModule& AnyModule::operator=(std::shared_ptr module) { } template -AnyModule::Value AnyModule::forward(ArgumentTypes&&... arguments) { +AnyModule::Value AnyModule::any_forward(ArgumentTypes&&... arguments) { AT_CHECK(!is_empty(), "Cannot call forward() on an empty AnyModule"); std::vector values; values.reserve(sizeof...(ArgumentTypes)); @@ -368,6 +451,12 @@ AnyModule::Value AnyModule::forward(ArgumentTypes&&... arguments) { return content_->forward(std::move(values)); } +template +ReturnType AnyModule::forward(ArgumentTypes&&... arguments) { + return any_forward(std::forward(arguments)...) + .template get(); +} + template T& AnyModule::get() { AT_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); @@ -393,9 +482,9 @@ inline std::shared_ptr AnyModule::ptr() const { template std::shared_ptr AnyModule::ptr() const { AT_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); - /// Call get() but discard the value, just to do the type checking. + // Call get() but discard the value, just to do the type checking. 
get_(); - return std::static_pointer_cast(ptr()); + return std::dynamic_pointer_cast(ptr()); } inline const std::type_info& AnyModule::type_info() const { diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index 9f3f7a0b983ab9..384afd1b7814bb 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -57,10 +58,11 @@ class SequentialImpl : public Cloneable { AT_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); auto iterator = modules_.begin(); - auto input = iterator->forward(std::forward(arguments)...); + auto input = + iterator->any_forward(std::forward(arguments)...); for (++iterator; iterator != modules_.end(); ++iterator) { - input = iterator->forward(std::move(input)); + input = iterator->any_forward(std::move(input)); } // Check the return value and give a nice error message if the requsted From eb7a298489f5e755846c7fb6af1bc6ae338265f1 Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Wed, 12 Sep 2018 15:53:29 -0700 Subject: [PATCH 095/237] Add resnext model to OSS (#11468) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11468 Add resnext model into OSS Caffe 2 repo. Reviewed By: orionr, kuttas Differential Revision: D9506000 fbshipit-source-id: 236005d5d7dbeb8c2864014b1eea03810618d8e8 --- caffe2/python/examples/resnet50_trainer.py | 91 +++++-- caffe2/python/models/resnet.py | 264 ++++++++++++++------- 2 files changed, 253 insertions(+), 102 deletions(-) diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py index e1716857bb16b0..05b753b8fd397e 100644 --- a/caffe2/python/examples/resnet50_trainer.py +++ b/caffe2/python/examples/resnet50_trainer.py @@ -22,8 +22,10 @@ from caffe2.python.predictor_constants import predictor_constants as predictor_constants ''' -Parallelized multi-GPU distributed trainer for Resnet 50. Can be used to train -on imagenet data, for example. +Parallelized multi-GPU distributed trainer for Resne(X)t. +Can be used to train on imagenet data, for example. +The default parameters can train a standard Resnet-50 (1x64d), and parameters +can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d). To run the trainer in single-machine multi-gpu mode by setting num_shards = 1. @@ -39,14 +41,23 @@ ''' logging.basicConfig() -log = logging.getLogger("resnet50_trainer") +log = logging.getLogger("ResNe(X)t_trainer") log.setLevel(logging.DEBUG) dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops') dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops') -def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): +def AddImageInput( + model, + reader, + batch_size, + img_size, + dtype, + is_test, + mean_per_channel=None, + std_per_channel=None, +): ''' The image input operator loads image and label data from the reader and applies transformations to the images (random cropping, mirroring, ...). 
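# A minimal usage sketch of the expanded signature above, assuming a
# `test_model` and `reader` already exist as elsewhere in this script;
# the numeric values are illustrative only (when given, mean_per_channel
# takes precedence over the scalar mean):
AddImageInput(
    test_model,
    reader,
    batch_size=32,
    img_size=227,
    dtype='float',
    is_test=True,
    mean_per_channel=[104.0, 117.0, 124.0],  # illustrative per-channel means (0-255 scale)
    std_per_channel=[58.0, 57.0, 57.0],      # illustrative per-channel stds (0-255 scale)
)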
@@ -58,6 +69,9 @@ def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): output_type=dtype, use_gpu_transform=True if model._device_type == 1 else False, use_caffe_datum=True, + mean_per_channel=mean_per_channel, + std_per_channel=std_per_channel, + # mean_per_channel takes precedence over mean mean=128., std=128., scale=256, @@ -166,6 +180,7 @@ def RunEpoch( # TODO: add loading from checkpoint log.info("Starting epoch {}/{}".format(epoch, args.num_epochs)) epoch_iters = int(args.epoch_size / total_batch_size / num_shards) + test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards) for i in range(epoch_iters): # This timeout is required (temporarily) since CUDA-NCCL # operators might deadlock when synchronizing between GPUs. @@ -194,19 +209,25 @@ def RunEpoch( data_parallel_model.GetLearningRateBlobNames(train_model)[0] ) test_accuracy = 0 - if (test_model is not None): + test_accuracy_top5 = 0 + if test_model is not None: # Run 100 iters of testing ntests = 0 - for _ in range(0, 100): + for _ in range(test_epoch_iters): workspace.RunNet(test_model.net.Proto().name) for g in test_model._devices: test_accuracy += np.asscalar(workspace.FetchBlob( "{}_{}".format(test_model._device_prefix, g) + '/accuracy' )) + test_accuracy_top5 += np.asscalar(workspace.FetchBlob( + "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5' + )) ntests += 1 test_accuracy /= ntests + test_accuracy_top5 /= ntests else: test_accuracy = (-1) + test_accuracy_top5 = (-1) explog.log( input_count=num_images, @@ -216,7 +237,8 @@ def RunEpoch( 'loss': loss, 'learning_rate': learning_rate, 'epoch': epoch, - 'test_accuracy': test_accuracy, + 'top1_test_accuracy': test_accuracy, + 'top5_test_accuracy': test_accuracy_top5, } ) assert loss < 40, "Exploded gradients :(" @@ -243,6 +265,17 @@ def Train(args): total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" + # Verify valid image mean/std per channel + if args.image_mean_per_channel: + assert \ + len(args.image_mean_per_channel) == args.num_channels, \ + "The number of channels of image mean doesn't match input" + + if args.image_std_per_channel: + assert \ + len(args.image_std_per_channel) == args.num_channels, \ + "The number of channels of image std doesn't match input" + # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) @@ -262,7 +295,7 @@ def Train(args): 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( - name="resnet50", arg_scope=train_arg_scope + name='resnext' + str(args.num_layers), arg_scope=train_arg_scope ) num_shards = args.num_shards @@ -324,7 +357,7 @@ def Train(args): rendezvous = None # Model building functions - def create_resnet50_model_ops(model, loss_scale): + def create_resnext_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) @@ -333,11 +366,14 @@ def create_resnet50_model_ops(model, loss_scale): BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): - pred = resnet.create_resnet50( + pred = resnet.create_resnext( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, + num_layers=args.num_layers, + num_groups=args.resnext_num_groups, + num_width_per_group=args.resnext_width_per_group, no_bias=True, no_loss=True, ) @@ -348,7 +384,8 @@ def 
create_resnet50_model_ops(model, loss_scale): softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy") + brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) + brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def add_optimizer(model): @@ -408,6 +445,8 @@ def add_image_input(model): img_size=args.image_size, dtype=args.dtype, is_test=False, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) def add_post_sync_ops(model): @@ -423,7 +462,7 @@ def add_post_sync_ops(model): data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, - forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, @@ -449,7 +488,9 @@ def add_post_sync_ops(model): 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( - name="resnet50_test", arg_scope=test_arg_scope, init_params=False + name='resnext' + str(args.num_layers) + "_test", + arg_scope=test_arg_scope, + init_params=False, ) test_reader = test_model.CreateDB( @@ -466,12 +507,14 @@ def test_input_fn(model): img_size=args.image_size, dtype=args.dtype, is_test=True, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, - forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, @@ -497,7 +540,8 @@ def test_input_fn(model): else: log.warning("The format of load_model_path doesn't match!") - expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( + expname = "resnext_%d_gpu%d_b%d_L%d_lr%.2f_v2" % ( + args.num_layers, args.num_gpus, total_batch_size, args.num_labels, @@ -534,12 +578,24 @@ def test_input_fn(model): def main(): # TODO: use argv parser = argparse.ArgumentParser( - description="Caffe2: Resnet-50 training" + description="Caffe2: ResNe(X)t training" ) parser.add_argument("--train_data", type=str, default=None, required=True, help="Path to training data (or 'null' to simulate)") + parser.add_argument("--num_layers", type=int, default=50, + help="The number of layers in ResNe(X)t model") + parser.add_argument("--resnext_num_groups", type=int, default=1, + help="The cardinality of resnext") + parser.add_argument("--resnext_width_per_group", type=int, default=64, + help="The cardinality of resnext") parser.add_argument("--test_data", type=str, default=None, help="Path to test data") + parser.add_argument("--image_mean_per_channel", type=float, nargs='+', + help="The per channel mean for the images") + parser.add_argument("--image_std_per_channel", type=float, nargs='+', + help="The per channel standard deviation for the images") + parser.add_argument("--test_epoch_size", type=int, default=50000, + help="Number of test images") parser.add_argument("--db_type", type=str, default="lmdb", help="Database type (such as lmdb or leveldb)") parser.add_argument("--gpus", type=str, @@ -576,7 +632,7 @@ def main(): help="Port of Redis server (for rendezvous)") parser.add_argument("--file_store_path", type=str, default="/tmp", help="Path to directory to use for rendezvous") - parser.add_argument("--save_model_name", type=str, default="resnet50_model", + 
parser.add_argument("--save_model_name", type=str, default="resnext_model", help="Save the trained model to a given name") parser.add_argument("--load_model_path", type=str, default=None, help="Load previously saved model to continue training") @@ -598,6 +654,7 @@ def main(): Train(args) + if __name__ == '__main__': workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) main() diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 60e00ed1a1ae24..7c6c6dc27fe10a 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -6,9 +6,12 @@ from __future__ import print_function from caffe2.python import brew +import logging + ''' -Utility for creating ResNets -See "Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +Utility for creating ResNe(X)t +"Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +"Aggregated Residual Transformations for Deep Neural Networks" by Xie et. al. 2016 ''' @@ -17,16 +20,33 @@ class ResNetBuilder(): Helper class for constructing residual blocks. ''' - def __init__(self, model, prev_blob, no_bias, is_test, spatial_bn_mom=0.9): + def __init__( + self, + model, + prev_blob, + no_bias, + is_test, + bn_epsilon=1e-5, + bn_momentum=0.9, + ): self.model = model self.comp_count = 0 self.comp_idx = 0 self.prev_blob = prev_blob self.is_test = is_test - self.spatial_bn_mom = spatial_bn_mom + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum self.no_bias = 1 if no_bias else 0 - def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): + def add_conv( + self, + in_filters, + out_filters, + kernel, + stride=1, + group=1, + pad=0, + ): self.comp_idx += 1 self.prev_blob = brew.conv( self.model, @@ -37,6 +57,7 @@ def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): weight_init=("MSRAFill", {}), kernel=kernel, stride=stride, + group=group, pad=pad, no_bias=self.no_bias, ) @@ -56,8 +77,8 @@ def add_spatial_bn(self, num_filters): self.prev_blob, 'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx), num_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) return self.prev_blob @@ -71,7 +92,8 @@ def add_bottleneck( input_filters, # num of feature maps from preceding layer base_filters, # num of filters internally in the component output_filters, # num of feature maps to output - down_sampling=False, + stride=1, + group=1, spatial_batch_norm=True, ): self.comp_idx = 0 @@ -82,7 +104,7 @@ def add_bottleneck( input_filters, base_filters, kernel=1, - stride=1 + stride=1, ) if spatial_batch_norm: @@ -95,8 +117,9 @@ def add_bottleneck( base_filters, base_filters, kernel=3, - stride=(1 if down_sampling is False else 2), - pad=1 + stride=stride, + group=group, + pad=1, ) if spatial_batch_norm: @@ -109,9 +132,10 @@ def add_bottleneck( last_conv = self.add_spatial_bn(output_filters) # Summation with input signal (shortcut) - # If we need to increase dimensions (feature maps), need to - # do a projection for the short cut - if (output_filters > input_filters): + # When the number of feature maps mismatch between the input + # and output (this usually happens when the residual stage + # changes), we need to do a projection for the short cut + if output_filters != input_filters: shortcut_blob = brew.conv( self.model, shortcut_blob, @@ -120,7 +144,7 @@ def add_bottleneck( output_filters, weight_init=("MSRAFill", {}), kernel=1, - stride=(1 if down_sampling is False else 2), + 
stride=stride, no_bias=self.no_bias, ) if spatial_batch_norm: @@ -129,8 +153,8 @@ def add_bottleneck( shortcut_blob, 'shortcut_projection_%d_spatbn' % self.comp_count, output_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) @@ -144,6 +168,8 @@ def add_bottleneck( # Keep track of number of high level components if this ResNetBuilder self.comp_count += 1 + return output_filters + def add_simple_block( self, input_filters, @@ -205,28 +231,102 @@ def add_simple_block( self.comp_count += 1 +def create_resnet_32x32( + model, data, num_input_channels, num_groups, num_labels, is_test=False +): + ''' + Create residual net for smaller images (sec 4.2 of He et. al (2015)) + num_groups = 'n' in the paper + ''' + # conv1 + maxpool + brew.conv( + model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 + ) + brew.spatial_bn( + model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test + ) + brew.relu(model, 'conv1_spatbn', 'relu1') + + # Number of blocks as described in sec 4.2 + filters = [16, 32, 64] + + builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) + prev_filters = 16 + for groupidx in range(0, 3): + for blockidx in range(0, 2 * num_groups): + builder.add_simple_block( + prev_filters if blockidx == 0 else filters[groupidx], + filters[groupidx], + down_sampling=(True if blockidx == 0 and + groupidx > 0 else False)) + prev_filters = filters[groupidx] + + # Final layers + brew.average_pool( + model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + ) + brew.fc(model, 'final_avg', 'last_out', 64, num_labels) + softmax = brew.softmax(model, 'last_out', 'softmax') + return softmax + + +RESNEXT_BLOCK_CONFIG = { + 18: (2, 2, 2, 2), + 34: (3, 4, 6, 3), + 50: (3, 4, 6, 3), + 101: (3, 4, 23, 3), + 152: (3, 8, 36, 3), + 200: (3, 24, 36, 3), +} + +RESNEXT_STRIDES = [1, 2, 2, 2] + +logging.basicConfig() +log = logging.getLogger("resnext_builder") +log.setLevel(logging.DEBUG) + + # The conv1 and final_avg kernel/stride args provide a basic mechanism for # adapting resnet50 for different sizes of input images. 
-def create_resnet50( +def create_resnext( model, data, num_input_channels, num_labels, + num_layers, + num_groups, + num_width_per_group, label=None, is_test=False, no_loss=False, - no_bias=0, + no_bias=1, conv1_kernel=7, conv1_stride=2, final_avg_kernel=7, + log=None, + bn_epsilon=1e-5, + bn_momentum=0.9, ): + if num_layers not in RESNEXT_BLOCK_CONFIG: + log.error("{}-layer is invalid for resnext config".format(num_layers)) + + num_blocks = RESNEXT_BLOCK_CONFIG[num_layers] + strides = RESNEXT_STRIDES + num_filters = [64, 256, 512, 1024, 2048] + + if num_layers in [18, 34]: + num_filters = [64, 64, 128, 256, 512] + + # the number of features before the last FC layer + num_features = num_filters[-1] + # conv1 + maxpool - brew.conv( + conv_blob = brew.conv( model, data, 'conv1', num_input_channels, - 64, + num_filters[0], weight_init=("MSRAFill", {}), kernel=conv1_kernel, stride=conv1_stride, @@ -234,41 +334,40 @@ def create_resnet50( no_bias=no_bias ) - brew.spatial_bn( + bn_blob = brew.spatial_bn( model, - 'conv1', + conv_blob, 'conv1_spatbn_relu', - 64, - epsilon=1e-3, - momentum=0.1, + num_filters[0], + epsilon=bn_epsilon, + momentum=bn_momentum, is_test=is_test ) - brew.relu(model, 'conv1_spatbn_relu', 'conv1_spatbn_relu') - brew.max_pool(model, 'conv1_spatbn_relu', 'pool1', kernel=3, stride=2) + relu_blob = brew.relu(model, bn_blob, bn_blob) + max_pool = brew.max_pool(model, relu_blob, 'pool1', kernel=3, stride=2, pad=1) # Residual blocks... - builder = ResNetBuilder(model, 'pool1', no_bias=no_bias, - is_test=is_test, spatial_bn_mom=0.1) - - # conv2_x (ref Table 1 in He et al. (2015)) - builder.add_bottleneck(64, 64, 256) - builder.add_bottleneck(256, 64, 256) - builder.add_bottleneck(256, 64, 256) - - # conv3_x - builder.add_bottleneck(256, 128, 512, down_sampling=True) - for _ in range(1, 4): - builder.add_bottleneck(512, 128, 512) - - # conv4_x - builder.add_bottleneck(512, 256, 1024, down_sampling=True) - for _ in range(1, 6): - builder.add_bottleneck(1024, 256, 1024) + builder = ResNetBuilder(model, max_pool, no_bias=no_bias, + is_test=is_test, bn_epsilon=1e-5, bn_momentum=0.9) + + inner_dim = num_groups * num_width_per_group + + # 4 different kinds of residual blocks + for residual_idx in range(4): + residual_num = num_blocks[residual_idx] + residual_stride = strides[residual_idx] + dim_in = num_filters[residual_idx] + + for blk_idx in range(residual_num): + dim_in = builder.add_bottleneck( + dim_in, + inner_dim, + num_filters[residual_idx + 1], # dim out + stride=residual_stride if blk_idx == 0 else 1, + group=num_groups, + ) - # conv5_x - builder.add_bottleneck(1024, 512, 2048, down_sampling=True) - builder.add_bottleneck(2048, 512, 2048) - builder.add_bottleneck(2048, 512, 2048) + inner_dim *= 2 # Final layers final_avg = brew.average_pool( @@ -282,7 +381,7 @@ def create_resnet50( # Final dimension of the "image" is reduced to 7x7 last_out = brew.fc( - model, final_avg, 'last_out_L{}'.format(num_labels), 2048, num_labels + model, final_avg, 'last_out_L{}'.format(num_labels), num_features, num_labels ) if no_loss: @@ -301,40 +400,35 @@ def create_resnet50( return brew.softmax(model, last_out, "softmax") -def create_resnet_32x32( - model, data, num_input_channels, num_groups, num_labels, is_test=False +# The conv1 and final_avg kernel/stride args provide a basic mechanism for +# adapting resnet50 for different sizes of input images. 
+def create_resnet50( + model, + data, + num_input_channels, + num_labels, + label=None, + is_test=False, + no_loss=False, + no_bias=0, + conv1_kernel=7, + conv1_stride=2, + final_avg_kernel=7, ): - ''' - Create residual net for smaller images (sec 4.2 of He et. al (2015)) - num_groups = 'n' in the paper - ''' - # conv1 + maxpool - brew.conv( - model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 - ) - brew.spatial_bn( - model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test - ) - brew.relu(model, 'conv1_spatbn', 'relu1') - - # Number of blocks as described in sec 4.2 - filters = [16, 32, 64] - - builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) - prev_filters = 16 - for groupidx in range(0, 3): - for blockidx in range(0, 2 * num_groups): - builder.add_simple_block( - prev_filters if blockidx == 0 else filters[groupidx], - filters[groupidx], - down_sampling=(True if blockidx == 0 and - groupidx > 0 else False)) - prev_filters = filters[groupidx] - - # Final layers - brew.average_pool( - model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + # resnet50 is a special case for ResNeXt50-1x64d + return create_resnext( + model, + data, + num_input_channels, + num_labels, + num_layers=50, + num_groups=1, + num_width_per_group=64, + label=label, + is_test=is_test, + no_loss=no_loss, + no_bias=no_bias, + conv1_kernel=conv1_kernel, + conv1_stride=conv1_stride, + final_avg_kernel=final_avg_kernel, ) - brew.fc(model, 'final_avg', 'last_out', 64, num_labels) - softmax = brew.softmax(model, 'last_out', 'softmax') - return softmax From 316c1679408f99d97d516a8e3b5d7964bd4e78ea Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 12 Sep 2018 16:10:24 -0700 Subject: [PATCH 096/237] Add checking of nullptrs in GetTensorInfo (#11587) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11587 To help debug the issue in T33295362, we add some checks in the function. Possible crashing site in `GetTensorInfo` 1. tc is nullptr, which is checked. 2. tc->capacity_nbytes() hits nullptr, this is unlikely because storage is not a pointer and compute of capacity_nbytes doesn't involve pointers. It's numel * itermsize(). 3. tc->ExtractDeviceOption hits nullpt. One possibility raw_data() is nullptr because tc->ExtractDeviceOption will use that. This is checked. 4. Tensor itself which is not a reference. This is also checked. 
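
Concretely, the guards added below boil down to a few defensive assertions around those sites:

```cpp
// In GetTensorInfo (tensor.cc): guard the tensor pointer, its impl, and its storage.
const Tensor* tc = static_cast<const Tensor*>(c);
CHECK(tc);
CHECK(tc->unsafeGetTensorImpl());
CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImp());

// In CUDAStaticContext::ExtractDeviceOption (context_gpu.h): guard the raw data pointer.
CAFFE_ENFORCE(data, "data cannot be nullptr");
```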
Reviewed By: salexspb Differential Revision: D9793484 fbshipit-source-id: 3fc72746fc310a23ae45553bbe0d269a4b9edb72 --- caffe2/core/context_gpu.h | 1 + caffe2/core/storage.h | 5 +++++ caffe2/core/tensor.cc | 3 +++ 3 files changed, 9 insertions(+) diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 3090ca57aedc31..cca1b8be6500a8 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -403,6 +403,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { } void ExtractDeviceOption(DeviceOption* device, const void* data) override { + CAFFE_ENFORCE(data, "data cannot be nullptr"); device->set_device_type(TypeToProto(GetDeviceType())); device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 974985625df6b1..12bff9af69ad03 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -214,6 +214,11 @@ class CAFFE2_API Storage { storage_impl_->reset(); } + // For debugging purpose only, please don't call it + StorageImpl* unsafeGetStorageImp() const { + return storage_impl_.get(); + } + template inline bool IsType() const { return storage_impl_->IsType(); diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 1659e6ba252bab..a24f8cb67b7347 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -94,6 +94,9 @@ vector GetTensorInfo( size_t* capacity, DeviceOption* device) { const Tensor* tc = static_cast(c); + CHECK(tc); + CHECK(tc->unsafeGetTensorImpl()); + CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImp()); *capacity = tc->capacity_nbytes(); tc->ExtractDeviceOption(device); return tc->dims(); From 12efef166aff92742a0816c6f62dca225a5a4c2e Mon Sep 17 00:00:00 2001 From: Eli Amesefe Date: Wed, 12 Sep 2018 16:18:10 -0700 Subject: [PATCH 097/237] Split out copy_op from utility_ops (#11470) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11470 In order to reduce build sizes, we are identifying files that can be split up into smaller units, allowing us to only include the ops we need. Reviewed By: orionr, ajtulloch Differential Revision: D9725819 fbshipit-source-id: def1074a33dffe99bd6a7e6e48aa9e5be3d04a6a --- caffe2/operators/copy_op.cc | 198 ++++++++++++++++++++++++++++++++ caffe2/operators/copy_op.cu | 48 ++++++++ caffe2/operators/copy_op.h | 38 ++++++ caffe2/operators/utility_ops.cc | 191 ------------------------------ caffe2/operators/utility_ops.cu | 43 ------- caffe2/operators/utility_ops.h | 27 ----- 6 files changed, 284 insertions(+), 261 deletions(-) create mode 100644 caffe2/operators/copy_op.cc create mode 100644 caffe2/operators/copy_op.cu create mode 100644 caffe2/operators/copy_op.h diff --git a/caffe2/operators/copy_op.cc b/caffe2/operators/copy_op.cc new file mode 100644 index 00000000000000..582e31475780bf --- /dev/null +++ b/caffe2/operators/copy_op.cc @@ -0,0 +1,198 @@ +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +// From CPU, copy it to whatever the current context +REGISTER_CPU_OPERATOR( + CopyFromCPUInput, + CopyOp); +REGISTER_CPU_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +REGISTER_CPU_OPERATOR(Copy, CopyOp); + +OPERATOR_SCHEMA(Copy) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .SetDoc(R"DOC( +Copy input tensor into output, potentially across devices. + +Github Links: + +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.cc +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.h + + +
+ + Example + +**Code** + +``` + +workspace.ResetWorkspace() + +op = core.CreateOperator( + "Copy", + ["input"], + ["output"] +) + +workspace.FeedBlob("input", np.random.rand(3,3)) +print("input:", workspace.FetchBlob("input")) +workspace.RunOperatorOnce(op) +print("output:", workspace.FetchBlob("output")) + +``` + +**Result** + +``` + +input: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] +output: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] + +``` + +
+ +)DOC") + .Input(0, "input", "(*Tensor*): input tensor to copy") + .Output(0, "output", "(*Tensor*): copy of input tensor"); + +OPERATOR_SCHEMA(CopyGPUToCPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyGPUToCPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cuda_option); + vector out_dev(def.output_size(), cpu_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for GPU to CPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyCPUToGPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyCPUToGPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), cuda_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for CPU to GPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyFromCPUInput) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + auto op_device = + def.has_device_option() ? def.device_option() : DeviceOption(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), op_device); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Take a CPU input tensor and copy it to an output in the current +Context (GPU or CPU). This may involves cross-device MemCpy. 
+)DOC") + .Input(0, "input", "The input CPU tensor.") + .Output(0, "output", "either a TensorCUDA or a TensorCPU"); + +OPERATOR_SCHEMA(CopyOnDeviceLike) + .NumInputs(2) + .NumOutputs(1) + .SetDoc("Copy input tensor into output to the specific device.") + .Input(0, "input", "The input tensor.") + .Input(1, "dst", "Tensor, on which device the copy will be performed.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +struct GetCopyGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + return SingleGradientDef( + "CopyOnDeviceLike", + "", + vector{GO(0), I(0)}, + vector{GI(0)}); + } +}; +REGISTER_GRADIENT(Copy, GetCopyGradient); + +struct GetGPUToCPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); + +struct GetCPUToGPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.cu b/caffe2/operators/copy_op.cu new file mode 100644 index 00000000000000..e833e720e556f3 --- /dev/null +++ b/caffe2/operators/copy_op.cu @@ -0,0 +1,48 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +template <> +class CopyOnDeviceLikeOp + : public Operator { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + USE_OPERATOR_FUNCTIONS(CUDAContext); + + bool RunOnDevice() override { + auto& input = Input(0); + auto* output = OperatorBase::Output(0, CUDA); + CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); + output->ResizeLike(input); + context.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +// From CPU, copy it to whatever the current context +REGISTER_CUDA_OPERATOR( + CopyFromCPUInput, + CopyOp); + +// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, +// since gpu code will be involved. +REGISTER_CUDA_OPERATOR( + CopyGPUToCPU, + CopyOp); +REGISTER_CUDA_OPERATOR( + CopyCPUToGPU, + CopyOp); +// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe +// involving different GPUs. 
+REGISTER_CUDA_OPERATOR(Copy, CopyOp); + +REGISTER_CUDA_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.h b/caffe2/operators/copy_op.h new file mode 100644 index 00000000000000..11e8e15fbcf005 --- /dev/null +++ b/caffe2/operators/copy_op.h @@ -0,0 +1,38 @@ +#ifndef CAFFE2_OPERATORS_COPY_OP_H_ +#define CAFFE2_OPERATORS_COPY_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class CopyOp : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + USE_SIMPLE_CTOR_DTOR(CopyOp) + + bool RunOnDevice() override { + auto& input = this->template Input(0, SrcContext::GetDeviceType()); + auto* output = + this->template Output(0, DstContext::GetDeviceType()); + output->ResizeLike(input); + this->context_.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +template +class CopyOnDeviceLikeOp : public CopyOp { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : CopyOp(operator_def, ws) {} +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_COPY_OP_H_ diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 6c287c37f3d6c9..b370b5ecb438cc 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -52,14 +52,6 @@ REGISTER_CPU_OPERATOR( ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); -// From CPU, copy it to whatever the current context -REGISTER_CPU_OPERATOR( - CopyFromCPUInput, - CopyOp); -REGISTER_CPU_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); -REGISTER_CPU_OPERATOR(Copy, CopyOp); REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp); @@ -379,133 +371,6 @@ Currently only works on CPU because of access to INDICES. "Update slices, with shape len(INDICES) + shape(X_0)[1:]") .Output(0, "DATA", "Has to be exactly the same tensor as the input 0"); -OPERATOR_SCHEMA(Copy) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .SetDoc(R"DOC( -Copy input tensor into output, potentially across devices. - -Github Links: - -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.cc -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.h - - -
- - Example - -**Code** - -``` - -workspace.ResetWorkspace() - -op = core.CreateOperator( - "Copy", - ["input"], - ["output"] -) - -workspace.FeedBlob("input", np.random.rand(3,3)) -print("input:", workspace.FetchBlob("input")) -workspace.RunOperatorOnce(op) -print("output:", workspace.FetchBlob("output")) - -``` - -**Result** - -``` - -input: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] -output: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] - -``` - -
- -)DOC") - .Input(0, "input", "(*Tensor*): input tensor to copy") - .Output(0, "output", "(*Tensor*): copy of input tensor"); - -OPERATOR_SCHEMA(CopyGPUToCPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyGPUToCPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cuda_option); - vector out_dev(def.output_size(), cpu_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for GPU to CPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyCPUToGPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyCPUToGPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), cuda_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for CPU to GPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyFromCPUInput) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - auto op_device = - def.has_device_option() ? def.device_option() : DeviceOption(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), op_device); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Take a CPU input tensor and copy it to an output in the current -Context (GPU or CPU). This may involves cross-device MemCpy. 
-)DOC") - .Input(0, "input", "The input CPU tensor.") - .Output(0, "output", "either a TensorCUDA or a TensorCPU"); - -OPERATOR_SCHEMA(CopyOnDeviceLike) - .NumInputs(2) - .NumOutputs(1) - .SetDoc("Copy input tensor into output to the specific device.") - .Input(0, "input", "The input tensor.") - .Input(1, "dst", "Tensor, on which device the copy will be performed.") - .Output(0, "output", "Tensor that will contain a copy of the input."); OPERATOR_SCHEMA(HasElements) .NumInputs(1) @@ -937,62 +802,6 @@ struct GetFlattenToVecGradient : public GradientMakerBase { }; REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient); -struct GetCopyGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "CopyOnDeviceLike", - "", - vector{GO(0), I(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(Copy, GetCopyGradient); - -struct GetGPUToCPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); - -struct GetCPUToGPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); - SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds); SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths); SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges); diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index e771c4ee36e1cc..8272bbcad3e55c 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -36,28 +36,6 @@ bool SumOp::RunOnDevice() { return false; } -template <> -class CopyOnDeviceLikeOp - : public Operator { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_FUNCTIONS(CUDAContext); - - bool RunOnDevice() override { - auto& input = Input(0); - auto* output = OperatorBase::Output(0, CUDA); - CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); - output->ResizeLike(input); - context.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - REGISTER_CUDA_OPERATOR(Print, PrintOp); REGISTER_CUDA_OPERATOR(Flatten, FlattenOp); REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp); @@ -66,27 +44,6 @@ REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CUDA_OPERATOR(Sum, SumOp); REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp); -// From CPU, copy it to whatever the current context -REGISTER_CUDA_OPERATOR( - CopyFromCPUInput, - CopyOp); - -// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, -// since gpu code will be involved. 
-REGISTER_CUDA_OPERATOR( - CopyGPUToCPU, - CopyOp); -REGISTER_CUDA_OPERATOR( - CopyCPUToGPU, - CopyOp); -// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe -// involving different GPUs. -REGISTER_CUDA_OPERATOR(Copy, CopyOp); - -REGISTER_CUDA_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); - REGISTER_CUDA_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp); CAFFE_KNOWN_TYPE(const float*); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index c661ec2f5c55ec..9a615f53e25394 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -698,33 +698,6 @@ class ScatterAssignOp : public Operator { INPUT_TAGS(DATA, INDICES, SLICES); }; -template -class CopyOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(CopyOp); - - bool RunOnDevice() override { - auto& input = this->template Input(0, SrcContext::GetDeviceType()); - auto* output = - this->template Output(0, DstContext::GetDeviceType()); - output->ResizeLike(input); - this->context_.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - -template -class CopyOnDeviceLikeOp : public CopyOp { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : CopyOp(operator_def, ws) {} -}; - template class LengthsToSegmentIdsOp : public Operator { public: From 130d55a5f4470690885668bbe1df10a2183638ac Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 16:40:30 -0700 Subject: [PATCH 098/237] Allow building the C++ API without cereal (#11498) Summary: I am working on unifying the C++ extensions and C++ API, and one constraint for this is that we will want to be able to build the C++ API without cereal, since we won't want to ship it with the Python `torch` package. For this I introduce a `TORCH_WITH_CEREAL` option to CMake. If on, the C++ API will be built with cereal and thus serialization support. If off, serialization functions will throw exceptions, but the library will otherwise still compile the same. __This option is on by default, so for regular C++ API users nothing will change__. However, from C++ extensions, we'll be able to turn it off. This effectively means we won't be searching for any cereal headers from C++ API headers, which wouldn't be installed in the Python package. ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11498 Differential Revision: D9784803 Pulled By: goldsborough fbshipit-source-id: 5d0a1f2501993012d28cf3d730f45932b483abc4 --- CMakeLists.txt | 1 + cmake/Summary.cmake | 1 + tools/build_libtorch.py | 13 +++-- tools/build_pytorch_libs.sh | 5 ++ torch/CMakeLists.txt | 24 +++++--- torch/csrc/api/include/torch/optim/adagrad.h | 15 ++++- torch/csrc/api/include/torch/optim/adam.h | 16 +++-- torch/csrc/api/include/torch/optim/lbfgs.h | 9 +-- torch/csrc/api/include/torch/optim/rmsprop.h | 16 +++-- torch/csrc/api/include/torch/optim/sgd.h | 15 ++++- torch/csrc/api/include/torch/serialization.h | 61 +++++++++++--------- 11 files changed, 116 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60d69ef2d9bc68..827121b1fc5931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,6 +124,7 @@ cmake_dependent_option( cmake_dependent_option( USE_GLOO_IBVERBS "Use Gloo IB verbs for distributed. Only available if USE_GLOO is on." 
OFF "USE_GLOO" OFF) +option(TORCH_USE_CEREAL "Build the C++ API with Cereal for serialization support" OFF) # Used when building Caffe2 through setup.py option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 16d18ac7634d0d..a314e4d348b708 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -125,6 +125,7 @@ function (caffe2_print_configuration_summary) message(STATUS " USE_GLOO : ${USE_GLOO}") message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}") endif() + message(STATUS " TORCH_USE_CEREAL : ${TORCH_USE_CEREAL}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index 8428282ab7b9f8..df29875666063e 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -9,7 +9,8 @@ if __name__ == '__main__': # Placeholder for future interface. For now just gives a nice -h. parser = argparse.ArgumentParser(description='Build libtorch') - args = parser.parse_args() + parser.add_argument('--use-cereal', action='store_true') + options = parser.parse_args() os.environ['BUILD_TORCH'] = 'ON' os.environ['BUILD_TEST'] = 'ON' @@ -19,11 +20,13 @@ tools_path = os.path.dirname(os.path.abspath(__file__)) build_pytorch_libs = os.path.join(tools_path, 'build_pytorch_libs.sh') - command = '{} --use-nnpack '.format(build_pytorch_libs) + command = [build_pytorch_libs, '--use-nnpack'] if USE_CUDA: - command += '--use-cuda ' - command += 'caffe2' + command.append('--use-cuda') + if options.use_cereal: + command.append('--use-cereal') + command.append('caffe2') sys.stdout.flush() sys.stderr.flush() - subprocess.check_call(shlex.split(command), universal_newlines=True) + subprocess.check_call(command, universal_newlines=True) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 66c8e6d65cf0b0..79dfc948ea07b2 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -22,6 +22,7 @@ USE_NNPACK=0 USE_MKLDNN=0 USE_GLOO_IBVERBS=0 CAFFE2_STATIC_LINK_CUDA=0 +TORCH_USE_CEREAL=0 RERUN_CMAKE=1 while [[ $# -gt 0 ]]; do case "$1" in @@ -46,6 +47,9 @@ while [[ $# -gt 0 ]]; do --cuda-static-link) CAFFE2_STATIC_LINK_CUDA=1 ;; + --use-cereal) + TORCH_USE_CEREAL=1 + ;; *) break ;; @@ -190,6 +194,7 @@ function build() { -DTHCUNN_SO_VERSION=1 \ -DTHD_SO_VERSION=1 \ -DUSE_CUDA=$USE_CUDA \ + -DTORCH_USE_CEREAL=$TORCH_USE_CEREAL \ -DBUILD_EXAMPLES=OFF \ -DBUILD_TEST=$BUILD_TEST \ -DNO_NNPACK=$((1-$USE_NNPACK)) \ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b0f07a7ef3a73b..427270d2a89fa1 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -211,7 +211,6 @@ if (NOT NO_API AND NOT USE_ROCM) ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp ) - endif() @@ -302,6 +301,13 @@ if (NOT NO_API AND NOT USE_ROCM) target_include_directories(torch PUBLIC ${TORCH_SRC_DIR}/csrc/api ${TORCH_SRC_DIR}/csrc/api/include) + + if (TORCH_USE_CEREAL) + target_compile_definitions(torch PUBLIC TORCH_USE_CEREAL) + # SYSTEM headers are included with -isystem and thus do not trigger warnings. 
+ target_include_directories(torch SYSTEM PUBLIC + "${TORCH_ROOT}/third_party/cereal/include") # For cereal/ + endif() endif() if(USE_CUDA) @@ -369,10 +375,6 @@ target_include_directories(torch PRIVATE ${ATen_CPU_INCLUDE}) target_include_directories(torch PUBLIC ${TORCH_SRC_DIR}/csrc) -# SYSTEM headers are included with -isystem and thus do not trigger warnings. -target_include_directories(torch SYSTEM PUBLIC - "${TORCH_ROOT}/third_party/cereal/include") # For cereal/ - set_target_properties(torch PROPERTIES VERSION 1 SOVERSION 1) if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") @@ -407,7 +409,7 @@ endif() if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") - add_executable(test_api + set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/any.cpp ${TORCH_API_TEST_DIR}/cursor.cpp ${TORCH_API_TEST_DIR}/integration.cpp @@ -419,15 +421,19 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) ${TORCH_API_TEST_DIR}/parallel.cpp ${TORCH_API_TEST_DIR}/rnn.cpp ${TORCH_API_TEST_DIR}/sequential.cpp - ${TORCH_API_TEST_DIR}/serialization.cpp ${TORCH_API_TEST_DIR}/static.cpp ${TORCH_API_TEST_DIR}/tensor_cuda.cpp ${TORCH_API_TEST_DIR}/tensor.cpp ${TORCH_API_TEST_DIR}/jit.cpp - # Temporary until ATen tests are built with Caffe2 ${TORCH_API_TEST_DIR}/tensor_options.cpp ${TORCH_API_TEST_DIR}/tensor_options_cuda.cpp - ) + ) + + if (TORCH_USE_CEREAL) + list(APPEND TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/serialization.cpp) + endif() + + add_executable(test_api ${TORCH_API_TEST_SOURCES}) target_include_directories(test_api PUBLIC diff --git a/torch/csrc/api/include/torch/optim/adagrad.h b/torch/csrc/api/include/torch/optim/adagrad.h index ced6a68a83f53b..203ce24a33cb14 100644 --- a/torch/csrc/api/include/torch/optim/adagrad.h +++ b/torch/csrc/api/include/torch/optim/adagrad.h @@ -2,13 +2,11 @@ #include #include +#include #include #include -#include -#include - #include #include @@ -37,12 +35,16 @@ class Adagrad : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(sum_)); ar(CEREAL_NVP(step_)); +#endif // defined(TORCH_USE_CEREAL) } private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) Adagrad() : options(0) {} std::vector sum_; @@ -50,3 +52,10 @@ class Adagrad : public Optimizer { }; } // namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::Adagrad); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::Adagrad); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/optim/adam.h b/torch/csrc/api/include/torch/optim/adam.h index 87d00e6a6676a7..514715b6b0c181 100644 --- a/torch/csrc/api/include/torch/optim/adam.h +++ b/torch/csrc/api/include/torch/optim/adam.h @@ -3,12 +3,10 @@ #include #include #include +#include #include -#include -#include - #include #include @@ -36,16 +34,20 @@ class Adam : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(step_buffers_), CEREAL_NVP(exp_average_buffers_), CEREAL_NVP(exp_average_sq_buffers_), CEREAL_NVP(max_exp_average_sq_buffers_)); +#endif // defined(TORCH_USE_CEREAL) } AdamOptions options; private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) Adam() : options(0) {} std::vector step_buffers_; @@ -53,6 +55,12 @@ class Adam : public Optimizer { std::vector exp_average_sq_buffers_; std::vector max_exp_average_sq_buffers_; }; - } 
// namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::Adam); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::Adam); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h index 9ea8065f000872..eec8c8fdc415d0 100644 --- a/torch/csrc/api/include/torch/optim/lbfgs.h +++ b/torch/csrc/api/include/torch/optim/lbfgs.h @@ -2,12 +2,10 @@ #include #include +#include #include -#include -#include - #include #include #include @@ -41,6 +39,7 @@ class LBFGS : public LossClosureOptimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(d)); ar(CEREAL_NVP(t)); ar(CEREAL_NVP(H_diag)); @@ -48,10 +47,13 @@ class LBFGS : public LossClosureOptimizer { ar(CEREAL_NVP(prev_loss)); ar(CEREAL_NVP(old_dirs)); ar(CEREAL_NVP(old_stps)); +#endif // defined(TORCH_USE_CEREAL) } private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) LBFGS() : options(0) {} Tensor gather_flat_grad(); @@ -69,6 +71,5 @@ class LBFGS : public LossClosureOptimizer { int64_t func_evals{0}; int64_t state_n_iter{0}; }; - } // namespace optim } // namespace torch diff --git a/torch/csrc/api/include/torch/optim/rmsprop.h b/torch/csrc/api/include/torch/optim/rmsprop.h index f7f1ddb96a0971..4a84331f92810c 100644 --- a/torch/csrc/api/include/torch/optim/rmsprop.h +++ b/torch/csrc/api/include/torch/optim/rmsprop.h @@ -2,12 +2,10 @@ #include #include +#include #include -#include -#include - #include #include #include @@ -41,19 +39,29 @@ class RMSprop : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(square_average_buffers_)); ar(CEREAL_NVP(momentum_buffers_)); ar(CEREAL_NVP(grad_average_buffers_)); +#endif // defined(TORCH_USE_CEREAL) } private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) RMSprop() : options(0) {} std::vector square_average_buffers_; std::vector momentum_buffers_; std::vector grad_average_buffers_; }; - } // namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::RMSprop); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::RMSprop); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/optim/sgd.h b/torch/csrc/api/include/torch/optim/sgd.h index 8dc283b8b68da8..345d0343c1ba85 100644 --- a/torch/csrc/api/include/torch/optim/sgd.h +++ b/torch/csrc/api/include/torch/optim/sgd.h @@ -3,13 +3,11 @@ #include #include #include +#include #include #include -#include -#include - #include #include #include @@ -37,13 +35,17 @@ class SGD : public Optimizer { template void serialize(Archive& ar) { +#if defined(TORCH_USE_CEREAL) ar(CEREAL_NVP(momentum_buffers_)); +#endif // defined(TORCH_USE_CEREAL) } SGDOptions options; private: +#if defined(TORCH_USE_CEREAL) friend class cereal::access; +#endif // defined(TORCH_USE_CEREAL) SGD() : options(0) {} std::vector momentum_buffers_; @@ -52,3 +54,10 @@ class SGD : public Optimizer { }; } // namespace optim } // namespace torch + +#if defined(TORCH_USE_CEREAL) +CEREAL_REGISTER_TYPE(torch::optim::SGD); +CEREAL_REGISTER_POLYMORPHIC_RELATION( + torch::optim::Optimizer, + torch::optim::SGD); +#endif // defined(TORCH_USE_CEREAL) diff --git a/torch/csrc/api/include/torch/serialization.h b/torch/csrc/api/include/torch/serialization.h index 
1f6be330090d31..a85c027ce5917a 100644 --- a/torch/csrc/api/include/torch/serialization.h +++ b/torch/csrc/api/include/torch/serialization.h @@ -2,45 +2,69 @@ #include -#include #include #include +#if defined(TORCH_USE_CEREAL) +#include +#include +#include + #include "cereal/archives/binary.hpp" -#include "cereal/types/polymorphic.hpp" #include "cereal/types/string.hpp" #include "cereal/types/unordered_map.hpp" #include "cereal/types/vector.hpp" +#endif // defined(TORCH_USE_CEREAL) namespace torch { - // Some convenience functions for saving and loading template void save(std::ostream& stream, T const& obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryOutputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void load(std::istream& stream, T& obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryInputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void save(std::ostream& stream, T const* obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryOutputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void load(std::istream& stream, T* obj) { +#if defined(TORCH_USE_CEREAL) cereal::BinaryInputArchive archive(stream); archive(*obj); +#else + AT_ERROR("PyTorch compiled without serialization support"); +#endif } + template void save(std::string const& path, T const& obj) { std::ofstream os(path, std::ios::binary); torch::save(os, obj); } + template void load(std::string const& path, T& obj) { std::ifstream is(path, std::ios::binary); @@ -74,8 +98,7 @@ inline int32_t scalarTypeId(torch::Dtype type) { case torch::Dtype::Undefined: return 8; default: - throw std::runtime_error( - "Unknown scalar type: " + std::to_string(static_cast(type))); + AT_ERROR("Unknown scalar type: ", static_cast(type)); } } @@ -100,7 +123,7 @@ inline torch::Dtype scalarTypeFromId(int32_t id) { case 8: return torch::Dtype::Undefined; default: - throw std::runtime_error("Unknown scalar type id: " + std::to_string(id)); + AT_ERROR("Unknown scalar type id: ", id); } } @@ -117,8 +140,7 @@ inline int32_t backendId(at::Backend backend) { case at::Backend::Undefined: return 4; default: - throw std::runtime_error( - "Unknown backend: " + std::to_string(static_cast(backend))); + AT_ERROR("Unknown backend: ", static_cast(backend)); } } @@ -135,33 +157,15 @@ inline at::Backend backendFromId(int32_t id) { case 4: return at::Backend::Undefined; default: - throw std::runtime_error("Unknown backend id: " + std::to_string(id)); + AT_ERROR("Unknown backend id: ", id); } } } // namespace detail } // namespace torch -// This is super ugly and I don't know how to simplify it -CEREAL_REGISTER_TYPE(torch::optim::SGD); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::SGD); -CEREAL_REGISTER_TYPE(torch::optim::Adagrad); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::Adagrad); -CEREAL_REGISTER_TYPE(torch::optim::RMSprop); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::RMSprop); -CEREAL_REGISTER_TYPE(torch::optim::Adam); -CEREAL_REGISTER_POLYMORPHIC_RELATION( - torch::optim::Optimizer, - torch::optim::Adam); - +#if defined(TORCH_USE_CEREAL) namespace cereal { - namespace agimpl { template @@ -269,3 +273,4 @@ void load(Archive& archive, torch::Tensor& tensor) { } } } // namespace cereal +#endif // 
defined(TORCH_USE_CEREAL) From 5b2efcf4258a7a80fcfd59e2aa54306c4da40568 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Wed, 12 Sep 2018 16:41:58 -0700 Subject: [PATCH 099/237] Document the Conv module (#11566) Summary: Document the C++ API conv module. No code changes. ebetica ezyang soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11566 Differential Revision: D9793665 Pulled By: goldsborough fbshipit-source-id: 5f7f0605f952fadc62ffbcb8eca4183d4142c451 --- .../csrc/api/include/torch/nn/modules/conv.h | 81 ++++++++++++++++++- torch/csrc/api/include/torch/nn/modules/rnn.h | 12 +-- torch/csrc/api/src/nn/modules/conv.cpp | 3 +- 3 files changed, 88 insertions(+), 8 deletions(-) diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 3f562eab1c5a52..c6e6a2392dbe36 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -10,6 +10,8 @@ namespace torch { namespace nn { + +/// Options for a `D`-dimensional convolution module. template struct ConvOptions { ConvOptions( @@ -17,18 +19,59 @@ struct ConvOptions { int64_t output_channels, ExpandingArray kernel_size); + /// The number of channels the input volumes will have. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(int64_t, input_channels); + + /// The number of output channels the convolution should produce. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(int64_t, output_channels); + + /// The kernel size to use. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, kernel_size); + + /// The stride of the convolution. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, stride) = 1; + + /// The padding to add to the input volumes. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, padding) = 0; + + /// The kernel dilation. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, dilation) = 1; + + /// For transpose convolutions, the padding to add to output volumes. + /// For a `D`-dim convolution, must be a single number or a list of `D` + /// numbers. + /// This parameter __can__ be changed after construction. TORCH_ARG(ExpandingArray, output_padding) = 0; + + /// If true, convolutions will be transpose convolutions (a.k.a. + /// deconvolutions). + /// Changing this parameter after construction __has no effect__. TORCH_ARG(bool, transposed) = false; + + /// Whether to add a bias after individual applications of the kernel. + /// Changing this parameter after construction __has no effect__. TORCH_ARG(bool, with_bias) = true; + + /// The number of convolution groups. + /// This parameter __can__ be changed after construction. TORCH_ARG(int64_t, groups) = 1; }; +/// Base class for all (dimension-specialized) convolution modules. 
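A minimal usage sketch of the options documented above, not part of the patch itself: Conv2dOptions takes the three required arguments, and the remaining parameters are set through the chained setters that TORCH_ARG generates. The channel counts, kernel size, and input shape below are made-up values for illustration.

    // Illustrative only: a 3->16 channel, 3x3 convolution with stride 2,
    // padding 1, and no bias, using the options documented in this header.
    #include <torch/torch.h>

    int main() {
      torch::nn::Conv2d conv(
          torch::nn::Conv2dOptions(3, 16, 3).stride(2).padding(1).with_bias(false));
      torch::Tensor input = torch::randn({8, 3, 64, 64});
      torch::Tensor output = conv->forward(input);
      // With a 3x3 kernel, stride 2, and padding 1, each 64x64 plane maps to 32x32.
      return 0;
    }

The same builder pattern applies to Conv1dOptions and Conv3dOptions, where the ExpandingArray parameters accept either a single number or a list of 1 or 3 numbers.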
template class ConvImpl : public torch::nn::Cloneable { public: @@ -36,44 +79,80 @@ class ConvImpl : public torch::nn::Cloneable { int64_t input_channels, int64_t output_channels, ExpandingArray kernel_size) - : ConvImpl(ConvOptions(input_channels, output_channels, kernel_size)) {} + : ConvImpl(ConvOptions(input_channels, output_channels, kernel_size)) { + } explicit ConvImpl(ConvOptions options); void reset() override; + /// The options with which this `Module` was constructed. ConvOptions options; + + /// The learned kernel (or "weight"). Tensor weight; + + /// The learned bias. Only defined if the `with_bias` option was true. Tensor bias; }; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Conv1d ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Applies convolution over a 1-D input. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv1d to learn about +/// the exact behavior of this module. class Conv1dImpl : public ConvImpl<1, Conv1dImpl> { public: using ConvImpl<1, Conv1dImpl>::ConvImpl; Tensor forward(Tensor input); }; + +/// `ConvOptions` specialized for 1-D convolution. using Conv1dOptions = ConvOptions<1>; + +/// A `ModuleHolder` subclass for `Conv1dImpl`. +/// See the documentation for `Conv1dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Conv1d); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Conv2d ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Applies convolution over a 2-D input. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv2d to learn about +/// the exact behavior of this module. class Conv2dImpl : public ConvImpl<2, Conv2dImpl> { public: using ConvImpl<2, Conv2dImpl>::ConvImpl; Tensor forward(Tensor input); }; + +/// `ConvOptions` specialized for 2-D convolution. using Conv2dOptions = ConvOptions<2>; + +/// A `ModuleHolder` subclass for `Conv2dImpl`. +/// See the documentation for `Conv2dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Conv2d); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Conv3d ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +/// Applies convolution over a 3-D input. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Conv3d to learn about +/// the exact behavior of this module. class Conv3dImpl : public ConvImpl<3, Conv3dImpl> { public: using ConvImpl<3, Conv3dImpl>::ConvImpl; Tensor forward(Tensor input); }; + +/// `ConvOptions` specialized for 3-D convolution. using Conv3dOptions = ConvOptions<3>; + +/// A `ModuleHolder` subclass for `Conv3dImpl`. +/// See the documentation for `Conv3dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Conv3d); } // namespace nn diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index 326e9e267a9f32..e287b79cacd18a 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -170,8 +170,8 @@ struct RNNOptions { }; /// A multi-layer Elman RNN module with Tanh or ReLU activation. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.RNN for more -/// documenation. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.RNN to learn about the +/// exact behavior of this module. 
class RNNImpl : public detail::RNNImplBase { public: RNNImpl(int64_t input_size, int64_t hidden_size) @@ -198,8 +198,8 @@ TORCH_MODULE(RNN); using LSTMOptions = detail::RNNOptionsBase; /// A multi-layer long-short-term-memory (LSTM) module. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.LSTM for more -/// documenation. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.LSTM to learn about the +/// exact behavior of this module. class LSTMImpl : public detail::RNNImplBase { public: LSTMImpl(int64_t input_size, int64_t hidden_size) @@ -224,8 +224,8 @@ TORCH_MODULE(LSTM); using GRUOptions = detail::RNNOptionsBase; /// A multi-layer gated recurrent unit (GRU) module. -/// See https://pytorch.org/docs/master/nn.html#torch.nn.GRU for more -/// documenation. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.GRU to learn about the +/// exact behavior of this module. class GRUImpl : public detail::RNNImplBase { public: GRUImpl(int64_t input_size, int64_t hidden_size) diff --git a/torch/csrc/api/src/nn/modules/conv.cpp b/torch/csrc/api/src/nn/modules/conv.cpp index 06ed1d10ea3170..30a25714de4277 100644 --- a/torch/csrc/api/src/nn/modules/conv.cpp +++ b/torch/csrc/api/src/nn/modules/conv.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace torch { @@ -61,7 +62,7 @@ void ConvImpl::reset() { options.input_channels_, std::multiplies{}); const auto stdv = 1.0 / std::sqrt(number_of_features); - NoGradGuard no_grad;; + NoGradGuard no_grad; for (auto& p : this->parameters()) { p->uniform_(-stdv, stdv); } From def44c96fd7bda4cc4a9574e6f8466eea6606fb4 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 16:46:27 -0700 Subject: [PATCH 100/237] Revert D9779866: [pytorch][PR] Move function deletion from the stack to the heap. Differential Revision: D9779866 Original commit changeset: 96753eead790 fbshipit-source-id: 959deeb63318d48f4c563e10e70ef6ec7fabd3b4 --- torch/csrc/autograd/function.cpp | 122 +++++++++++++++++++++---------- torch/csrc/autograd/function.h | 2 + torch/csrc/autograd/init.cpp | 5 ++ 3 files changed, 89 insertions(+), 40 deletions(-) diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index 6090b520bd36ab..ba33a6f116bac2 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -36,53 +36,95 @@ AnomalyMetadata* Function::metadata() noexcept { return anomaly_metadata_.get(); } -static void gatherFunctions(Function* func, - std::vector>& stack) { - for (auto& edge : func->next_edges()) { - if (edge.function.use_count() == 1) { - stack.emplace_back(std::move(edge.function)); - } +/* + * Fix for #5534: prevent stack overflow on deletion of deep computation graph + * + * Sometimes one can end up with a very big computation graph of Functions + * and Edges. Each std::shared_ptr contains a list of Edge, and + * each Edge contains a std::shared_ptr. Deleting a + * std::shared_ptr can trigger the recursive deletion of other + * std::shared_ptr's: this can stack overflow if the graph + * is deep enough. Here is an example of such a graph: + * + * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr + * + * The solution here is to use a custom deleter with each + * std::shared_ptr. The custom deleter keeps track of how many + * nested deleters it is in. When this number exceeds the maximum allowed + * depth, the Function* to be deleted are accumulated in a per-thread + * delete queue and handled by one of the deleters. 
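To make the scheme described in this comment concrete, here is a minimal standalone sketch of the same idea, not the reverted PyTorch code itself: a thread-local depth counter bounds how deep the custom deleter may recurse, and nodes past the limit are parked in a per-thread queue that is drained iteratively. The names Node, delete_node, and kMaxDepth are illustrative, the depth limit is deliberately tiny, and unlike the code below the drain happens in the outermost frame rather than at the maximum depth.

    #include <deque>
    #include <memory>

    struct Node;
    void delete_node(Node* n);

    // Each Node owns its successor through a shared_ptr that carries the
    // custom deleter, mirroring the Function -> Edge -> Function chains.
    struct Node {
      std::shared_ptr<Node> next;
    };

    thread_local std::deque<Node*> delete_queue;
    thread_local size_t delete_depth = 0;
    constexpr size_t kMaxDepth = 8;  // deliberately tiny for illustration

    void delete_node(Node* n) {
      ++delete_depth;
      if (delete_depth > kMaxDepth) {
        // Too deep: defer this node instead of recursing further.
        delete_queue.push_back(n);
        --delete_depth;
        return;
      }
      delete n;  // may re-enter delete_node through n->next
      if (delete_depth == 1) {
        // The outermost frame drains whatever the nested deleters deferred;
        // each pop can refill the queue, but recursion stays bounded.
        while (!delete_queue.empty()) {
          Node* deferred = delete_queue.front();
          delete_queue.pop_front();
          delete deferred;
        }
      }
      --delete_depth;
    }

    int main() {
      // Build a chain far deeper than any safe native recursion, then tear it
      // down; with a default deleter this teardown would overflow the stack.
      std::shared_ptr<Node> head(new Node(), delete_node);
      for (int i = 0; i < 200000; ++i) {
        std::shared_ptr<Node> fresh(new Node(), delete_node);
        fresh->next = head;
        head = fresh;
      }
      head.reset();
      return 0;
    }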
+ * + * Note that these custom deleters are NOT necessary for deleting PyFunction. + * This is because a THPFunction Python object owns a PyFunction that is in a + * computation graph. When Python objects get recursively destroyed, they + * are also queued into a delete list. This happens very early for them + * (at 50 deleters): https://github.com/python/cpython/blob/f320be77ffb73e3b9e7fc98c37b8df3975d84b40/Include/object.h#L1024-L1063 + * so we don't need to worry about them. + */ + +thread_local std::deque deleteFunctionQueue; +thread_local size_t deleteFunctionRecursionDepth = 0; + +/* + * If this number is set too high, a deep computation graph can still + * stack overflow. The procedure for setting this number was to + * 1) find the smallest value that would not guard against stack overflows + * on various machines + * 2) Take the minimum of all such values and subtract some leeway because + * the memory of these stack frames will probably grow as time passes. + * Testing on a few machines machines, the magic numbers were: + * - Mac OSX (Macbook Pro 15) : ~60000 + * - A beefy Ubuntu 16.04 box : ~15000 + * - Windows AWS instance (g3.4xlarge): variable. My two attempts at different + * times have gotten the following numbers: ~8300, 3669 + */ +#ifdef _WIN32 +size_t deleteFunctionMaxRecursionDepth = 3000; +#else +size_t deleteFunctionMaxRecursionDepth = 10000; +#endif + +struct RecursionDepthCounter { + public: + explicit RecursionDepthCounter() { + ++deleteFunctionRecursionDepth; + } + ~RecursionDepthCounter() { + --deleteFunctionRecursionDepth; } -} + + size_t value() { + return deleteFunctionRecursionDepth; + } +}; /* - * Fix for #5534: prevent stack overflow on deletion of deep computation graph - * - * Sometimes one can end up with a very big computation graph of Functions - * and Edges. Each std::shared_ptr contains a list of Edge, and - * each Edge contains a std::shared_ptr. Deleting a - * std::shared_ptr can trigger the recursive deletion of other - * std::shared_ptr's: this can stack overflow if the graph - * is deep enough. Here is an example of such a graph: - * - * shared_ptr -> Edge -> shared_ptr -> Edge -> ... -> shared_ptr - * - * The solution here is to detect when we are decrementing away the last - * reference to a Function, and when doing so to buffer up the Function's - * that will be recursively decremented. We can then decrement (and free) - * the original Function without causing a recursive cascade, before - * draining the buffer applying the same behavior. This is, in effect, - * converting recursion to a loop, using a heap buffer in place of the - * recursive call stack. - */ + * Note that the custom deleter deletes in BFS style. Without using + * the custom deleter, the computation graph is deleted in a DFS style. + * The BFS deletion is valid (and safe) because if a shared_ptr + * 's reference count hits 0, nothing else will access it. + */ void deleteFunction(Function* function) { - // To avoid stack overflow on large computational graphs, - // we need to track reference decrementing and freeing - // on the heap. - std::vector> stack; - gatherFunctions(function, stack); - delete function; + RecursionDepthCounter recursion_depth; - while (!stack.empty()) { - auto& curr_func = stack.back(); + if (recursion_depth.value() > deleteFunctionMaxRecursionDepth) { + deleteFunctionQueue.push_back(function); + return; + } - if (curr_func.use_count() == 1) { - // If this is the last reference, gather function references - // that will be recursively decremented. 
- gatherFunctions(curr_func.get(), stack); - } + delete function; - stack.pop_back(); + if (deleteFunctionQueue.empty()) { + return; + } + if (recursion_depth.value() != deleteFunctionMaxRecursionDepth) { + AT_ERROR("Only one deleter per thread should be able to process " + "the delete queue. Please open an issue."); + } + while (!deleteFunctionQueue.empty()) { + auto queued_function = deleteFunctionQueue.front(); + deleteFunctionQueue.pop_front(); + delete queued_function; } } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b4c90b1489a261..dfe8683197f8ee 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -33,6 +33,8 @@ using edge_list = std::vector; using saved_variable_list = std::vector; using IndexRange = std::pair; +TORCH_API extern size_t deleteFunctionMaxRecursionDepth; + // Custom deleter to prevent stack overflows. void deleteFunction(Function* function); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 0be1cc2600a2f0..b306426025e025 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -50,6 +50,11 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) }); m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); + /// TODO: Replace this ASAP with a better solution for deep autograd graphs! + m.def("_unsafe_set_delete_function_max_recursion_depth", [](size_t value) { + torch::autograd::deleteFunctionMaxRecursionDepth = value; + }); + Py_RETURN_TRUE; } From 776a9992e15b896627b5703646e579339117280d Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Wed, 12 Sep 2018 16:47:00 -0700 Subject: [PATCH 101/237] topk test fix, hgemm integration (#11593) Summary: After discussions in #11584 , new PR for just the test skip and hgemm integration. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11593 Differential Revision: D9798527 Pulled By: ezyang fbshipit-source-id: e2ef5609676571caef2f8e6844909fe3a11d8b3e --- aten/src/THC/THCBlas.cu | 8 ++++++++ test/test_torch.py | 1 + 2 files changed, 9 insertions(+) diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index bb9f7b92bd691a..20b13d82b9a152 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -285,6 +285,13 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); +#ifdef __HIP_PLATFORM_HCC__ + THCublasCheck(rocblas_hgemm(handle, opa, opb, i_m, i_n, i_k, + reinterpret_cast(&alpha), reinterpret_cast(a), i_lda, + reinterpret_cast(b), i_ldb, reinterpret_cast(&beta), + reinterpret_cast(c), i_ldc)); +#else + // Simulated Hgemm float fAlpha = THC_half2float(alpha); float fBeta = THC_half2float(beta); @@ -314,6 +321,7 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 a, CUDA_R_16F, i_lda, b, CUDA_R_16F, i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); } +#endif #endif return; } diff --git a/test/test_torch.py b/test/test_torch.py index ea0d69d7b0e265..dce509cb7c5b0e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3513,6 +3513,7 @@ def test_topk_arguments(self): self.assertRaises(TypeError, lambda: q.topk(4, True)) @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + @skipIfRocm def test_topk_noncontiguous_gpu(self): t = torch.randn(20, device="cuda")[::2] top1, idx1 = t.topk(5) From 7f7cda99cd83c0356d7e2cd919a505f4a8e3ddfd Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Wed, 12 Sep 2018 16:53:16 -0700 Subject: [PATCH 102/237] Optimize order_swich_ops on GPU (#11404) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11404 Optimize order_swich_ops on GPU Reviewed By: houseroad Differential Revision: D9728642 fbshipit-source-id: 74ff62268856fb1613fa61eb214bed6ec6716632 --- caffe2/operators/order_switch_ops.cu | 130 ++++++++------ caffe2/operators/order_switch_ops_cudnn.cc | 160 ++++++++++++++++++ .../python/operator_test/order_switch_test.py | 19 ++- 3 files changed, 249 insertions(+), 60 deletions(-) create mode 100644 caffe2/operators/order_switch_ops_cudnn.cc diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index 27a71a69141098..c213b7cc848d0d 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -1,91 +1,115 @@ #include "caffe2/operators/order_switch_ops.h" + #include "caffe2/core/context_gpu.h" +#include "caffe2/utils/fixed_divisor.h" namespace caffe2 { -__global__ void NHWC2NCHWKernel( - const int N, - const int HW, - const int C, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * HW * C) { - const int c = i % C; - const int hw = i / C % HW; - const int n = i / C / HW; - Y[(n * C + c) * HW + hw] = X[i]; +template +__global__ void NHWC2NCHWCUDAKernel( + const int size, + const FixedDivisor C, + const FixedDivisor HxW, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + HxW.DivMod(i, &c, &hxw); + C.DivMod(c, &n, &c); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * HxW.d() + hxw) * C.d() + c); +#else + Y[i] = X[(n * HxW.d() + hxw) * C.d() + c]; +#endif } } -__global__ void NCHW2NHWCKernel( - const int N, - const int C, - const int HW, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * C * HW) { 
- const int hw = i % HW; - const int c = i / HW % C; - const int n = i / C / HW; - Y[(n * HW + hw) * C + c] = X[i]; +template +__global__ void NCHW2NHWCCUDAKernel( + const int size, + const FixedDivisor C, + const FixedDivisor HxW, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + C.DivMod(i, &hxw, &c); + HxW.DivMod(hxw, &n, &hxw); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * C.d() + c) * HxW.d() + hxw); +#else + Y[i] = X[(n * C.d() + c) * HxW.d() + hxw]; +#endif } } template <> bool NHWC2NCHWOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(ndim, 3); - const int N = X.dim32(0), C = X.dim32(ndim - 1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(ndim, 3); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); vector Y_dims(ndim); Y_dims[0] = N; Y_dims[1] = C; - size_t image_size = 1; - for (auto i = 2; i < ndim; ++i) { + int HxW = 1; + for (int i = 2; i < ndim; ++i) { Y_dims[i] = X.dim32(i - 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y->Resize(Y_dims); - - NHWC2NCHWKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, image_size, C, X.data(), Y->template mutable_data()); + const int size = X.size(); + NHWC2NCHWCUDAKernel + <<>>( + size, + FixedDivisor(C), + FixedDivisor(HxW), + X.data(), + Y->template mutable_data()); return true; } template <> bool NCHW2NHWCOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(X.ndim(), 3); - const int N = X.dim32(0), C = X.dim32(1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(X.ndim(), 3); + const int N = X.dim32(0); + const int C = X.dim32(1); vector Y_dims(ndim); Y_dims[0] = N; - size_t image_size = 1; + int HxW = 1; for (auto i = 1; i < ndim - 1; ++i) { Y_dims[i] = X.dim32(i + 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y_dims[ndim - 1] = C; Y->Resize(Y_dims); - - NCHW2NHWCKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, C, image_size, X.data(), Y->template mutable_data()); + const int size = X.size(); + NCHW2NHWCCUDAKernel + <<>>( + size, + FixedDivisor(C), + FixedDivisor(HxW), + X.data(), + Y->template mutable_data()); return true; } - REGISTER_CUDA_OPERATOR(NHWC2NCHW, NHWC2NCHWOp); REGISTER_CUDA_OPERATOR(NCHW2NHWC, NCHW2NHWCOp); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/order_switch_ops_cudnn.cc b/caffe2/operators/order_switch_ops_cudnn.cc new file mode 100644 index 00000000000000..4cb0034e7ee60d --- /dev/null +++ b/caffe2/operators/order_switch_ops_cudnn.cc @@ -0,0 +1,160 @@ +#include "caffe2/operators/order_switch_ops.h" + +#include +#include +#include + +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/cudnn_wrappers.h" +#include "caffe2/core/types.h" + +namespace caffe2 { + +namespace { + +class CuDNNOrderSwithOpBase : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + + CuDNNOrderSwithOpBase(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), cudnn_wrapper_(&context_) { + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_)); + } + + virtual ~CuDNNOrderSwithOpBase() { + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_)); + } + + protected: + void SetTensorDescriptor( + const cudnnDataType_t data_type, + const 
StorageOrder order, + const std::vector& data_dims, + cudnnTensorDescriptor_t data_desc) const { + const int ndim = data_dims.size(); + const int N = data_dims[0]; + const int C = order == StorageOrder::NCHW ? data_dims[1] : data_dims.back(); + if (ndim == 3) { + const int H = 1; + const int W = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else if (ndim == 4) { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + const auto l_iter = order == StorageOrder::NCHW ? data_dims.cbegin() + 4 + : data_dims.cbegin() + 3; + const auto r_iter = + order == StorageOrder::NCHW ? data_dims.cend() : data_dims.cend() - 1; + const int D = std::accumulate(l_iter, r_iter, 1, std::multiplies()); + const std::array dims = {N, C, H, W, D}; + const std::array strides = order == StorageOrder::NCHW + ? std::array{C * H * W * D, H * W * D, W * D, D, 1} + : std::array{C * H * W * D, 1, W * D * C, D * C, C}; + CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( + data_desc, data_type, 5, dims.data(), strides.data())); + } + } + + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t X_desc_; + cudnnTensorDescriptor_t Y_desc_; + + std::vector cached_X_dims_; +}; + +class CuDNNNHWC2NCHWOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNHWC2NCHWOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[1] = C; + std::copy(X_dims.cbegin() + 1, X_dims.cend() - 1, Y_dims.begin() + 2); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NCHW, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +class CuDNNNCHW2NHWCOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNCHW2NHWCOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[ndim - 1] = C; + std::copy(X_dims.cbegin() + 2, X_dims.cend(), Y_dims.begin() + 1); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + 
cudnnTypeWrapper::type, StorageOrder::NCHW, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +} // namespace + +REGISTER_CUDNN_OPERATOR(NHWC2NCHW, CuDNNNHWC2NCHWOp); +REGISTER_CUDNN_OPERATOR(NCHW2NHWC, CuDNNNCHW2NHWCOp); + +} // namespace caffe2 diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index d54ac26c55fb36..5d3fd0e60f5e08 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,14 +1,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st + from caffe2.python import core from hypothesis import given class OrderSwitchOpsTest(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nchw2nhwc(self, X, gc, dc): - op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nchw2nhwc(self, X, engine, gc, dc): + op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], engine=engine) def nchw2nhwc_ref(X): X_reshaped = X.transpose((0,) + tuple(range(2, X.ndim)) + (1,)) @@ -18,12 +21,14 @@ def nchw2nhwc_ref(X): self.assertGradientChecks(gc, op, [X], 0, [0]) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nhwc2nchw(self, X, gc, dc): - op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nhwc2nchw(self, X, engine, gc, dc): + op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], engine=engine) def nhwc2nchw_ref(X): - X_reshaped = X.transpose((0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) + X_reshaped = X.transpose( + (0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) return (X_reshaped,) self.assertReferenceChecks(gc, op, [X], nhwc2nchw_ref) From e2cd627cce1e4cc2c6af8d5911a9d2f18adede7e Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 17:27:33 -0700 Subject: [PATCH 103/237] Temporarily disable docs build. (#11608) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11608 Differential Revision: D9803369 Pulled By: ezyang fbshipit-source-id: a206d6137e8e729f702189c926ec898444d1dc53 --- .jenkins/pytorch/build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 2dc64157c5d00d..3159ee1ad629e6 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -117,8 +117,9 @@ fi if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here - pip install -r requirements.txt || true - LC_ALL=C make html + # TODO: Reenable doc build + #pip install -r requirements.txt || true + #LC_ALL=C make html popd fi From daa379ffd7e684c4db02914918dcb298849ed600 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 17:30:48 -0700 Subject: [PATCH 104/237] Disable flaky test ObserverTest.TestMultipleNetBase (#11596) Summary: Tracked in https://github.com/pytorch/pytorch/issues/9137 Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11596 Differential Revision: D9803256 Pulled By: ezyang fbshipit-source-id: 973393203ed8343a3a0feef36d34e561d9f653c4 --- caffe2/core/observer_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index f4f4e81a3cb45c..fa8aee6d818366 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -151,6 +151,9 @@ TEST(ObserverTest, TestDAGNetBase) { EXPECT_EQ(1212, count_after - count_before); } +#if 0 +// This test intermittently segfaults, +// see https://github.com/pytorch/pytorch/issues/9137 TEST(ObserverTest, TestMultipleNetBase) { Workspace ws; ws.CreateBlob("in"); @@ -176,4 +179,5 @@ TEST(ObserverTest, TestMultipleNetBase) { EXPECT_EQ(net.get()->NumObservers(), prev_num); } +#endif } // namespace caffe2 From f00f99ebcce966c8b349b2cc5798f389dbead4ad Mon Sep 17 00:00:00 2001 From: Roy Li Date: Wed, 12 Sep 2018 17:35:32 -0700 Subject: [PATCH 105/237] use at::Half in THC (#11322) Summary: - use Half instead of half in THC - clean up TH_float2half, TH_half2float, etc. 
conversions Pull Request resolved: https://github.com/pytorch/pytorch/pull/11322 Differential Revision: D9799553 Pulled By: li-roy fbshipit-source-id: 9aa3e003bff73d9df6224a393f3ec0624b1f44ed --- aten/src/ATen/core/Half-inl.h | 8 + aten/src/ATen/cuda/NumericLimits.cuh | 3 +- aten/src/ATen/function_wrapper.py | 6 - aten/src/ATen/gen.py | 4 +- aten/src/TH/CMakeLists.txt | 1 - aten/src/TH/THDiskFile.cpp | 6 +- aten/src/TH/THGenerateHalfType.h | 4 +- aten/src/TH/THHalf.cpp | 30 -- aten/src/TH/THHalf.h | 10 +- aten/src/TH/THMemoryFile.cpp | 6 +- aten/src/TH/generic/THStorageCopy.cpp | 48 +-- aten/src/TH/generic/THTensorCopy.cpp | 36 +-- aten/src/THC/CMakeLists.txt | 1 - aten/src/THC/THCAtomics.cuh | 18 +- aten/src/THC/THCBlas.cu | 24 +- aten/src/THC/THCBlas.h | 10 +- aten/src/THC/THCGeneral.cpp | 33 +-- aten/src/THC/THCGenerateHalfType.h | 4 +- aten/src/THC/THCHalf.h | 18 -- aten/src/THC/THCNumerics.cuh | 276 ++++-------------- aten/src/THC/THCStorage.cpp | 2 +- aten/src/THC/THCStorage.cu | 2 +- aten/src/THC/THCStorageCopy.cu | 2 +- aten/src/THC/THCStorageCopy.h | 2 +- aten/src/THC/THCTensorCopy.cu | 2 +- aten/src/THC/THCTensorCopy.h | 2 +- aten/src/THC/THCTensorIndex.cu | 2 +- aten/src/THC/THCTensorMathPairwise.cu | 113 +------ aten/src/THC/THCTensorMathPointwise.cuh | 119 +------- aten/src/THC/THCTensorMode.cuh | 20 +- aten/src/THC/THCTensorRandom.cu | 20 +- aten/src/THC/THCTensorTopK.cuh | 12 +- aten/src/THC/THCTensorTypeUtils.cuh | 54 +--- aten/src/THC/generic/THCTensorCopy.cu | 2 +- aten/src/THC/generic/THCTensorMathBlas.cu | 5 +- aten/src/THC/generic/THCTensorMathPairwise.cu | 10 - aten/src/THCUNN/Abs.cu | 2 +- aten/src/THCUNN/AbsCriterion.cu | 2 +- aten/src/THCUNN/BCECriterion.cu | 2 +- aten/src/THCUNN/BatchNormalization.cu | 2 +- aten/src/THCUNN/ClassNLLCriterion.cu | 2 +- aten/src/THCUNN/Col2Im.cu | 2 +- aten/src/THCUNN/DistKLDivCriterion.cu | 2 +- aten/src/THCUNN/ELU.cu | 2 +- aten/src/THCUNN/GatedLinearUnit.cu | 2 +- aten/src/THCUNN/HardTanh.cu | 2 +- aten/src/THCUNN/Im2Col.cu | 2 +- aten/src/THCUNN/IndexLinear.cu | 2 +- aten/src/THCUNN/L1Cost.cu | 2 +- aten/src/THCUNN/LeakyReLU.cu | 2 +- aten/src/THCUNN/LogSigmoid.cu | 2 +- aten/src/THCUNN/LookupTable.cu | 2 +- aten/src/THCUNN/LookupTableBag.cu | 2 +- aten/src/THCUNN/MSECriterion.cu | 2 +- aten/src/THCUNN/MarginCriterion.cu | 2 +- aten/src/THCUNN/MultiLabelMarginCriterion.cu | 2 +- aten/src/THCUNN/MultiMarginCriterion.cu | 2 +- aten/src/THCUNN/PReLU.cu | 8 +- aten/src/THCUNN/RReLU.cu | 6 +- aten/src/THCUNN/Sigmoid.cu | 2 +- aten/src/THCUNN/SmoothL1Criterion.cu | 2 +- aten/src/THCUNN/SoftMarginCriterion.cu | 2 +- aten/src/THCUNN/SoftPlus.cu | 2 +- aten/src/THCUNN/SoftShrink.cu | 2 +- aten/src/THCUNN/SparseLinear.cu | 2 +- .../THCUNN/SpatialAdaptiveAveragePooling.cu | 2 +- aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu | 2 +- aten/src/THCUNN/SpatialAveragePooling.cu | 2 +- aten/src/THCUNN/SpatialClassNLLCriterion.cu | 2 +- aten/src/THCUNN/SpatialConvolutionLocal.cu | 2 +- aten/src/THCUNN/SpatialConvolutionMM.cu | 2 +- aten/src/THCUNN/SpatialCrossMapLRN.cu | 2 +- aten/src/THCUNN/SpatialDilatedConvolution.cu | 2 +- aten/src/THCUNN/SpatialDilatedMaxPooling.cu | 2 +- .../src/THCUNN/SpatialFractionalMaxPooling.cu | 2 +- aten/src/THCUNN/SpatialFullConvolution.cu | 2 +- .../THCUNN/SpatialFullDilatedConvolution.cu | 2 +- aten/src/THCUNN/SpatialReflectionPadding.cu | 2 +- aten/src/THCUNN/SpatialReplicationPadding.cu | 2 +- aten/src/THCUNN/SpatialSubSampling.cu | 2 +- aten/src/THCUNN/SpatialUpSamplingBilinear.cu | 2 +- 
aten/src/THCUNN/SpatialUpSamplingNearest.cu | 2 +- aten/src/THCUNN/Sqrt.cu | 2 +- aten/src/THCUNN/Square.cu | 2 +- aten/src/THCUNN/THCHalfAutoNumerics.cuh | 229 ++------------- aten/src/THCUNN/Tanh.cu | 2 +- aten/src/THCUNN/TemporalConvolution.cu | 2 +- aten/src/THCUNN/TemporalMaxPooling.cu | 2 +- aten/src/THCUNN/TemporalReflectionPadding.cu | 2 +- aten/src/THCUNN/TemporalReplicationPadding.cu | 2 +- aten/src/THCUNN/TemporalRowConvolution.cu | 2 +- aten/src/THCUNN/TemporalUpSamplingLinear.cu | 2 +- aten/src/THCUNN/TemporalUpSamplingNearest.cu | 2 +- aten/src/THCUNN/Threshold.cu | 2 +- .../VolumetricAdaptiveAveragePooling.cu | 2 +- .../THCUNN/VolumetricAdaptiveMaxPooling.cu | 2 +- aten/src/THCUNN/VolumetricAveragePooling.cu | 2 +- aten/src/THCUNN/VolumetricConvolution.cu | 2 +- .../THCUNN/VolumetricDilatedConvolution.cu | 2 +- .../src/THCUNN/VolumetricDilatedMaxPooling.cu | 2 +- .../THCUNN/VolumetricFractionalMaxPooling.cu | 2 +- aten/src/THCUNN/VolumetricFullConvolution.cu | 2 +- .../VolumetricFullDilatedConvolution.cu | 2 +- aten/src/THCUNN/VolumetricMaxUnpooling.cu | 2 +- .../THCUNN/VolumetricReplicationPadding.cu | 2 +- .../src/THCUNN/VolumetricUpSamplingNearest.cu | 2 +- .../THCUNN/VolumetricUpSamplingTrilinear.cu | 2 +- tools/amd_build/disabled_features.yaml | 7 - torch/csrc/cuda/THCP.h | 2 +- torch/csrc/generic/utils.h | 2 +- torch/csrc/utils.h | 9 +- 111 files changed, 271 insertions(+), 1043 deletions(-) delete mode 100644 aten/src/TH/THHalf.cpp delete mode 100644 aten/src/THC/THCHalf.h diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index a1786d0bb9db6e..42633508e4b714 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -43,6 +43,14 @@ inline AT_HOSTDEVICE Half::operator __half() const { } #endif +// CUDA intrinsics + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + /// Arithmetic inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { diff --git a/aten/src/ATen/cuda/NumericLimits.cuh b/aten/src/ATen/cuda/NumericLimits.cuh index 325cbce737dd51..981bf8c1c34efa 100644 --- a/aten/src/ATen/cuda/NumericLimits.cuh +++ b/aten/src/ATen/cuda/NumericLimits.cuh @@ -3,6 +3,7 @@ #include #include #include +#include // NumericLimits.cuh is a holder for numeric limits definitions of commonly used // types. This header is very specific to ROCm HIP and may be removed in the future. 
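The reason the explicit conversion helpers can be deleted in this patch is that at::Half converts implicitly to and from float, so plain assignments and static_casts, as used in the hunks that follow, are sufficient. A small host-side sketch, not part of the patch, with arbitrary values:

    // Assumes only what Half-inl.h above provides: construction from float,
    // conversion back to float, and arithmetic carried out through float.
    #include <ATen/ATen.h>
    #include <cstdio>

    int main() {
      float f = 0.1f;
      at::Half h = f;                  // replaces TH_float2half(f)
      float round_tripped = h;         // replaces TH_half2float(h)
      at::Half doubled = h + h;        // operator+ computes in float
      std::printf("%f %f\n", round_tripped, static_cast<float>(doubled));
      return 0;
    }

The device-side code in the later hunks relies on the same conversions, which is what lets THC_float2half and THC_half2float disappear as well.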
@@ -101,4 +102,4 @@ struct numeric_limits { static inline __host__ __device__ double upper_bound() { return inf; } }; -} // namespace at \ No newline at end of file +} // namespace at diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 287b7895008cc2..266f9645a18527 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -202,8 +202,6 @@ def TypedDict(name, attrs, total=True): # type: ignore CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") -HALF_CONVERSION = CodeTemplate("convert(${value})") - class NYIError(Exception): """Indicates we don't support this declaration yet""" @@ -1202,8 +1200,6 @@ def create_derived(backend_type_env, declarations): is_cuda = 'CUDA' in backend_type_env['Backend'] - real_is_half = backend_type_env['ScalarName'] == 'Half' - def replace_with_null(argument): # type: (THFormal) -> bool return (argument['type'] == 'THGenerator*' and @@ -1230,8 +1226,6 @@ def get_argument(argument, option): elif requires_checked_cast(argument): checked_use = CHECKED_USE.get( argument['type'], '{}_').format(argument['name']) - if real_is_half and argument['type'] == 'real': - checked_use = HALF_CONVERSION.substitute(value=checked_use) if nullable_argument(argument): checked_use = CHECKED_USE_NULLABLE.substitute( env={}, arg_name=argument['name'], usage=checked_use) diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index f0827077827a25..76ed7c74a88e07 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -162,7 +162,7 @@ def check_all_files_written(self): ('Int', 'int', 'Long', 'int32_t', False), ('Long', 'int64_t', 'Long', 'int64_t', False), ('Short', 'int16_t', 'Long', 'int16_t', False), - ('Half', 'Half', 'Double', 'THHalf', True), + ('Half', 'Half', 'Double', 'at::Half', True), ] # shared environment for non-derived base classes Type.h Tensor.h Storage.h @@ -292,7 +292,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['AS_REAL'] = 'convert' + env['AS_REAL'] = 'convert' declarations, definitions = function_wrapper.create_derived( env, declarations) diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 9fe22beb0dc54e..10d43e1433c811 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -26,7 +26,6 @@ SET(hdr set(ATen_TH_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp index a806df6ac62ebf..ddf993df6605d7 100644 --- a/aten/src/TH/THDiskFile.cpp +++ b/aten/src/TH/THDiskFile.cpp @@ -359,9 +359,9 @@ READ_WRITE_METHODS(float, Float, int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) -READ_WRITE_METHODS(THHalf, Half, - float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, - int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) +READ_WRITE_METHODS(at::Half, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= static_cast(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", static_cast(data[i])); if(ret <= 0) 
break; else nwrite++) READ_WRITE_METHODS(double, Double, int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, diff --git a/aten/src/TH/THGenerateHalfType.h b/aten/src/TH/THGenerateHalfType.h index 09d4c878d0f11e..8e1b5eaed4d946 100644 --- a/aten/src/TH/THGenerateHalfType.h +++ b/aten/src/TH/THGenerateHalfType.h @@ -5,8 +5,8 @@ #include "THHalf.h" #define scalar_t THHalf #define accreal float -#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (scalar_t)(_val) #define Real Half #define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) #define TH_REAL_IS_HALF diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp deleted file mode 100644 index c95272c5f13e34..00000000000000 --- a/aten/src/TH/THHalf.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "THHalf.h" -#include - -/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ - -THHalf TH_float2half(float f) -{ - THHalf h; - TH_float2halfbits(&f, &h.x); - return h; -} - -TH_API float TH_half2float(THHalf h) -{ - float f; - TH_halfbits2float(&h.x, &f); - return f; -} - - -void TH_halfbits2float(unsigned short* src, float* res) -{ - *res = at::detail::halfbits2float(*src); -} - - -void TH_float2halfbits(float* src, unsigned short* dest) -{ - *dest = at::detail::float2halfbits(*src); -} diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 68a884f22bd0ad..f7c884f2cc67bd 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -1,10 +1,8 @@ #ifndef TH_HALF_H #define TH_HALF_H -#include - #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus @@ -14,10 +12,4 @@ typedef struct at_Half at_Half; #define THHalf at_Half #endif -TH_API void TH_float2halfbits(float*, unsigned short*); -TH_API void TH_halfbits2float(unsigned short*, float*); - -TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - #endif diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 3f2187b68f74ea..b74d7926ebff21 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -343,11 +343,11 @@ READ_WRITE_METHODS(float, Float, nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) -READ_WRITE_METHODS(THHalf, Half, +READ_WRITE_METHODS(at::Half, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ - data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), + data[i] = static_cast(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", static_cast(data[i])), 1) READ_WRITE_METHODS(double, Double, diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 1de588bbd2d75b..ea8a0d5808cb16 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -29,40 +29,6 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage data[i] = static_cast(src_data[i]); \ } -#define 
IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = (scalar_t)TH_half2float(src_data[i]); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = TH_float2half((float)(src_data[i])); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = static_cast(src_data[i]); \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THStorage_COPY(Byte) IMPLEMENT_THStorage_COPY(Char) IMPLEMENT_THStorage_COPY(Short) @@ -70,18 +36,6 @@ IMPLEMENT_THStorage_COPY(Int) IMPLEMENT_THStorage_COPY(Long) IMPLEMENT_THStorage_COPY(Float) IMPLEMENT_THStorage_COPY(Double) -IMPLEMENT_THStorage_COPY_FROM_HALF(Half) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) -IMPLEMENT_THStorage_COPY_TO_HALF(Byte) -IMPLEMENT_THStorage_COPY_TO_HALF(Char) -IMPLEMENT_THStorage_COPY_TO_HALF(Short) -IMPLEMENT_THStorage_COPY_TO_HALF(Int) -IMPLEMENT_THStorage_COPY_TO_HALF(Long) -IMPLEMENT_THStorage_COPY_TO_HALF(Float) -IMPLEMENT_THStorage_COPY_TO_HALF(Double) -#endif - +IMPLEMENT_THStorage_COPY(Half) #endif diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp index a9e0564fb574c8..f04c75cd3f9105 100644 --- a/aten/src/TH/generic/THTensorCopy.cpp +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -203,28 +203,6 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src static_cast>(*src_data));) \ } -#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, \ - *tensor_data = static_cast( \ - static_cast>( \ - TH_half2float(*src_data)));) \ -} - -#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THTensor_COPY(Byte, uint8_t) IMPLEMENT_THTensor_COPY(Char, int8_t) IMPLEMENT_THTensor_COPY(Short, int16_t) @@ -232,18 +210,6 @@ IMPLEMENT_THTensor_COPY(Int, int32_t) IMPLEMENT_THTensor_COPY(Long, int64_t) IMPLEMENT_THTensor_COPY(Float, float) IMPLEMENT_THTensor_COPY(Double, double) -IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) 
-#else -/* only allow pass-through for Half */ -IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) -IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) -IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) - -#endif /* REAL_IS_HALF */ +IMPLEMENT_THTensor_COPY(Half, at::Half) #endif diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 44f5d188d5b42b..4b8ab9f4ab101a 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -92,7 +92,6 @@ INSTALL(FILES THCGenerateFloatType.h THCGenerateFloatTypes.h THCGenerateDoubleType.h - THCHalf.h THCIntegerDivider.cuh THCNumerics.cuh THCTensorSort.cuh diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 8fec96dde6f647..756fa0f905ac13 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -2,7 +2,7 @@ #define THC_ATOMICS_INC #include "THC.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "ATen/ATen.h" @@ -95,7 +95,7 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { AtomicAddIntegerImpl()(address, val); } -static inline __device__ void atomicAdd(half *address, half val) { +static inline __device__ void atomicAdd(at::Half *address, at::Half val) { unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); unsigned int old = *address_as_ui; @@ -103,23 +103,13 @@ static inline __device__ void atomicAdd(half *address, half val) { do { assumed = old; -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half hsum; + at::Half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); -#else - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = THCNumerics::add(hsum, val); - hsum = __half_raw(tmpres); -#endif + hsum = THCNumerics::add(hsum, val); old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); } -static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - atomicAdd(reinterpret_cast(address), val); -} #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) // from CUDA C Programmic Guide diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 20b13d82b9a152..6375ced8c691cc 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -1,6 +1,6 @@ #include "THCBlas.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include @@ -50,7 +50,7 @@ double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, doub return 0; } -half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy) +at::Half THCudaBlas_Hdot(THCState *state, int64_t n, at::Half *x, int64_t incx, at::Half *y, int64_t incy) { #if CUDA_VERSION >= 8000 if (n == 1) { @@ -59,7 +59,7 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, } if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { - half result; + at::Half result; cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDotEx(handle, n, @@ -72,10 +72,10 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, THError("Cublas_Hdot only supports n, incx and incy " "up to signed integer limits: %d", INT_MAX); - return THC_float2half(0); + return 0.0; #else THError("Cublas_Hdot requires CUDA 8.0+"); - return THC_float2half(0); + return 0.0; #endif } @@ -267,7 +267,7 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 # define CUDA_R_16F CUBLAS_DATA_HALF #endif -void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); @@ -293,8 +293,8 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 #else // Simulated Hgemm - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; #if CUDA_VERSION < 9000 THCublasCheck(cublasSgemmEx(handle, opa, opb, @@ -355,8 +355,8 @@ void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int6 #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount) + at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, + at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) { if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) @@ -371,8 +371,8 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i 
cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); THCublasCheck(cublasGemmStridedBatchedEx(handle, opa, opb, (int)m, (int)n, (int)k, diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index 36a0b35294dff2..a73b47f162c3f3 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -2,12 +2,12 @@ #define THC_BLAS_INC #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy); THC_API double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy); -THC_API half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy); +THC_API THHalf THCudaBlas_Hdot(THCState *state, int64_t n, THHalf *x, int64_t incx, THHalf *y, int64_t incy); /* Level 2 */ THC_API void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy); @@ -19,7 +19,7 @@ THC_API void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); -THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc); +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -38,8 +38,8 @@ THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char t #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount); + THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, + THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); #endif /* Inverse */ diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 6bc7da7cee6e2d..05da61bd56e754 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -125,7 +125,7 @@ void THCudaShutdown(THCState* state) for (int dev = 0; dev < deviceCount; ++dev) { THCudaCheck(cudaSetDevice(dev)); THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); - + // Frees BLAS handle if (res->blasHandle) { THCublasCheck(cublasDestroy(res->blasHandle)); @@ -256,7 +256,7 @@ cublasHandle_t 
THCState_getCurrentBlasHandle(THCState *state) THError("THCState and sparseHandles must be set as there is no default sparseHandle"); return NULL; } - + int device; THCudaCheck(cudaGetDevice(&device)); @@ -280,7 +280,7 @@ cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) int device; THCudaCheck(cudaGetDevice(&device)); - + // Creates the sparse handle if not created yet THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); if (!res->sparseHandle) { @@ -474,30 +474,3 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB #include "THCStorage.cpp" #include "THCAllocator.cpp" - -/* from THCHalf.h */ - -half THC_float2half(float f) -{ -#if CUDA_VERSION < 9000 - half h; - TH_float2halfbits(&f, &h.x); - return h; -#else - __half_raw h_raw; - TH_float2halfbits(&f, &h_raw.x); - return half(h_raw); -#endif -} - -float THC_half2float(half h) -{ - float f; -#if CUDA_VERSION < 9000 - TH_halfbits2float(&h.x, &f); -#else - __half_raw h_raw(h); - TH_halfbits2float(&h_raw.x, &f); -#endif - return f; -} diff --git a/aten/src/THC/THCGenerateHalfType.h b/aten/src/THC/THCGenerateHalfType.h index 54358a9be1a59d..596ea47904820f 100644 --- a/aten/src/THC/THCGenerateHalfType.h +++ b/aten/src/THC/THCGenerateHalfType.h @@ -2,9 +2,9 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCHalf.h" +#include "TH/THHalf.h" -#define scalar_t half +#define scalar_t THHalf #define accreal float #define Real Half diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h deleted file mode 100644 index aeae06fc4739ba..00000000000000 --- a/aten/src/THC/THCHalf.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef THC_HALF_CONVERSION_INC -#define THC_HALF_CONVERSION_INC - -#include "THCGeneral.h" - -#include -#include - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -#ifndef __cplusplus -typedef __half_raw half; -#endif -#endif - -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); - -#endif diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 286d36f7b284b3..157a324f6e45b8 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -4,7 +4,7 @@ #include #include #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -165,210 +165,81 @@ struct THCNumerics { // DEPRECATED: use math functions from std and NumericLimits.cuh template <> -struct THCNumerics { - static inline __host__ __device__ half min() { return at::numeric_limits::lowest(); } - static inline __host__ __device__ half max() { return at::numeric_limits::max(); } - static inline __host__ __device__ half lower_bound() { return at::numeric_limits::lower_bound(); } - static inline __host__ __device__ half upper_bound() { return at::numeric_limits::upper_bound(); } - - static inline __host__ __device__ bool lt(half a, half b) { - return static_cast(a) < static_cast(b); - } - - static inline __host__ __device__ bool le(half a, half b) { - return static_cast(a) <= static_cast(b); - } - - static inline __host__ __device__ bool gt(half a, half b) { - return static_cast(a) > static_cast(b); - } - - static inline __host__ __device__ bool ge(half a, half b) { - return static_cast(a) >= static_cast(b); - } - - static inline __host__ __device__ bool eq(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == 
and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) == static_cast(static_cast(b)); - } - - static inline __host__ __device__ bool ne(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) != static_cast(static_cast(b)); - } - - static inline __host__ __device__ half exp(half a) { - return static_cast(std::exp(static_cast(a))); - } - - // note that exp10 is not in the std namespace. - static inline __host__ __device__ half exp10(half a) { - return static_cast(::exp10(static_cast(a))); - } - - static inline __host__ __device__ half log(half a) { - return static_cast(::log(static_cast(a))); - } - - static inline __host__ __device__ half log10(half a) { - return static_cast(::log10(static_cast(a))); - } - - static inline __host__ __device__ half log1p(half a) { - return static_cast(::log1p(static_cast(a))); - } - - static inline __host__ __device__ half log2(half a) { - return static_cast(::log2(static_cast(a))); - } - - static inline __host__ __device__ half lgamma(half a) { - return static_cast(::lgamma(static_cast(a))); - } - - static inline __host__ __device__ half expm1(half a) { - return static_cast(::expm1(static_cast(a))); - } - - static inline __host__ __device__ half cos(half a) { - return static_cast(::cos(static_cast(a))); - } - - static inline __host__ __device__ half sin(half a) { - return static_cast(::sin(static_cast(a))); - } - - static inline __host__ __device__ half sqrt(half a) { - return static_cast(::sqrt(static_cast(a))); - } - - // note that rsqrt is not in the std namespace. - static inline __host__ __device__ half rsqrt(half a) { - return static_cast(::rsqrt(static_cast(a))); - } - - static inline __host__ __device__ half ceil(half a) { - return static_cast(::ceil(static_cast(a))); - } - - static inline __host__ __device__ half floor(half a) { - return static_cast(::floor(static_cast(a))); - } - - static inline __host__ __device__ half trunc(half a) { - return static_cast(::trunc(static_cast(a))); - } - - static inline __host__ __device__ half neg(half a) { - return static_cast(-(static_cast(a))); - } - - static inline __host__ __device__ half acos(half a) { - return static_cast(::acos(static_cast(a))); - } - - static inline __host__ __device__ half cosh(half a) { - return static_cast(::cosh(static_cast(a))); - } - - static inline __host__ __device__ half asin(half a) { - return static_cast(::asin(static_cast(a))); - } - - static inline __host__ __device__ half sinh(half a) { - return static_cast(::sinh(static_cast(a))); - } - - static inline __host__ __device__ half tan(half a) { - return static_cast(::tan(static_cast(a))); - } - - static inline __host__ __device__ half atan(half a) { - return static_cast(::atan(static_cast(a))); - } - - static inline __host__ __device__ half tanh(half a) { - return static_cast(::tanh(static_cast(a))); - } - - - static inline __host__ __device__ half erf(half a) { - return static_cast(::erf(static_cast(a))); - } - - - static inline __host__ __device__ half erfc(half a) { - return static_cast(::erfc(static_cast(a))); - } - - // note that erfinv is not in the std namespace. 
- static inline __host__ __device__ half erfinv(half a) { - return static_cast(::erfinv(static_cast(a))); - } - - static inline __host__ __device__ half abs(half a) { - return static_cast(::abs(static_cast(a))); - } - - static inline __host__ __device__ half round(half a) { - return static_cast(::round(static_cast(a))); - } - - static inline __host__ __device__ half frac(half a) { +struct THCNumerics { + static inline __host__ __device__ at::Half min() { return at::numeric_limits::lowest(); } + static inline __host__ __device__ at::Half max() { return at::numeric_limits::max(); } + static inline __host__ __device__ at::Half lower_bound() { return at::numeric_limits::lower_bound(); } + static inline __host__ __device__ at::Half upper_bound() { return at::numeric_limits::upper_bound(); } + + static inline __host__ __device__ bool lt(at::Half a, at::Half b) { return a < b; } + static inline __host__ __device__ bool le(at::Half a, at::Half b) { return a <= b; } + static inline __host__ __device__ bool gt(at::Half a, at::Half b) { return a > b; } + static inline __host__ __device__ bool ge(at::Half a, at::Half b) { return a >= b; } + static inline __host__ __device__ bool eq(at::Half a, at::Half b) { return a == b; } + static inline __host__ __device__ bool ne(at::Half a, at::Half b) { return a != b; } + + static inline __host__ __device__ at::Half exp(at::Half a) { return std::exp(a); } + static inline __host__ __device__ at::Half exp10(at::Half a) { return ::exp10(a); } + static inline __host__ __device__ at::Half log(at::Half a) { return ::log(a); } + static inline __host__ __device__ at::Half log10(at::Half a) { return ::log10(a); } + static inline __host__ __device__ at::Half log1p(at::Half a) { return ::log1p(a); } + static inline __host__ __device__ at::Half log2(at::Half a) { return ::log2(a); } + static inline __host__ __device__ at::Half lgamma(at::Half a) { return ::lgamma(a); } + static inline __host__ __device__ at::Half expm1(at::Half a) { return ::expm1(a); } + static inline __host__ __device__ at::Half cos(at::Half a) { return ::cos(a); } + static inline __host__ __device__ at::Half sin(at::Half a) { return ::sin(a); } + static inline __host__ __device__ at::Half sqrt(at::Half a) { return ::sqrt(a); } + static inline __host__ __device__ at::Half rsqrt(at::Half a) { return ::rsqrt(a); } + static inline __host__ __device__ at::Half ceil(at::Half a) { return ::ceil(a); } + static inline __host__ __device__ at::Half floor(at::Half a) { return ::floor(a); } + static inline __host__ __device__ at::Half trunc(at::Half a) { return ::trunc(a); } + static inline __host__ __device__ at::Half neg(at::Half a) { return -a; } + static inline __host__ __device__ at::Half acos(at::Half a) { return ::acos(a); } + static inline __host__ __device__ at::Half cosh(at::Half a) { return ::cosh(a); } + static inline __host__ __device__ at::Half asin(at::Half a) { return ::asin(a); } + static inline __host__ __device__ at::Half sinh(at::Half a) { return ::sinh(a); } + static inline __host__ __device__ at::Half tan(at::Half a) { return ::tan(a); } + static inline __host__ __device__ at::Half atan(at::Half a) { return ::atan(a); } + static inline __host__ __device__ at::Half tanh(at::Half a) { return ::tanh(a); } + static inline __host__ __device__ at::Half erf(at::Half a) { return ::erf(a); } + static inline __host__ __device__ at::Half erfc(at::Half a) { return ::erfc(a); } + static inline __host__ __device__ at::Half erfinv(at::Half a) { return ::erfinv(a); } + static inline __host__ __device__ at::Half 
abs(at::Half a) { return ::abs(a); } + static inline __host__ __device__ at::Half round(at::Half a) { return ::round(a); } + + static inline __host__ __device__ at::Half frac(at::Half a) { #ifdef __CUDA_ARCH__ - return static_cast(a) - static_cast(::trunc(static_cast(a))); + return a - ::trunc(a); #else // __CUDA_ARCH__ - return static_cast(a) - static_cast(::floor(static_cast(a))); + return a - ::floor(a); #endif } - static inline __host__ __device__ half cinv(half a) { - return static_cast(1.0f / static_cast(a)); - } - - static inline __host__ __device__ half add(half a, half b) { - return static_cast(a) + static_cast(b); - } - - static inline __host__ __device__ half div(half a, half b) { - return static_cast(a) / static_cast(b); - } - - static inline __host__ __device__ half mul(half a, half b) { - return static_cast(a) * static_cast(b); - } - - static inline __host__ __device__ half sub(half a, half b) { - return static_cast(a) - static_cast(b); - } + static inline __host__ __device__ at::Half cinv(at::Half a) { return 1.0f / a; } + static inline __host__ __device__ at::Half add(at::Half a, at::Half b) { return a + b; } + static inline __host__ __device__ at::Half div(at::Half a, at::Half b) { return a / b; } + static inline __host__ __device__ at::Half mul(at::Half a, at::Half b) { return a * b; } + static inline __host__ __device__ at::Half sub(at::Half a, at::Half b) { return a - b; } + static inline __host__ __device__ at::Half pow(at::Half a, at::Half b) { return ::pow(a, b); } + static inline __host__ __device__ at::Half atan2(at::Half a, at::Half b) { return ::atan2(a, b); } - static inline __host__ __device__ half pow(half a, half b) { - return static_cast(::pow(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ half atan2(half a, half b) { - return static_cast(::atan2(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ bool isnan(half a) { + static inline __host__ __device__ bool isnan(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isnan((float)static_cast(a)); + return ::isnan((float) a); #else - return ::isnan(static_cast(a)); + return ::isnan(a); #endif } - static inline __host__ __device__ bool isinf(half a) { + static inline __host__ __device__ bool isinf(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. 
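Kernels written against THCNumerics<T> keep compiling unchanged, because the at::Half specialization above now just forwards to the overloaded operators and the global math functions, which promote through float. A small illustration of that generic-call pattern, assuming the THC and ATen headers are on the include path; clampAdd is an illustrative name, not an existing helper:

#include "THCNumerics.cuh"  // declares THCNumerics<float/double/at::Half> as patched above

// Illustrative helper (not an existing function): a saturating add written only
// against THCNumerics<T>, so the same template instantiates for float, double
// and at::Half without any half-specific code.
template <typename T>
__host__ __device__ T clampAdd(T a, T b, T hi) {
  T s = THCNumerics<T>::add(a, b);
  return THCNumerics<T>::gt(s, hi) ? hi : s;
}

// usage, e.g.: clampAdd<at::Half>(x, y, THCNumerics<at::Half>::max());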
The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isinf((float)static_cast(a)); + return ::isinf((float) a); #else - return ::isinf(static_cast(a)); + return ::isinf(a); #endif } @@ -506,35 +377,6 @@ struct ScalarConvert { static __host__ __device__ Out to(const In v) { return (Out) v; } }; -template -struct ScalarConvert { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif - } -}; - -template -struct ScalarConvert { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif - } -}; - -template <> -struct ScalarConvert { - static __host__ __device__ half to(const half v) { - return v; - } -}; - // DEPRECATED: use static_cast in kernels instead of scalar_cast template __host__ __device__ T scalar_cast(U u) { diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index 96e3938e20b0f9..3d1630adedbe5b 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -1,7 +1,7 @@ #include "THCStorage.hpp" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu index 43a293422335fc..97b66ac2b5f891 100644 --- a/aten/src/THC/THCStorage.cu +++ b/aten/src/THC/THCStorage.cu @@ -7,7 +7,7 @@ #include #endif -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorageCopy.cu b/aten/src/THC/THCStorageCopy.cu index 8d7c869c12c004..c4f53f7160ca5b 100644 --- a/aten/src/THC/THCStorageCopy.cu +++ b/aten/src/THC/THCStorageCopy.cu @@ -1,7 +1,7 @@ #include "THCStorageCopy.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THC/THCStorageCopy.h b/aten/src/THC/THCStorageCopy.h index 837056fc3801d2..250b60fcb2fe74 100644 --- a/aten/src/THC/THCStorageCopy.h +++ b/aten/src/THC/THCStorageCopy.h @@ -3,7 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index 259912a8d91806..a3a8ed2e182086 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -1,5 +1,5 @@ #include "THCApply.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "THCTensorCopy.hpp" #include diff --git a/aten/src/THC/THCTensorCopy.h b/aten/src/THC/THCTensorCopy.h index 74f2b592f54657..48dcc64b9fbedc 100644 --- a/aten/src/THC/THCTensorCopy.h +++ b/aten/src/THC/THCTensorCopy.h @@ -3,7 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCStream.h" #include "generic/THCTensorCopy.h" diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 0ea5951d4ea734..6b4a77ea816225 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -4,7 +4,7 @@ #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index 
b1be123b886e03..cd5a77c2227181 100644 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -1,6 +1,6 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -21,26 +21,6 @@ struct TensorAddConstantOp { const T val; }; -template <> -struct TensorAddConstantOp { - TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorSubConstantOp { TensorSubConstantOp(T v) : val(v) {} @@ -55,27 +35,6 @@ struct TensorSubConstantOp { const T val; }; - -template <> -struct TensorSubConstantOp { - TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorMulConstantOp { TensorMulConstantOp(T v) : val(v) {} @@ -90,25 +49,6 @@ struct TensorMulConstantOp { const T val; }; -template <> -struct TensorMulConstantOp { - TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template struct TensorDivConstantOp { TensorDivConstantOp(T v) : val(v) {} @@ -151,24 +91,6 @@ struct TensorDivConstantOp { const double val; }; -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -232,22 +154,18 @@ struct TensorRemainderOp { }; template <> -struct TensorRemainderOp { - TensorRemainderOp(half v): fval(THC_half2float(v)) {} +struct TensorRemainderOp { + TensorRemainderOp(at::Half v): val(v) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin - fval * floorf(fin / fval); - *out = __float2half(fout); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in - val * floorf(*in / val); } - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv = fv - fval * floorf(fv / fval); - *v = __float2half(fv); + __device__ __forceinline__ void operator()(at::Half* v) { + *v = *v - val * floorf(*v / val); } - const float fval; + const at::Half val; }; template @@ -278,21 +196,6 @@ struct TensorFmodOp { const double val; }; -template <> -struct TensorFmodOp { - TensorFmodOp(half v): 
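TensorRemainderOp keeps floor-based modulo semantics (a - b*floor(a/b), result takes the sign of the divisor) while TensorFmodOp maps to C's fmod (result takes the sign of the dividend); the at::Half specializations now spell those formulas directly on at::Half values. A host-only comparison of the two conventions, with no THC dependencies:

#include <cmath>
#include <cstdio>

// Floor-based remainder, as in TensorRemainderOp: the result follows the sign of b.
static float remainder_floor(float a, float b) { return a - b * std::floor(a / b); }

// Truncation-based remainder, as in TensorFmodOp: the result follows the sign of a.
static float remainder_trunc(float a, float b) { return std::fmod(a, b); }

int main() {
  std::printf("remainder(-5, 3) = %g, fmod(-5, 3) = %g\n",
              remainder_floor(-5.0f, 3.0f),   // prints 1
              remainder_trunc(-5.0f, 3.0f));  // prints -2
  return 0;
}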
fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*in), fval)); - } - - __device__ __forceinline__ void operator()(half* v) { - *v = __float2half(fmodf(__half2float(*v), fval)); - } - - const float fval; -}; - template struct TensorTriOp { TensorTriOp(T *start_, int64_t stride0_, int64_t stride1_, int64_t k_) diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index fb15a05b155c0f..fb1072b8bbddc1 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -4,7 +4,7 @@ #include #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -31,19 +31,6 @@ struct TensorSigmoidOp { } }; -template <> -struct TensorSigmoidOp { - __device__ __forceinline__ void operator()(half* out, half* in) const { - float fin = __half2float(*in); - *out = __float2half(1.0f / (1.0f + expf(- fin))); - } - - __device__ __forceinline__ void operator()(half* v) const { - float fv = __half2float(*v); - *v = __float2half(1.0f / (1.0f + expf(- fv))); - } -}; - template struct TensorSignOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -70,19 +57,6 @@ struct TensorSignOp { } }; -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float orig = __half2float(*in); - *out = __float2half((orig > 0) - (orig < 0)); - } - - __device__ __forceinline__ void operator()(half* v) { - float orig = __half2float(*v); - *v = __float2half((orig > 0) - (orig < 0)); - } -}; - template struct TensorCAddOp { TensorCAddOp(T v) : val(v) {} @@ -98,31 +72,6 @@ struct TensorCAddOp { T val; }; -template <> -struct TensorCAddOp { - TensorCAddOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fval = __half2float(val); - float fin = __half2float(*in); - - fout += fval * fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fval = __half2float(val); - - float fout = fin1 + fval * fin2; - *out = __float2half(fout); - } - - half val; -}; - template struct TensorMulOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -134,23 +83,6 @@ struct TensorMulOp { } }; -template <> -struct TensorMulOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - fout *= fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 * fin2; - *out = __float2half(fout); - } -}; - template struct TensorPowOp { TensorPowOp(T v) : val(v) {} @@ -249,7 +181,6 @@ struct TensorCPowOp { } }; - template <> struct TensorCPowOp { __device__ __forceinline__ void operator()(double* out, double* in) { @@ -261,25 +192,6 @@ struct TensorCPowOp { } }; -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout = powf(fout, fin); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, 
half* in1, half* in2) { - // No fp16 pow function yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = powf(fin1, fin2); - *out = __float2half(fout); - } -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -336,17 +248,13 @@ struct TensorCRemainderOp { }; template <> -struct TensorCRemainderOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN); +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in != 0.f ? *out - *in * floorf(*out / *in) : NAN; } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - *out = fin2 != 0 ? __float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = *in2 != 0.f ? *in1 - *in2 * floorf(*in1 / *in2) : NAN; } }; @@ -384,13 +292,13 @@ struct TensorCFmodOp { }; template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = fmodf(*out, *in); } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = fmodf(*in1, *in2); } }; @@ -594,7 +502,6 @@ struct TensorRShiftOp { } }; - template <> struct TensorRShiftOp { __device__ __forceinline__ void @@ -671,7 +578,7 @@ template struct TensorDigammaOp { __device__ __forceinline__ void operator()(T* out, T* in) { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; static const double PI_f64 = 3.14159265358979323846; static const compute_type PSI_10 = 2.25175258906672110764; static const compute_type A[] = { @@ -731,7 +638,7 @@ struct TensorDigammaOp { template struct TensorTrigammaOp { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; __device__ __forceinline__ void operator()(T* out, T* in) { const compute_type PI = 3.14159265358979323846; diff --git a/aten/src/THC/THCTensorMode.cuh b/aten/src/THC/THCTensorMode.cuh index 0158f254a2d014..a3ed2ae8e4be93 100644 --- a/aten/src/THC/THCTensorMode.cuh +++ b/aten/src/THC/THCTensorMode.cuh @@ -7,33 +7,33 @@ struct ThrustHalfLess { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::lt(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::lt(lhs, rhs); } }; struct ThrustHalfNotEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::ne(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::ne(lhs, rhs); } }; struct ThrustHalfEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return 
THCNumerics::eq(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::eq(lhs, rhs); } }; struct ThrustHalfEqualToPredicate { - ThrustHalfEqualToPredicate(half val): val_(val) {} - __host__ __device__ inline bool operator()(half x) { - return THCNumerics::eq(val_, x); + ThrustHalfEqualToPredicate(at::Half val): val_(val) {} + __host__ __device__ inline bool operator()(at::Half x) { + return THCNumerics::eq(val_, x); } - half val_; + at::Half val_; }; template diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 8eb580169cc953..386473a430329a 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -14,7 +14,7 @@ #include #include -#define MAX_NUM_BLOCKS 200 +#define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -107,11 +107,11 @@ __device__ inline T reverse_bounds(T value) { } -__device__ inline half half_uniform_scale_and_shift(float x, double a, double b) { - half width = ScalarConvert::to(b - a); - half start = ScalarConvert::to(a); - half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); - return THCNumerics::add(scaled, start); +__device__ inline at::Half half_uniform_scale_and_shift(float x, double a, double b) { + at::Half width = ScalarConvert::to(b - a); + at::Half start = ScalarConvert::to(a); + at::Half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); + return THCNumerics::add(scaled, start); } #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ @@ -181,10 +181,10 @@ GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uni GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) -GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) -GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) -GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. / lambda * log(x))))) -GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) +GENERATE_KERNEL2(generate_uniform, at::Half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) +GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) +GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. 
/ lambda * log(x))))) +GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) #include "generic/THCTensorRandom.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh index 4f7a6b8c697913..6d1ef1b6bbfe37 100644 --- a/aten/src/THC/THCTensorTopK.cuh +++ b/aten/src/THC/THCTensorTopK.cuh @@ -113,10 +113,10 @@ struct TopKTypeConfig { }; template <> -struct TopKTypeConfig { +struct TopKTypeConfig { typedef uint32_t RadixType; - static inline __device__ RadixType convert(half v) { + static inline __device__ RadixType convert(at::Half v) { #if CUDA_VERSION >= 8000 RadixType x = __half_as_ushort(v); RadixType mask = -((x >> 15)) | 0x8000; @@ -127,16 +127,16 @@ struct TopKTypeConfig { #endif } - static inline __device__ half deconvert(RadixType v) { + static inline __device__ at::Half deconvert(RadixType v) { #if CUDA_VERSION >= 8000 RadixType mask = ((v >> 15) - 1) | 0x8000; return __ushort_as_half(v ^ mask); #else assert(false); - return ScalarConvert::to(0); + return ScalarConvert::to(0); #endif } -}; +}; // This function counts the distribution of all input values in a // slice we are selecting by radix digit at `radixDigitPos`, but only @@ -214,7 +214,7 @@ __device__ DataType findPattern(DataType* smem, BitDataType desired, BitDataType desiredMask) { #ifdef __HIP_PLATFORM_HCC__ - if (threadIdx.x < 64) { + if (threadIdx.x < 64) { #else if (threadIdx.x < 32) { #endif diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 377b363c006ba2..aecd8f01713d55 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -4,7 +4,7 @@ #include #include #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensor.hpp" #include "THCTensorInfo.cuh" #include "THCTensor.hpp" @@ -80,56 +80,4 @@ struct ScalarInv { static __host__ __device__ T to(const T v) { return ((T) 1) / v; } }; -template <> -struct ScalarNegate { - static __host__ __device__ half to(const half v) { -#ifdef __CUDA_ARCH__ - return __float2half(-__half2float(v)); -#else -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half out = v; -#else - __half_raw out = __half_raw(v); -#endif - out.x ^= 0x8000; // toggle sign bit - return out; -#endif - } -}; - -template <> -struct ScalarInv { - static __host__ __device__ half to(const half v) { -#if defined (__CUDA_ARCH_) || defined(__HIP_PLATFORM_HCC__) - return __float2half(1.0f / __half2float(v)); -#else - float fv = THC_half2float(v); - fv = 1.0f / fv; - return THC_float2half(fv); -#endif - } -}; - -inline bool operator==(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x == b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = __half_raw(b); - return araw.x == braw.x; -#endif -} - -inline bool operator!=(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x != b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = __half_raw(b); - return araw.x != braw.x; -#endif -} - #endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 0320fdfe8035a5..4919a7a2a364ab 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -72,7 +72,7 @@ IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long, int64_t) // THCudaTensor aka the 
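TopKTypeConfig<at::Half>::convert maps fp16 bit patterns to unsigned keys whose integer order matches numeric order: positive values get the sign bit set, negative values get every bit flipped, and deconvert undoes the transform. A host-side sketch of the same trick restricted to the 16 bits the radix selection actually consumes; the helper names are illustrative:

#include <cstdint>
#include <cstdio>

// Same bit trick as TopKTypeConfig<at::Half>::convert, on 16-bit keys:
// positive halfs get the sign bit set, negative halfs get every bit flipped,
// so unsigned integer order equals fp16 numeric order.
static uint16_t to_radix_key(uint16_t h) {
  uint16_t mask = (h & 0x8000) ? 0xFFFFu : 0x8000u;
  return h ^ mask;
}

// Inverse transform, matching deconvert.
static uint16_t from_radix_key(uint16_t k) {
  uint16_t mask = (k & 0x8000) ? 0x8000u : 0xFFFFu;
  return k ^ mask;
}

int main() {
  // fp16 bit patterns for -2.0, -1.0, +0.0, +1.0, +2.0
  const uint16_t bits[] = {0xC000, 0xBC00, 0x0000, 0x3C00, 0x4000};
  for (uint16_t h : bits) {
    uint16_t k = to_radix_key(h);
    std::printf("bits=0x%04X key=0x%04X roundtrip=0x%04X\n",
                (unsigned)h, (unsigned)k, (unsigned)from_radix_key(k));
  }
  return 0;  // keys come out strictly increasing, round trips match the input
}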
non-existent THCudaFloatTensor IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, , float) IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double, double) -IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, half) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, at::Half) #undef IMPLEMENT_THC_CUDA_TENSOR_COPY diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index a37645de394de8..45364c7574cad4 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -27,11 +27,10 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) THCTensor_(data)(state, self), 1, THCTensor_(data)(state, src), 1); #elif defined(THC_REAL_IS_HALF) - accreal result = ScalarConvert::to( - THCudaBlas_Hdot(state, + accreal result = THCudaBlas_Hdot(state, THCTensor_(nElement)(state, self), THCTensor_(data)(state, self), 1, - THCTensor_(data)(state, src), 1)); + THCTensor_(data)(state, src), 1); #endif THCTensor_(free)(state, src); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 40d6bdb6382983..06720b358df985 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -43,23 +43,13 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THC_API void THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(add)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(add)(state, self_, src_, value * alpha); -#endif } THC_API void THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(sub)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(sub)(state, self_, src_, value * alpha); -#endif } THC_API void diff --git a/aten/src/THCUNN/Abs.cu b/aten/src/THCUNN/Abs.cu index 72b7ff3c2f53d9..98542eda7e8a75 100644 --- a/aten/src/THCUNN/Abs.cu +++ b/aten/src/THCUNN/Abs.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu index cb0f47510bc559..30aa975594a160 100644 --- a/aten/src/THCUNN/AbsCriterion.cu +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu index 3624588015c8a1..e458bb81c9725e 100644 --- a/aten/src/THCUNN/BCECriterion.cu +++ b/aten/src/THCUNN/BCECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BatchNormalization.cu b/aten/src/THCUNN/BatchNormalization.cu index 0c393c3a9d6db0..97579d1c4aefd5 100644 --- a/aten/src/THCUNN/BatchNormalization.cu +++ b/aten/src/THCUNN/BatchNormalization.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/ClassNLLCriterion.cu b/aten/src/THCUNN/ClassNLLCriterion.cu index 
1043454ff1528b..dd430e9b88d120 100644 --- a/aten/src/THCUNN/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/ClassNLLCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" diff --git a/aten/src/THCUNN/Col2Im.cu b/aten/src/THCUNN/Col2Im.cu index d7fd995de4b88b..73eca7ff16ad30 100644 --- a/aten/src/THCUNN/Col2Im.cu +++ b/aten/src/THCUNN/Col2Im.cu @@ -4,7 +4,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/Col2Im.cu" diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu index e4e85b71045f8e..bd26c0c003bb7e 100644 --- a/aten/src/THCUNN/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index 9c4c2ea1fdc8b6..ec441645e8d7d5 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/GatedLinearUnit.cu b/aten/src/THCUNN/GatedLinearUnit.cu index aba9f1e794e308..376fdb3855eb23 100644 --- a/aten/src/THCUNN/GatedLinearUnit.cu +++ b/aten/src/THCUNN/GatedLinearUnit.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" diff --git a/aten/src/THCUNN/HardTanh.cu b/aten/src/THCUNN/HardTanh.cu index 539b22fec5a515..8d6a7953975f75 100644 --- a/aten/src/THCUNN/HardTanh.cu +++ b/aten/src/THCUNN/HardTanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Im2Col.cu b/aten/src/THCUNN/Im2Col.cu index 95bdcd4e8b9cd7..252c488df33f00 100644 --- a/aten/src/THCUNN/Im2Col.cu +++ b/aten/src/THCUNN/Im2Col.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/IndexLinear.cu b/aten/src/THCUNN/IndexLinear.cu index eebb7efc153d88..032e8e31d2cffb 100644 --- a/aten/src/THCUNN/IndexLinear.cu +++ b/aten/src/THCUNN/IndexLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/L1Cost.cu b/aten/src/THCUNN/L1Cost.cu index eda58c18e2c6e4..4f11a94f8106c3 100644 --- a/aten/src/THCUNN/L1Cost.cu +++ b/aten/src/THCUNN/L1Cost.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LeakyReLU.cu b/aten/src/THCUNN/LeakyReLU.cu index ec9efb836c3441..dec13dfd112dcc 100644 --- a/aten/src/THCUNN/LeakyReLU.cu +++ b/aten/src/THCUNN/LeakyReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LogSigmoid.cu b/aten/src/THCUNN/LogSigmoid.cu index 59ee39a6871bf7..e318fcea4f92f5 100644 --- a/aten/src/THCUNN/LogSigmoid.cu +++ 
b/aten/src/THCUNN/LogSigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu index 59aa7e8f4612ba..05eb432871ed34 100644 --- a/aten/src/THCUNN/LookupTable.cu +++ b/aten/src/THCUNN/LookupTable.cu @@ -2,7 +2,7 @@ #include "common.h" #include "THCThrustAllocator.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" #include "../THC/THCTensorMathReduce.cuh" diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu index c2ba9f52081972..c579b93bcb2331 100644 --- a/aten/src/THCUNN/LookupTableBag.cu +++ b/aten/src/THCUNN/LookupTableBag.cu @@ -11,7 +11,7 @@ #include #endif #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu index e9571fe06c4e30..99b287baba38a9 100644 --- a/aten/src/THCUNN/MSECriterion.cu +++ b/aten/src/THCUNN/MSECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/MarginCriterion.cu b/aten/src/THCUNN/MarginCriterion.cu index 7ccdbb725fe6b1..459d62a4e57a81 100644 --- a/aten/src/THCUNN/MarginCriterion.cu +++ b/aten/src/THCUNN/MarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/MultiLabelMarginCriterion.cu index 13b432c15c38c6..220de837d2349e 100644 --- a/aten/src/THCUNN/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/MultiLabelMarginCriterion.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu index c2fa2134626101..de802c82979985 100644 --- a/aten/src/THCUNN/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/MultiMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/PReLU.cu b/aten/src/THCUNN/PReLU.cu index cdc6b2b71a1ee4..c89152487d2f3f 100644 --- a/aten/src/THCUNN/PReLU.cu +++ b/aten/src/THCUNN/PReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "THCTensor.hpp" @@ -69,7 +69,7 @@ struct PReLUAccGradParametersShared { __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * static_cast(*input <= 0); } }; @@ -84,7 +84,7 @@ struct PReLUAccGradParameters __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; @@ -99,7 +99,7 @@ struct PReLUAccGradParameters1to1 __device__ __forceinline__ void operator()(T *gradWeight, T *input, T 
*gradOutput) { - *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); + *gradWeight += (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu index 388627791a012f..d044fadadfb0d1 100644 --- a/aten/src/THCUNN/RReLU.cu +++ b/aten/src/THCUNN/RReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" @@ -15,8 +15,8 @@ template inline T __device__ curand_uniform_type(curandStateMtgp32 *state); template <> -inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { - return ScalarConvert::to(curand_uniform(state)); +inline THHalf __device__ curand_uniform_type(curandStateMtgp32 *state) { + return ScalarConvert::to(curand_uniform(state)); } template <> diff --git a/aten/src/THCUNN/Sigmoid.cu b/aten/src/THCUNN/Sigmoid.cu index 0be57d6a621dfc..6ade198fe02af1 100644 --- a/aten/src/THCUNN/Sigmoid.cu +++ b/aten/src/THCUNN/Sigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu index c8018d997365dc..48a86e3efec996 100644 --- a/aten/src/THCUNN/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu index ee53e76dca2625..63b0ef3cc7ca7e 100644 --- a/aten/src/THCUNN/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftPlus.cu b/aten/src/THCUNN/SoftPlus.cu index 42b2c3c5ef0617..a8a36c22fa918a 100644 --- a/aten/src/THCUNN/SoftPlus.cu +++ b/aten/src/THCUNN/SoftPlus.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SoftShrink.cu b/aten/src/THCUNN/SoftShrink.cu index a4e45d87c6b3a3..5ec104155ea737 100644 --- a/aten/src/THCUNN/SoftShrink.cu +++ b/aten/src/THCUNN/SoftShrink.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu index 2bc17a75a76f7e..08f72046341ca7 100644 --- a/aten/src/THCUNN/SparseLinear.cu +++ b/aten/src/THCUNN/SparseLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu index 2c671dad5a8364..ff68ab8440f757 100644 --- a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu index 592e6fd8b08154..591dd012fdd521 100644 --- 
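In the PReLU functors, *input <= 0 yields a bool, and the added static_cast<T>(...) turns that 0/1 mask into an explicit operand of the compute type before the half-precision multiply, presumably to avoid ambiguous mixed-type overloads on at::Half. The branchless-mask idiom itself, in plain C++:

#include <cstdio>

// Branchless mask, as in the PReLU backward functors: the boolean comparison is
// converted to the compute type and multiplied in, instead of branching.
template <typename T>
T prelu_weight_grad(T input, T grad_output, T scale) {
  return input * grad_output * scale * static_cast<T>(input <= T(0));
}

int main() {
  std::printf("%f %f\n",
              prelu_weight_grad(-2.0f, 0.5f, 1.0f),   // -1.0: negative inputs contribute
              prelu_weight_grad( 2.0f, 0.5f, 1.0f));  //  0.0: positive inputs are masked out
  return 0;
}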
a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAveragePooling.cu b/aten/src/THCUNN/SpatialAveragePooling.cu index ce9941a62398c7..d07255a954ee77 100644 --- a/aten/src/THCUNN/SpatialAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/SpatialClassNLLCriterion.cu index 83addd09a2769b..92262354b3b696 100644 --- a/aten/src/THCUNN/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/SpatialClassNLLCriterion.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialConvolutionLocal.cu b/aten/src/THCUNN/SpatialConvolutionLocal.cu index 17801d52b12688..0af7685432991b 100644 --- a/aten/src/THCUNN/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/SpatialConvolutionLocal.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu index 4a59acb2975188..d9f6f128efe81b 100644 --- a/aten/src/THCUNN/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/SpatialConvolutionMM.cu @@ -3,7 +3,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialConvolutionMM.cu" diff --git a/aten/src/THCUNN/SpatialCrossMapLRN.cu b/aten/src/THCUNN/SpatialCrossMapLRN.cu index cd6f081b1302df..2262ddba4743ed 100644 --- a/aten/src/THCUNN/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/SpatialCrossMapLRN.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedConvolution.cu b/aten/src/THCUNN/SpatialDilatedConvolution.cu index b8e96024fdaa87..b0edadb2b357fc 100644 --- a/aten/src/THCUNN/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu index 6732e4f2b53409..3aef4ecf524cce 100644 --- a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu +++ b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCNumerics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu index f3ca162453107a..71ddce8c84a995 100644 --- a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu +++ b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" 
-#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialFullConvolution.cu b/aten/src/THCUNN/SpatialFullConvolution.cu index 4e37ecf280bbf5..b4eff4b9c22d1a 100644 --- a/aten/src/THCUNN/SpatialFullConvolution.cu +++ b/aten/src/THCUNN/SpatialFullConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullConvolution.cu" diff --git a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu index 61e1fe5910ad18..9ba2236cea66cd 100644 --- a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "im2col.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/SpatialReflectionPadding.cu b/aten/src/THCUNN/SpatialReflectionPadding.cu index 96472eed08b839..0b0643c8e41342 100644 --- a/aten/src/THCUNN/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/SpatialReflectionPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialReplicationPadding.cu b/aten/src/THCUNN/SpatialReplicationPadding.cu index f63c2090d5fb14..1ee5c62f6e7132 100644 --- a/aten/src/THCUNN/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/SpatialReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialSubSampling.cu b/aten/src/THCUNN/SpatialSubSampling.cu index bb0484662254fd..8e8e390c136388 100644 --- a/aten/src/THCUNN/SpatialSubSampling.cu +++ b/aten/src/THCUNN/SpatialSubSampling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu index 07daa0e9fec01d..b093ee287edce0 100644 --- a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu +++ b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu index 889d64e1817e1c..8b0784cfd75351 100644 --- a/aten/src/THCUNN/SpatialUpSamplingNearest.cu +++ b/aten/src/THCUNN/SpatialUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Sqrt.cu b/aten/src/THCUNN/Sqrt.cu index a52ce34117aaf0..7358f8c6bd0bc7 100644 --- a/aten/src/THCUNN/Sqrt.cu +++ b/aten/src/THCUNN/Sqrt.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Square.cu b/aten/src/THCUNN/Square.cu index 66bbec49d29cd9..f44fbfe7ea4fc4 100644 --- a/aten/src/THCUNN/Square.cu 
+++ b/aten/src/THCUNN/Square.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh index 5f8fda89909552..fe8a8bbc3cdc1a 100644 --- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -1,23 +1,23 @@ #ifndef THC_HALF_AUTO_NUMERICS_INC #define THC_HALF_AUTO_NUMERICS_INC -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" -// WARNING: THCNumerics is being deprecated. Read the comments and function usage +// WARNING: THCNumerics is being deprecated. Read the comments and function usage // in THCNumerics to learn about the deprecation -// +// // Half numerics functions defined as free functions, so cunn code can be -//written generically, i.e. without excessive calling of THCNumerics functions. +// written generically, i.e. without excessive calling of THCNumerics functions. // these functions should move to THCNumerics -inline __host__ __device__ half fmaxType(half x, half y) { - return THCNumerics::ge(x, y) ? x : y; +inline __host__ __device__ THHalf fmaxType(THHalf x, THHalf y) { + return THCNumerics::ge(x, y) ? x : y; } -inline __host__ __device__ float fmaxType(float x, half y) { - return fmaxf(x, ScalarConvert::to(y)); +inline __host__ __device__ float fmaxType(float x, THHalf y) { + return fmaxf(x, ScalarConvert::to(y)); } inline __host__ __device__ float fmaxType(float x, float y) { @@ -31,217 +31,40 @@ inline __host__ __device__ double fmaxType(double x, double y) { // arithmetic functions -inline __host__ __device__ half operator+(half a, half b) { - return THCNumerics::add(a, b); +inline __host__ __device__ THHalf abs(THHalf a) { + return THCNumerics::abs(a); } -inline __host__ __device__ float operator+(half a, float b) { - return ScalarConvert::to(a) + b; +inline __host__ __device__ THHalf exp(THHalf a) { + return THCNumerics::exp(a); } -inline __host__ __device__ float operator+(float a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log10(THHalf a) { + return THCNumerics::log10(a); } -inline __host__ __device__ double operator+(double a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log1p(THHalf a) { + return THCNumerics::log1p(a); } -inline __host__ __device__ half operator-(half a) { - return THCNumerics::neg(a); +inline __host__ __device__ THHalf log2(THHalf a) { + return THCNumerics::log2(a); } -inline __host__ __device__ half operator-(half a, half b) { - return THCNumerics::add(a, THCNumerics::neg(b)); +inline __host__ __device__ THHalf expm1(THHalf a) { + return THCNumerics::expm1(a); } -inline __host__ __device__ half operator-(half a, int b) { - return THCNumerics::add(a, THCNumerics::neg(ScalarConvert::to(b))); +inline __host__ __device__ THHalf pow(THHalf a, THHalf b) { + return THCNumerics::pow(a, b); } -inline __host__ __device__ float operator-(half a, float b) { - return ScalarConvert::to(a) - b; +inline __host__ __device__ THHalf sqrt(THHalf a) { + return THCNumerics::sqrt(a); } -inline __host__ __device__ double operator-(half a, double b) { - return ScalarConvert::to(a) - b; -} - -inline __host__ __device__ half operator-(int a, half b) { - return THCNumerics::add(ScalarConvert::to(a), THCNumerics::neg(b)); -} - -inline __host__ __device__ float operator-(float a, half b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ double operator-(double a, half 
b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ half operator*(half a, half b) { - return THCNumerics::mul(a, b); -} - -inline __host__ __device__ float operator*(half a, float b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ double operator*(half a, double b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ half operator*(half a, int b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ float operator*(float a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ double operator*(double a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(half a, half b) { - return THCNumerics::div(a, b); -} - -inline __host__ __device__ float operator/(float a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ double operator/(double a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(int a, half b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ float operator/(half a, float b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ double operator/(half a, double b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ half operator/(half a, int b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} -inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} - -inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { - lhs = lhs - rhs; - return lhs; -} - -inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { - lhs = lhs * rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half abs(half a) { - return THCNumerics::abs(a); -} - -inline __host__ __device__ half exp(half a) { - return THCNumerics::exp(a); -} - -inline __host__ __device__ half log10(half a) { - return THCNumerics::log10(a); -} - -inline __host__ __device__ half log1p(half a) { - return THCNumerics::log1p(a); -} - -inline __host__ __device__ half log2(half a) { - return THCNumerics::log2(a); -} - -inline __host__ __device__ half expm1(half a) { - return THCNumerics::expm1(a); -} - -inline __host__ __device__ half pow(half a, half b) { - return THCNumerics::pow(a, b); -} - -inline __host__ __device__ half sqrt(half a) { - return THCNumerics::sqrt(a); -} - -inline __host__ __device__ half tanh(half a) { - return THCNumerics::tanh(a); -} - -#if defined(_MSC_VER) && CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -inline __host__ __device__ half operator+(half a, int b) { - return THCNumerics::add(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ double operator+(half a, double b) { - return ScalarConvert::to(a) + b; -} - -inline __host__ __device__ half operator*(half a, bool b) { - return THCNumerics::mul(a, ScalarConvert::to(b)); -} -#endif - -// comparison functions - -inline __host__ __device__ bool operator<(half a, half b) { - return THCNumerics::lt(a, b); -} - -inline __host__ __device__ bool operator<=(half a, half b) { - return THCNumerics::le(a, b); -} - -inline __host__ __device__ bool operator<=(half a, int b) { - return 
THCNumerics::le(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator<(half a, int b) { - return THCNumerics::lt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>(half a, half b) { - return THCNumerics::gt(a, b); -} - -inline __host__ __device__ bool operator>(half a, int b) { - return THCNumerics::gt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>=(half a, half b) { - return THCNumerics::ge(a, b); -} - -inline __host__ __device__ bool operator>=(half a, int b) { - return THCNumerics::ge(a, ScalarConvert::to(b)); +inline __host__ __device__ THHalf tanh(THHalf a) { + return THCNumerics::tanh(a); } #endif diff --git a/aten/src/THCUNN/Tanh.cu b/aten/src/THCUNN/Tanh.cu index a19cc71c045dd2..676da711f33443 100644 --- a/aten/src/THCUNN/Tanh.cu +++ b/aten/src/THCUNN/Tanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/TemporalConvolution.cu b/aten/src/THCUNN/TemporalConvolution.cu index af12169d7a4ce8..847b82449ac60f 100644 --- a/aten/src/THCUNN/TemporalConvolution.cu +++ b/aten/src/THCUNN/TemporalConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalMaxPooling.cu b/aten/src/THCUNN/TemporalMaxPooling.cu index 2508f835177b25..de478339d8f984 100644 --- a/aten/src/THCUNN/TemporalMaxPooling.cu +++ b/aten/src/THCUNN/TemporalMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalReflectionPadding.cu b/aten/src/THCUNN/TemporalReflectionPadding.cu index 4dd4da84c0a2d6..9e905f914653ca 100644 --- a/aten/src/THCUNN/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/TemporalReflectionPadding.cu @@ -8,7 +8,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalReplicationPadding.cu b/aten/src/THCUNN/TemporalReplicationPadding.cu index 2c812bda8d64f5..3f74d1e62473b2 100644 --- a/aten/src/THCUNN/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/TemporalReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalRowConvolution.cu b/aten/src/THCUNN/TemporalRowConvolution.cu index 745fef807510d8..097c78fde6f81b 100644 --- a/aten/src/THCUNN/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/TemporalRowConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "row2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu index 89b0c37b1fa78d..2aaf4bcf4435e4 100644 --- a/aten/src/THCUNN/TemporalUpSamplingLinear.cu +++ b/aten/src/THCUNN/TemporalUpSamplingLinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git 
a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu index c87129da7e1563..225e319423e8ad 100644 --- a/aten/src/THCUNN/TemporalUpSamplingNearest.cu +++ b/aten/src/THCUNN/TemporalUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Threshold.cu b/aten/src/THCUNN/Threshold.cu index 37290894103919..1d44e442e21277 100644 --- a/aten/src/THCUNN/Threshold.cu +++ b/aten/src/THCUNN/Threshold.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu index 84e2c7f7063c3a..89ecfe09bfaf94 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu index 6d542ba39037ee..3e631b2755e9bd 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/VolumetricAveragePooling.cu b/aten/src/THCUNN/VolumetricAveragePooling.cu index 110eac44dcb997..66e89d2a950b72 100644 --- a/aten/src/THCUNN/VolumetricAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAveragePooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricConvolution.cu b/aten/src/THCUNN/VolumetricConvolution.cu index da66140fb61537..2e405f3c3b00c2 100644 --- a/aten/src/THCUNN/VolumetricConvolution.cu +++ b/aten/src/THCUNN/VolumetricConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "THCTensor.hpp" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" // Kernel for fast unfold+copy diff --git a/aten/src/THCUNN/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/VolumetricDilatedConvolution.cu index 8a32c70b6701ad..09fc83b01a54f2 100644 --- a/aten/src/THCUNN/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu index 1a0f2f617d343e..bff981f73d3a3a 100644 --- a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu 
b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu index e6260ceabbe282..4875ae9f7da07a 100644 --- a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFullConvolution.cu b/aten/src/THCUNN/VolumetricFullConvolution.cu index 556b5bc1d4a5ac..6c4ace126e04e7 100644 --- a/aten/src/THCUNN/VolumetricFullConvolution.cu +++ b/aten/src/THCUNN/VolumetricFullConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu index c5c7196bac899e..d315ace7582440 100644 --- a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/VolumetricMaxUnpooling.cu index eac3b2d17af5e4..0974ebc763cd8d 100644 --- a/aten/src/THCUNN/VolumetricMaxUnpooling.cu +++ b/aten/src/THCUNN/VolumetricMaxUnpooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricReplicationPadding.cu b/aten/src/THCUNN/VolumetricReplicationPadding.cu index 27ea3ecad3faa2..e9ff31de27240b 100644 --- a/aten/src/THCUNN/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/VolumetricReplicationPadding.cu @@ -5,7 +5,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu index babbd58b0d4a0f..2f06bdaa78ca0b 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu index 0f353b91acb7ea..ea4c50433370fe 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index 74eb2669300241..44dc23ade9cd8d 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -150,13 +150,6 @@ "gamma_grad_cuda_kernel", ] }, - { - "path": 
"aten/src/THC/THCGeneral.cpp", - "functions": [ - "THC_float2half", - "THC_half2float" - ] - }, { "path": "aten/src/THCUNN/generic/SparseLinear.cu", "functions": [ diff --git a/torch/csrc/cuda/THCP.h b/torch/csrc/cuda/THCP.h index bd3b2d1c5580cd..8ed6f6bf91ce2c 100644 --- a/torch/csrc/cuda/THCP.h +++ b/torch/csrc/cuda/THCP.h @@ -4,7 +4,7 @@ #include "torch/csrc/python_headers.h" #include #include -#include +#include #include #include "torch/csrc/THP.h" diff --git a/torch/csrc/generic/utils.h b/torch/csrc/generic/utils.h index ade810564fd2ae..bd7c2fb1050bff 100644 --- a/torch/csrc/generic/utils.h +++ b/torch/csrc/generic/utils.h @@ -15,7 +15,7 @@ typedef class THPPointer THWStoragePtr; typedef class THPPointer THWTensorPtr; typedef class THPPointer THPStoragePtr; -#if (!defined(THC_GENERIC_FILE) || defined(THC_REAL_IS_HALF)) && \ +#if (!defined(THC_GENERIC_FILE)) && \ (!defined(THD_GENERIC_FILE)) template<> struct THPUtils_typeTraits { diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h index 60b9b8b05352d9..1d1d0188940216 100644 --- a/torch/csrc/utils.h +++ b/torch/csrc/utils.h @@ -74,13 +74,8 @@ #define THPFloatUtils_unpackAccreal(object) (double)THPUtils_unpackReal_FLOAT(object) #define THPFloatUtils_newAccreal(value) THPUtils_newReal_FLOAT(value) #define THPHalfUtils_checkReal(object) THPUtils_checkReal_FLOAT(object) -#ifndef THP_HOST_HALF -#define THPHalfUtils_unpackReal(object) (half)THC_float2half(THPUtils_unpackReal_FLOAT(object)) -#define THPHalfUtils_newReal(value) PyFloat_FromDouble(THC_half2float(value)) -#else -#define THPHalfUtils_unpackReal(object) TH_float2half(THPUtils_unpackReal_FLOAT(object)) -#define THPHalfUtils_newReal(value) PyFloat_FromDouble(TH_half2float(value)) -#endif +#define THPHalfUtils_unpackReal(object) (at::Half)THPUtils_unpackReal_FLOAT(object) +#define THPHalfUtils_newReal(value) PyFloat_FromDouble(value) #define THPHalfUtils_checkAccreal(object) THPUtils_checkReal_FLOAT(object) #define THPHalfUtils_unpackAccreal(object) (double)THPUtils_unpackReal_FLOAT(object) #define THPHalfUtils_newAccreal(value) THPUtils_newReal_FLOAT(value) From 5da0b31bee56038d9a4c197a7b05536d8bb3dd44 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 12 Sep 2018 17:36:35 -0700 Subject: [PATCH 106/237] More native docs on TensorOptions. (#11558) Summary: Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11558 Differential Revision: D9783655 Pulled By: ezyang fbshipit-source-id: 17c749c9ef99fd9dfd0ff365ebfe22102fb891d7 --- aten/src/ATen/native/README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 9b06a513a14cd5..df937d6464b487 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -74,10 +74,19 @@ signature. - `std::array` (where N is `1-4`). NB: you MUST NOT put a space after the comma, otherwise this argument will not parse correctly. (If you decide to fix this, make sure you fix the argument parser both in ATen and in PyTorch.) +- `TensorOptions`. Tensor options provide information about how a + tensor should be constructed; it is most useful when you are writing a + factory function, where you have no `Tensor` inputs and thus + cannot otherwise determine how to construct a `Tensor`. 
- `*` is a special sentinel argument, which doesn't translate into an actual argument, but indicates that in the Python bindings, any subsequent arguments must be specified as keyword arguments (and cannot be provided positionally). +Functions with no tensor inputs are called *factory functions*, and +are handled specially by code generation. If your function is behaving +differently than another example, check first and see if one is a +factory while another is not. + **Return types.** These types are permissible as ReturnType: - `Tensor` and `TensorList`, which translate into the C++ types `Tensor` and `std::vector`, @@ -218,8 +227,9 @@ direct consequences on valid implementations: * Never create a `Tensor` directly (e.g., `at::CPU` or `at::CUDA`), as a caller will be expecting to get `Variable`s out if it passes `Variable`. - Instead, create tensors from the `type()` of one of the input tensors, e.g., - `input.type().tensor()` or `input.type().toScalarType(kByte)` if you need + Instead, create tensors using the `options()` of one of the input + tensors. E.g., `at::empty(sizes, input.options())` or + `at::ones(input.options().dtype(kByte))`, if you need a different scalar type. * If you need to call other ATen functions, be sure to qualify the call From 0a6931cfee93a4c70d17980786337799ed5d56ee Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Wed, 12 Sep 2018 18:23:57 -0700 Subject: [PATCH 107/237] Only reference ONNX through onnx_pb.h (#11609) Summary: I think this is needed to land https://github.com/onnx/onnx/pull/1407 without CI errors. cc mingzhe09088 houseroad Pull Request resolved: https://github.com/pytorch/pytorch/pull/11609 Reviewed By: houseroad Differential Revision: D9803490 Pulled By: orionr fbshipit-source-id: 26193f38ab0a2eef9ad7d0da9a0310dc40ef0f2d --- torch/csrc/jit/export.cpp | 2 +- torch/csrc/jit/import.cpp | 2 +- torch/csrc/onnx/init.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index c4ef14f191f0ee..0d304a91583e8d 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -1,7 +1,7 @@ #include "torch/csrc/jit/export.h" #include "torch/csrc/jit/serialization.h" #include "torch/csrc/autograd/symbolic.h" -#include "onnx/onnx.pb.h" +#include "onnx/onnx_pb.h" #include "torch/csrc/onnx/onnx.h" #include "torch/csrc/utils/functional.h" diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index d2f828a2517ef8..0ca9f79e6f50dc 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -1,6 +1,6 @@ #include "torch/csrc/jit/import.h" #include "torch/csrc/jit/serialization.h" -#include "onnx/onnx.pb.h" +#include "onnx/onnx_pb.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/jit/assertions.h" diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index ea78e83ad01b36..fe40b60ce674f7 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -1,6 +1,6 @@ #include "torch/csrc/onnx/init.h" #include "torch/csrc/onnx/onnx.h" -#include "onnx/onnx.pb.h" +#include "onnx/onnx_pb.h" namespace torch { namespace onnx { void initONNXBindings(PyObject* module) { From 17637f2b03b2dc9a64fb31ff9aafb2503bb05305 Mon Sep 17 00:00:00 2001 From: Guan Pang Date: Wed, 12 Sep 2018 18:39:15 -0700 Subject: [PATCH 108/237] enable_mkl support for resnet18+lstm model Summary: * Many op in lstm part of the model don't have implementation in ideep/mkl, and it doesn't make sense to copy back and forth for the 
few available ops because majority of RNN will be on CPU * Thus the strategy is to enable mkl only for the resnet18 part of the model, then switch to default cpu engine for the lstm part * The net may contain some external_inputs falsely added during ONNX->Caffe2. Canary in service shows their existence could leads to service crash (presumably due to these blob somehow get shared between threads). They're now manually removed which seem to be enough to avoid the crash. Reviewed By: viswanathgs Differential Revision: D8888763 fbshipit-source-id: da7761bcb7d876ff7bbb6640ae4b24712c0b1de6 --- caffe2/python/mkl/rewrite_graph.py | 125 +++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index ae8b6cd889ef38..146d5eb53cebe9 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -86,6 +86,131 @@ def mkl_tmp(name): fix_BoxWithNMSLimit(net) +def rewrite_run_net_simple_xrayocr_lstm(net, ideep=True): + # For xrayocr model with lstm, only rewrite the non-lstm part of the net to + # enable mkl, then copy the temporary output blob at the break point + # and all external inputs for lstm part to cpu, and execuate rest of the net + # (two lstm) on cpu + # This only works for the xrayocr lstm model which uses the first 'Shape' op + # to decide the break point, and after two lstm it's external_output + # directly so there's no need to copy back to ideep/mkl + + def mkl_tmp(name): + return "{}__MKL__".format(name) + + def cpu_tmp(name): + return "{}__CPU__".format(name) + + input_blob = net.external_input[0] + if input_blob != net.op[0].input[0]: + raise Exception( + "Input blob: {} is not consumed by first op: {}".format( + input_blob, net.op[0])) + # Modify input/outputs to point to copied MKL blobs. 
+ from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL" + to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU" + copy_input_op = core.CreateOperator( + from_cpu, input_blob, mkl_tmp(input_blob)) + net.op[0].input[0] = mkl_tmp(input_blob) + + # the net may contain some external_inputs falsely added during ONNX->Caffe2 + # This should be taken care of in early steps during pytorch_to_caffe2, + # but if not it can cause issue in follow up steps, so check here to confirm + for input_blob in net.external_input: + for op in net.op: + # look for if the external_input blob is output of any op in the net + assert input_blob not in op.output + + external_output = None + external_inputs_to_cpu = set() + find_first_shape_op = False + cpu_op_start_idx = -1 + for op_idx, op in enumerate(net.op): + # the first Shape op mark the starting point of LSTM chunk of the net + if not find_first_shape_op: + if op.type == 'Shape': + external_output = op.input + find_first_shape_op = True + cpu_op_start_idx = op_idx + else: + # any external input in the LSTM part need to be copied to CPU + for in_blob in op.input: + if in_blob in net.external_input: + external_inputs_to_cpu.add(in_blob) + + # make sure we found the expected break point of the net + assert external_output is not None + + # create op to copy external input blobs used in LSTM part from IDEEP to CPU + copy_extra_input_ops = [] + for in_blob in external_inputs_to_cpu: + copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob, + cpu_tmp(in_blob))) + # rename input blobs in LSTM part to use the CPU copy + for op in net.op[cpu_op_start_idx:]: + renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob) + for blob in op.input] + op.input[:] = renamed_input + + copy_output_ops = [ + core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) + for output_blob in external_output] + + for output_blob in external_output: + last_producer_idx = last_producer(net.op, output_blob) + renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob) + for blob in net.op[last_producer_idx].output] + net.op[last_producer_idx].output[:] = renamed_outputs + + # rearrange all ops in correct order + ops = [copy_input_op] + net.op[:cpu_op_start_idx] \ + + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:] + del net.op[:] + net.op.extend(ops) + + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN + for op in net.op: + # the first Shape op mark the starting point of LSTM chunk of the net + if op.type == 'Shape': + # all LSTM ops should run on CPU + device = caffe2_pb2.CPU + op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + op.engine = "" + + # RecurrentNetwork has a nested step_net that needs special treatment + if op.type == 'RecurrentNetwork': + for arg in op.arg: + if arg.name == 'step_net': + for nested_op in arg.n.op: + # set device to CPU + nested_op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + nested_op.engine = "" + + # rename inputs in op of nested net + renamed_input = [] + for blob in nested_op.input: + renamed_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + nested_op.input[:] = renamed_input + + # rename external inputs of nested net + new_external_input = [] + for blob in arg.n.external_input: + new_external_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + arg.n.external_input[:] = new_external_input + + if ideep: + # Temporarily disbale conv+relu fusion until we verify further + # net.ParseFromString( + # 
C.transform_optimizeForIDEEP(net.SerializeToString())) + fix_BoxWithNMSLimit(net) + + def rewrite_model_helper_simple(model, ideep=True): model = copy.deepcopy(model) # All parameter initialization should run on MKL From 6f05b5ee5464d8c3386a8d1a059c327a6aaf5a59 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 12 Sep 2018 21:47:19 -0700 Subject: [PATCH 109/237] Pin Sphinx to 1.7.9 (#11620) Summary: Sphinx 1.8.0 breaks us. Upgrading is tracked in #11618. Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11620 Differential Revision: D9806440 Pulled By: ezyang fbshipit-source-id: 7a8d849c78e697a8775d00cd3a463a7bdbcddabe --- .jenkins/pytorch/build.sh | 5 ++--- docs/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3159ee1ad629e6..2dc64157c5d00d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -117,9 +117,8 @@ fi if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here - # TODO: Reenable doc build - #pip install -r requirements.txt || true - #LC_ALL=C make html + pip install -r requirements.txt || true + LC_ALL=C make html popd fi diff --git a/docs/requirements.txt b/docs/requirements.txt index 159253ca0a6e33..a8d4eb2a85d11e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -sphinx +sphinx==1.7.9 -e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme sphinxcontrib.katex From e998038bc009a872eebf29cf135f455862fb6715 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Wed, 12 Sep 2018 22:13:06 -0700 Subject: [PATCH 110/237] Use TypeMeta instead of TypeIdentifier within at::StorageImpl (#11236) Summary: Further aligns at::StorageImpl with caffe2::StorageImpl Pull Request resolved: https://github.com/pytorch/pytorch/pull/11236 Differential Revision: D9776286 Pulled By: cpuhrsch fbshipit-source-id: f2c53995fcece013b77b3a1f709ab0f9df8ab23e --- aten/src/ATen/UndefinedType.cpp | 3 +++ aten/src/ATen/UndefinedType.h | 1 + aten/src/ATen/Utils.h | 4 ++-- aten/src/ATen/core/ScalarType.h | 12 ++++++++++++ aten/src/ATen/core/Storage.cpp | 8 ++++---- aten/src/ATen/core/Storage.h | 6 +++--- aten/src/ATen/core/StorageImpl.cpp | 7 +++---- aten/src/ATen/core/StorageImpl.h | 14 +++++++------- aten/src/ATen/core/TensorImpl.cpp | 4 ++-- aten/src/ATen/core/TensorImpl.h | 6 ++++++ aten/src/ATen/core/Type.h | 1 + aten/src/ATen/templates/SparseTypeDerived.cpp | 3 +++ aten/src/ATen/templates/Type.h | 1 + aten/src/ATen/templates/TypeDefault.cpp | 8 ++++---- aten/src/ATen/templates/TypeDerived.cpp | 6 ++++++ aten/src/ATen/templates/TypeDerived.h | 1 + aten/src/TH/THStorageFunctions.cpp | 4 ++-- aten/src/TH/THStorageFunctions.hpp | 2 +- aten/src/TH/THTensor.cpp | 6 +++--- aten/src/TH/generic/THStorage.cpp | 16 ++++++++-------- aten/src/THC/THCStorage.cpp | 4 ++-- aten/src/THC/THCStorage.hpp | 2 +- aten/src/THC/THCTensor.cpp | 9 +++++---- aten/src/THC/THCTensor.hpp | 2 +- aten/src/THC/THCTensorCopy.cu | 3 +-- aten/src/THC/generic/THCStorage.cpp | 9 +++++---- aten/src/THC/generic/THCTensorCopy.cu | 4 ++-- .../complex_registration_extension.cpp | 8 +++++++- tools/autograd/templates/VariableType.cpp | 3 +++ tools/autograd/templates/VariableType.h | 1 + torch/csrc/DynamicTypes.cpp | 2 +- torch/csrc/jit/import.cpp | 2 +- 32 files changed, 103 insertions(+), 59 deletions(-) diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 
bea9baf61892f5..9c3457bdd4803f 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -8,6 +8,9 @@ UndefinedType::UndefinedType() ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } +caffe2::TypeMeta UndefinedType::typeMeta() const { + AT_ERROR("typeMeta not defined for UndefinedType"); +} Backend UndefinedType::backend() const { return Backend::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 594fb99e61dc0e..4ccd6101851a72 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -14,6 +14,7 @@ namespace at { struct UndefinedType final : public TypeDefault { explicit UndefinedType(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual Allocator* allocator() const override; virtual Device getDeviceFromPtr(void* data) const override; diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index cea2f83d3e9180..c4473d1471ab7d 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -44,12 +44,12 @@ static inline const Storage& checked_storage( name, "'"); } - if (expr.dtype() != data_type) { + if (expr.dtype().id() != data_type) { AT_ERROR( "Expected object of data type ", data_type, " but got data type ", - expr.dtype(), + expr.dtype().id(), " for argument #", pos, " '", diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index b5e1a47646d7d6..6fe88bfadb05f5 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -80,6 +80,18 @@ static inline DataType scalarTypeToDataType(ScalarType scalar_type) { #undef DEFINE_CASE } +static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { +#define DEFINE_CASE(ctype,name,_) \ + case ScalarType:: name : return caffe2::TypeMeta::Make(); + + switch(scalar_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) + case ScalarType::Undefined: return caffe2::TypeMeta(); + default: AT_ERROR("Unrecognized Scalartype ", scalar_type, " (please report this error)"); + } +#undef DEFINE_CASE +} + static inline ScalarType dataTypeToScalarType(DataType dtype) { #define DEFINE_IF(ctype,name,_) \ if (dtype == caffe2::TypeMeta::Id()) { \ diff --git a/aten/src/ATen/core/Storage.cpp b/aten/src/ATen/core/Storage.cpp index 21f3e35ada4fd2..854a68876e2e5d 100644 --- a/aten/src/ATen/core/Storage.cpp +++ b/aten/src/ATen/core/Storage.cpp @@ -3,24 +3,24 @@ namespace at { Storage::Storage( - at::ScalarType scalar_type, + caffe2::TypeMeta data_type, size_t size, Allocator* allocator, bool resizable) : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, size, allocator, resizable)) {} Storage::Storage( - at::ScalarType scalar_type, + caffe2::TypeMeta data_type, at::DataPtr data_ptr, size_t size, const std::function& deleter, bool resizable) : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, size, std::move(data_ptr), /* allocator */ nullptr, diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index 6f2a8fd68ee716..656ddb972c7959 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -11,12 +11,12 @@ struct AT_API Storage { Storage(const c10::intrusive_ptr& ptr) : storage_impl_(ptr) {} Storage(c10::intrusive_ptr&& ptr) : storage_impl_(std::move(ptr)) {} Storage( - at::ScalarType, + caffe2::TypeMeta, size_t size, Allocator* allocator, bool resizable = 
false); Storage( - at::ScalarType, + caffe2::TypeMeta, at::DataPtr, size_t size, const std::function& deleter, @@ -33,7 +33,7 @@ struct AT_API Storage { bool resizable() const { return storage_impl_->resizable(); } // get() use here is to get const-correctness void* data() const { return storage_impl_.get()->data(); } - const at::DataType dtype() const { + const caffe2::TypeMeta dtype() const { return storage_impl_->dtype(); } const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } diff --git a/aten/src/ATen/core/StorageImpl.cpp b/aten/src/ATen/core/StorageImpl.cpp index b0f82132e39cb8..9c8bdfc45d3b70 100644 --- a/aten/src/ATen/core/StorageImpl.cpp +++ b/aten/src/ATen/core/StorageImpl.cpp @@ -3,7 +3,7 @@ namespace at { StorageImpl::StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, @@ -15,15 +15,14 @@ StorageImpl::StorageImpl( allocator_(allocator) {} StorageImpl::StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::Allocator* allocator, bool resizable) : StorageImpl( data_type, numel, - allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * numel), + allocator->allocate(data_type.itemsize() * numel), allocator, resizable) {} diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index e80c11c6b0e211..0b2b635745abb2 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -15,13 +15,13 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { StorageImpl() = delete; ~StorageImpl() {}; StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable); StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::Allocator* allocator, bool resizable); @@ -34,10 +34,10 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { auto data_type_T = at::scalarTypeToDataType(at::CTypeToScalarType::to()); - if (dtype() != data_type_T) { + if (dtype().id() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", - dtype(), + dtype().id(), " as data type ", data_type_T); } @@ -56,7 +56,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { void operator=(const StorageImpl&) = delete; size_t itemsize() const { - return at::elementSize(dataTypeToScalarType(data_type_)); + return data_type_.itemsize(); } Type& type(); @@ -93,7 +93,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { at::Allocator* allocator() { return allocator_; }; - const DataType dtype() const { + const caffe2::TypeMeta dtype() const { return data_type_; } const at::Allocator* allocator() const { @@ -114,7 +114,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { } private: - at::DataType data_type_; + caffe2::TypeMeta data_type_; at::DataPtr data_ptr_; int64_t numel_; bool resizable_; diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index d5e5781b038ca5..1bf6802c7d4d8e 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -22,12 +22,12 @@ TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator * // UndefinedTensors and SparseTensors don't have storages. 
if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalar_type, 0, allocator, true); + storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); } } TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype()), is_variable) {} + : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) : storage_(std::move(storage)), diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 3f41c1163c563b..831c53e23b6d49 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -111,10 +111,16 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { return storage_.unsafe_data() + storage_offset_; } + // TODO: Remove this once we get rid of scalar_type and use dmeta or dtype + // instead. inline at::ScalarType scalar_type() const { return scalar_type_; } + inline caffe2::TypeMeta dtype() const { + return storage_.dtype(); + } + virtual int64_t storage_offset() const { return storage_offset_; } diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 6284e7634bb755..e8b2e2d40980c2 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -82,6 +82,7 @@ struct AT_API Type { virtual ~Type() {} virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; virtual Backend backend() const = 0; Layout layout() const noexcept { return layout_from_backend(backend()); } virtual bool is_cuda() const = 0; diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 4bc8265a258f70..dc419b9ad52c01 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -32,6 +32,9 @@ namespace at { ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } +caffe2::TypeMeta ${Type}::typeMeta() const { + AT_ERROR("typeMeta not defined for SparseType"); +} Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 18fb393aeca759..ef49b2880dcec7 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -53,6 +53,7 @@ struct AT_API Type { virtual ~Type() {} virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; virtual Backend backend() const = 0; Layout layout() const noexcept { return layout_from_backend(backend()); } virtual bool is_cuda() const = 0; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 530aa09fddb734..5e68470ecc0abd 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -94,20 +94,20 @@ Tensor TypeDefault::tensorWithAllocator(IntList sizes, IntList strides, Allocato } Storage TypeDefault::storage(bool resizable) const { - return Storage(scalarType(), 0, allocator(), resizable); + return Storage(typeMeta(), 0, allocator(), resizable); } Storage TypeDefault::storage(size_t size, bool resizable) const { - return Storage(scalarType(), size, allocator(), resizable); + return Storage(typeMeta(), size, allocator(), resizable); } Storage 
TypeDefault::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return Storage( - scalarType(), + typeMeta(), InefficientStdFunctionContext::makeDataPtr(data, deleter, getDeviceFromPtr(data)), size, deleter); } Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) const { - return Storage(scalarType(), size, allocator); + return Storage(typeMeta(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index bc2e12acb14f29..d012274c5fceed 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -32,9 +32,15 @@ namespace at { ${Type}::${Type}() : ${DenseBackend}TypeDefault(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} + ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } + +caffe2::TypeMeta ${Type}::typeMeta() const { + return caffe2::TypeMeta::Make<${ScalarType}>(); +} + Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index 3a48d8b26e32b4..116df9b4d465fe 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -19,6 +19,7 @@ namespace at { struct ${Type} final : public ${DenseBackend}TypeDefault { explicit ${Type}(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual const char * toString() const override; virtual size_t elementSizeInBytes() const override; diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index a5319e67dabe61..c4d1f778250ea2 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -15,9 +15,9 @@ #include "generic/THStorageCopy.cpp" #include "THGenerateHalfType.h" -THStorage* THStorage_new(at::ScalarType scalar_type) { +THStorage* THStorage_new(caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, getTHDefaultAllocator(), true).release(); diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 93a89f1753f000..95e5bacc2cef55 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -30,7 +30,7 @@ // If it is not, you must report that the storage is dead. 
// -TH_CPP_API THStorage* THStorage_new(at::ScalarType scalar_type); +TH_CPP_API THStorage* THStorage_new(caffe2::TypeMeta data_type); TH_API ptrdiff_t THStorage_size(const THStorage *self); TH_API void THStorage_retain(THStorage *storage); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 5f861457c58ef4..2d1200eebd37aa 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -39,14 +39,14 @@ void THTensor_setStorageNd(THTensor *self, THStorage *storage, ptrdiff_t storage if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); + auto data_type = THTensor_getStoragePtr(self)->dtype(); if(storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THStorage_new(scalar_type)); + THTensor_stealAndSetStoragePtr(self, THStorage_new(data_type)); } } @@ -123,7 +123,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(totalSize+self->storage_offset() > 0) { if(!THTensor_getStoragePtr(self)) { - THTensor_stealAndSetStoragePtr(self, THStorage_new(self->scalar_type())); + THTensor_stealAndSetStoragePtr(self, THStorage_new(self->dtype())); } if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 7ed962567a31ff..6dfd90cfbe1bd2 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType::to()); + return THStorage_new(caffe2::TypeMeta::Make()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -48,18 +48,18 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType::to(); + auto type_meta = caffe2::TypeMeta::Make(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + type_meta, size, THMapAllocator::makeDataPtr( - filename, flags, size * at::elementSize(scalar_type), &actual_size), + filename, flags, size * type_meta.itemsize(), &actual_size), /* allocator */ nullptr, false).release(); if (size <= 0) { - storage->set_numel(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / type_meta.itemsize()); } return storage; @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index 
3d1630adedbe5b..b6c52791eb56d6 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -55,9 +55,9 @@ int THCStorage_getDevice(THCState* state, const THCStorage* storage) { THC_API THCStorage* THCStorage_new( THCState* state, - at::ScalarType scalar_type) { + caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, state->cudaDeviceAllocator, true).release(); diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index ee683feced4bfb..3eedf4a907144f 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -16,7 +16,7 @@ struct CTypeToScalarType<__half> : public CTypeToScalarType {}; } -THC_API THCStorage* THCStorage_new(THCState* state, at::ScalarType); +THC_API THCStorage* THCStorage_new(THCState* state, caffe2::TypeMeta); THC_API void THCStorage_retain(THCState *state, THCStorage *storage); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index b35551301383f8..b5f8d391dc8a96 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -40,8 +40,9 @@ int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, return THTensor_strideLegacyNoScalars(self, dim); } -THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type) { - switch(scalar_type) { +THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta) { + auto scalar_type = at::dataTypeToScalarType(type_meta.id()); + switch (scalar_type) { case at::ScalarType::Byte: return THCudaByteTensor_new(state); case at::ScalarType::Char: @@ -189,13 +190,13 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); + auto data_type = THTensor_getStoragePtr(self)->dtype(); if (storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, scalar_type)); + THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, data_type)); } } diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index 3505354c834bb4..8fecaf0b2296fc 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -21,7 +21,7 @@ THC_API int64_t THCTensor_sizeLegacyNoScalars(THCState *state, const THCTensor * THC_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, int dim); -THC_API THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type); +THC_API THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta); THC_API void THCTensor_resize(THCState *state, THCTensor *tensor, at::IntList size, at::IntList stride); THC_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride); diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index a3a8ed2e182086..844539c4ac0b27 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -136,8 +136,7 @@ void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src) { } else { // Types are different // Copy into the new format, contiguous, on the source device - srcContig = THCTensor_new(state, - at::CTypeToScalarType::to()); + srcContig = THCTensor_new(state, 
caffe2::TypeMeta::Make()); THCTensor_resizeAs(state, srcContig, dst); bool succ = diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index 8918a449e19585..36a001059a5787 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -3,6 +3,7 @@ #else #include +#include scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self) { @@ -43,7 +44,7 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind THCStorage* THCStorage_(new)(THCState *state) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), 0, state->cudaDeviceAllocator, true).release(); @@ -53,7 +54,7 @@ THCStorage* THCStorage_(new)(THCState *state) THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, state->cudaDeviceAllocator, true).release(); @@ -64,7 +65,7 @@ THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -117,7 +118,7 @@ THCStorage* THCStorage_(newWithDataAndAllocator)( ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 4919a7a2a364ab..25ded22747441f 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -10,8 +10,8 @@ THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { template <> THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { - THCTensor* tensor = THCTensor_new( - state, at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype())); + THCTensor* tensor = + THCTensor_new(state, THTensor_getStoragePtr(self)->dtype()); THCTensor_resizeAs(state, tensor, self); THC_copyTensor(state, tensor, self); return tensor; diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index acfaccced2be09..db75e3f67f7772 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -33,6 +33,7 @@ struct CPUComplexFloatType : public at::CPUTypeDefault { /*is_undefined=*/false) {} ScalarType scalarType() const override; + caffe2::TypeMeta typeMeta() const override; Backend backend() const override; const char* toString() const override; size_t elementSizeInBytes() const override; @@ -49,7 +50,7 @@ struct CPUComplexFloatType : public at::CPUTypeDefault { numel *= s; } Storage s{c10::make_intrusive( - scalarTypeToDataType(ScalarType::ComplexFloat), + scalarTypeToTypeMeta(ScalarType::ComplexFloat), numel, getCPUAllocator(), /* resizable */ true)}; @@ -73,6 +74,10 @@ ScalarType CPUComplexFloatType::scalarType() const { return ScalarType::ComplexFloat; } +caffe2::TypeMeta CPUComplexFloatType::typeMeta() const { + return scalarTypeToTypeMeta(ScalarType::ComplexFloat); +} + Backend CPUComplexFloatType::backend() const { return Backend::CPU; } @@ -80,6 +85,7 @@ Backend CPUComplexFloatType::backend() const { const char* CPUComplexFloatType::toString() 
const { return "CPUComplexFloatType"; } + TypeID CPUComplexFloatType::ID() const { return TypeID::CPUComplexFloat; } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 589bbf8323f155..c6a84afe710a89 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -52,6 +52,9 @@ VariableType::VariableType(Context* context, Type* baseType) ScalarType VariableType::scalarType() const { return baseType->scalarType(); } +caffe2::TypeMeta VariableType::typeMeta() const { + return baseType->typeMeta(); +} Backend VariableType::backend() const { return baseType->backend(); } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index d1e847a141aace..19de1b910a0b69 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -36,6 +36,7 @@ void register_variable_type_for(at::Type* baseType); struct TORCH_API VariableType final : public at::TypeDefault { VariableType(Context* context, at::Type* baseType); at::ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; at::Backend backend() const override; at::Allocator* allocator() const override; at::Device getDeviceFromPtr(void * data) const override; diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index 5d1e991574b341..bed40cb67a2d0e 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -71,7 +71,7 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) { auto attype = at::globalContext().getNonVariableTypeOpt( deviceTypeToBackend(storage.device_type()), - at::dataTypeToScalarType(storage.dtype())); + at::dataTypeToScalarType(storage.dtype().id())); auto it = attype_to_py_storage_type.find(attype); if (it != attype_to_py_storage_type.end()) { return it->second; diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 0ca9f79e6f50dc..8e6a07da0a00ca 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -315,7 +315,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( int64_t size; std::tie(storage_ptr, size) = file_reader_.getRecordWithKey(record_number); auto storage = std::make_shared( - at::CPU(type).scalarType(), + at::CPU(type).typeMeta(), std::move(storage_ptr), size, nullptr); From 44b2b6b150e95f60b9858f28205a83d03440cba4 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 12 Sep 2018 22:42:18 -0700 Subject: [PATCH 111/237] clean up jit generated tests (#11403) Summary: Clean up some generated tests after we have newly nice features like var args. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11403 Differential Revision: D9800545 Pulled By: wanchaol fbshipit-source-id: e9973b113f78dc38cf99a81b6ede3fa3485f1cfa --- test/test_jit.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 4aa43b1c8596a5..6d97b88514ecfa 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7428,28 +7428,6 @@ def forward(self, x, y): 'test_var_dim_neg0', 'test_norm_inf', 'test_renorm_norm_inf', - 'test_split', - 'test_expand', - 'test_expand_1_element', - 'test_expand_new_dim', - 'test_expand_new_dim_front_old_front_1', - 'test_expand_scalar_to_dims', - 'test_expand_size', - 'test_permute', - 'test_permute_neg_dim', - 'test_repeat', - 'test_repeat_scalar', - 'test_repeat_single_number', - 'test_repeat_unsqueeze', - 'test_reshape_1d', - 'test_reshape_scalar_to_1d', - 'test_view', - 'test_view_1d', - 'test_view_scalar_to_1d', - 'test_split_dim', - 'test_split_dim_neg0', - 'test_gesv', - 'test_inverse', 'test_matrix_power_n=-1', # involves inverse 'test_matrix_power_n=-3', # involves inverse # skipped nn functional tests From cac11a4ac388af775c4e6aa4b31921601906a98d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Thu, 13 Sep 2018 01:12:34 -0700 Subject: [PATCH 112/237] Merge caffe2::/at::StorageImpl (#11543) Summary: Merges caffe2::StorageImpl methods with at::StorageImpl methods and defines caffe2::StorageImpl as at::StorageImpl. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11543 Differential Revision: D9795228 Pulled By: cpuhrsch fbshipit-source-id: fbd6fa3cbf6c9099a4803337286c30e00652f95c --- aten/src/ATen/core/StorageImpl.cpp | 28 ----- aten/src/ATen/core/StorageImpl.h | 132 ++++++++++++++++++++--- aten/src/THC/THCTensor.cpp | 1 - caffe2/core/storage.h | 167 +---------------------------- 4 files changed, 121 insertions(+), 207 deletions(-) diff --git a/aten/src/ATen/core/StorageImpl.cpp b/aten/src/ATen/core/StorageImpl.cpp index 9c8bdfc45d3b70..5190a7766dcb49 100644 --- a/aten/src/ATen/core/StorageImpl.cpp +++ b/aten/src/ATen/core/StorageImpl.cpp @@ -1,29 +1 @@ #include - -namespace at { - -StorageImpl::StorageImpl( - caffe2::TypeMeta data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : data_type_(data_type), - data_ptr_(std::move(data_ptr)), - numel_(numel), - resizable_(resizable), - allocator_(allocator) {} - -StorageImpl::StorageImpl( - caffe2::TypeMeta data_type, - int64_t numel, - at::Allocator* allocator, - bool resizable) - : StorageImpl( - data_type, - numel, - allocator->allocate(data_type.itemsize() * numel), - allocator, - resizable) {} - -} // namespace at diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index 0b2b635745abb2..cc63bd00906669 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -12,24 +12,65 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: - StorageImpl() = delete; - ~StorageImpl() {}; StorageImpl( caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, - bool resizable); + bool resizable) + : data_type_(data_type), + data_ptr_(std::move(data_ptr)), + numel_(numel), + resizable_(resizable), + allocator_(allocator) { + if (numel > 0) { + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "Constructing a storage with meta of unknown type and non-zero numel"); + } + } + } + StorageImpl( caffe2::TypeMeta data_type, 
int64_t numel, at::Allocator* allocator, - bool resizable); + bool resizable) + : StorageImpl( + data_type, + numel, + allocator->allocate(data_type.itemsize() * numel), + allocator, + resizable) {} + + explicit StorageImpl(at::DeviceType device_type) + : StorageImpl(device_type, caffe2::TypeMeta()) {} + + StorageImpl(at::DeviceType device_type, caffe2::TypeMeta data_type) + : StorageImpl( + data_type, + 0, + at::DataPtr(nullptr, at::Device(device_type)), + nullptr, + true) {} + + StorageImpl& operator=(StorageImpl&& other) = default; + StorageImpl& operator=(const StorageImpl&) = delete; + StorageImpl() = delete; + StorageImpl(StorageImpl&& other) = default; StorageImpl(StorageImpl&) = delete; StorageImpl(const StorageImpl&) = delete; - // NB: Don't move ref count! - StorageImpl(StorageImpl&& other) = default; - StorageImpl& operator=(StorageImpl&& other) = default; + ~StorageImpl() = default; + + void reset() { + data_ptr_.clear(); + numel_ = 0; + } + + template + inline bool IsType() const { + return data_type_.Match(); + } template inline T* data() const { @@ -53,52 +94,77 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } - void operator=(const StorageImpl&) = delete; - size_t itemsize() const { return data_type_.itemsize(); } Type& type(); + size_t capacity() const { + return numel_ * itemsize(); + } + int64_t numel() const { return numel_; }; + + // TODO: remove later void set_numel(int64_t numel) { numel_ = numel; }; + bool resizable() const { return resizable_; }; + at::DataPtr& data_ptr() { return data_ptr_; }; + const at::DataPtr& data_ptr() const { return data_ptr_; }; + // Returns the previous data_ptr at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { std::swap(data_ptr_, data_ptr); return std::move(data_ptr); }; + + // XXX: TERRIBLE! DONT USE UNLESS YOU HAVE TO! AND EVEN THEN DONT, JUST DONT! + // Setting the data_type will require you to audit many other parts of the + // struct again to make sure it's still valid. 
+ void set_dtype(const caffe2::TypeMeta& data_type) { + int64_t capacity = numel_ * data_type_.itemsize(); + data_type_ = data_type; + numel_ = capacity / data_type_.itemsize(); + } + + // TODO: Return const ptr eventually if possible void* data() { return data_ptr_.get(); - }; - const void* data() const { + } + + void* data() const { return data_ptr_.get(); - }; + } + at::DeviceType device_type() const { return data_ptr_.device().type(); } + at::Allocator* allocator() { return allocator_; - }; - const caffe2::TypeMeta dtype() const { + } + + const caffe2::TypeMeta& dtype() const { return data_type_; } + const at::Allocator* allocator() const { return allocator_; }; + // You generally shouldn't use this method, but it is occasionally // useful if you want to override how a tensor will be reallocated, // after it was already allocated (and its initial allocator was @@ -106,13 +172,51 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { void set_allocator(at::Allocator* allocator) { allocator_ = allocator; } + Device device() const { return data_ptr_.device(); } + void set_resizable(bool resizable) { resizable_ = resizable; } + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + data_type_ = data_type; + // TODO: Use CAFFE_ENFORCE_WITH_CALLER equivalent + // For now causes lots of redefine issues if caffe2/core/logging.h is used + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "To share with a raw external pointer you need to have meta " + "already set."); + } + data_ptr_ = std::move(data_ptr); + // NOTE: data_type might change and so it's also possible that capacity + // might not be divisible by itemsize. There is no way for us to keep track + // of the exact capacity if we're not explicity storing is. More conrectely + // capacity() might not return the value that was set here, if itemsize does + // not evenly divide it. 
+ numel_ = capacity / data_type_.itemsize(); + } + private: caffe2::TypeMeta data_type_; at::DataPtr data_ptr_; diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index b5f8d391dc8a96..bfef8fffb0f89d 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -191,7 +191,6 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag THError("Tensor: invalid null storage"); } auto data_type = THTensor_getStoragePtr(self)->dtype(); - if (storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 12bff9af69ad03..ce60a7e580f049 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -20,173 +20,11 @@ #include #include #include +#include namespace caffe2 { -class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { - public: - StorageImpl() = delete; - StorageImpl(const StorageImpl&) = delete; - StorageImpl& operator=(const StorageImpl&) = delete; - - // Rule of Five - StorageImpl(StorageImpl&&) = default; - ~StorageImpl() = default; - StorageImpl& operator=(StorageImpl&&) = default; - - StorageImpl( - TypeMeta data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : data_type_(data_type), - data_ptr_(std::move(data_ptr)), - numel_(numel), - resizable_(resizable), - allocator_(allocator) { - if (numel > 0) { - CAFFE_ENFORCE( - data_type.id() != TypeIdentifier::uninitialized(), - "Constructing a storage with meta of unknown type and non-zero numel"); - } - } - - explicit StorageImpl(at::DeviceType device_type) - : StorageImpl(device_type, TypeMeta()) {} - StorageImpl(at::DeviceType device_type, TypeMeta data_type) - : StorageImpl( - data_type, - 0, - at::DataPtr(nullptr, at::Device(device_type)), - nullptr, - true) {} - - void reset() { - data_ptr_.clear(); - numel_ = 0; - } - - template - inline bool IsType() const { - return data_type_.Match(); - } - - void* data() const { - return data_ptr_.get(); - } - - void* data() { - return data_ptr_.get(); - } - - at::DataPtr& data_ptr() { - return data_ptr_; - } - - const at::DataPtr& data_ptr() const { - return data_ptr_; - } - - // Returns the previous data_ptr - at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { - std::swap(data_ptr_, data_ptr); - return std::move(data_ptr); - }; - - void set_dtype(const TypeMeta& data_type) { - int64_t capacity = numel_ * data_type_.itemsize(); - data_type_ = data_type; - numel_ = capacity / data_type_.itemsize(); - } - - const TypeMeta& dtype() const { - return data_type_; - } - - const at::Allocator* allocator() const { - return allocator_; - }; - // You generally shouldn't use this method, but it is occasionally - // useful if you want to override how a tensor will be reallocated, - // after it was already allocated (and its initial allocator was - // set) - void set_allocator(at::Allocator* allocator) { - allocator_ = allocator; - } - - size_t capacity() const { - return numel_ * itemsize(); - } - - int64_t numel() const { - return numel_; - } - - // TODO: remove later - void set_numel(int64_t numel) { - numel_ = numel; - } - - at::DeviceType device_type() const { - return data_ptr_.device().type(); - } - - inline size_t itemsize() const { - return data_type_.itemsize(); - } - - bool resizable() const { - return resizable_; - }; - - void set_resizable(bool resizable) { - resizable_ = resizable; - } - - /** - * Can only be called when use_count is 1 - */ - void 
UniqueStorageShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity, - MemoryDeleter d = nullptr) { - UniqueStorageShareExternalPointer( - at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); - } - - /** - * Can only be called when use_count is 1 - */ - void UniqueStorageShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - data_type_ = data_type; - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to have meta " - "already set."); - data_ptr_ = std::move(data_ptr); - // NOTE: data_type might change and so it's also possible that capacity - // might not be divisible by itemsize. There is no way for us to keep track - // of the exact capacity if we're not explicity storing is. More conrectely - // capacity() might not return the value that was set here, if itemsize does - // not evenly divide it. - numel_ = capacity / data_type_.itemsize(); - } - - private: - TypeMeta data_type_; - at::DataPtr data_ptr_; - int64_t numel_; - bool resizable_; - at::Allocator* allocator_; - // allocator_ takes precedence over StaticContext from device_type_ - // Allocator* allocator_; - // at::DeviceType device_type_ = CPU; -}; +using StorageImpl = at::StorageImpl; class CAFFE2_API Storage { public: @@ -251,6 +89,7 @@ class CAFFE2_API Storage { const TypeMeta& dtype() const { return storage_impl_->dtype(); } + size_t capacity() const { return storage_impl_->capacity(); } From 77f6998e541238391aca98041a96bb49a533687d Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 13 Sep 2018 08:43:38 -0700 Subject: [PATCH 113/237] Guard against inputting or returning sparse tensors (#11550) Summary: Add guards against using sparse tensor by checking the conversion from IValue -> PyObject & PyObject -> IValue. This diff also changes the behavior in constant propagation to not run python ops even if all ops are constant because of possible mutation to global state. This came up in trying to run get_sparse(), and I'm including it here to make it easier to land. 
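To make the constant-propagation part concrete: a Python call inside a scripted function is emitted as a `prim::PythonOp`, and folding it at compile time would freeze any hidden side effect. A minimal sketch (the `bump` helper and its counter are invented for illustration, and this assumes the script frontend of this era, which leaves unannotated Python calls as PythonOps rather than compiling them):

```python
import torch

_calls = 0

def bump(x):
    # hypothetical Python helper with hidden global state
    global _calls
    _calls += 1
    return x + _calls

@torch.jit.script
def fn(x):
    # `bump` shows up in the graph as a prim::PythonOp; even if `x` were a
    # constant, folding the call would bake in a single value of `_calls`,
    # so constant propagation now skips PythonOps entirely.
    return bump(x)
```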
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11550 Differential Revision: D9804712 Pulled By: eellison fbshipit-source-id: 9fe7daf721c6d6e48df4925c0f9c775873bcdc77 --- test/test_jit.py | 20 ++++++++++++++++++ .../csrc/jit/passes/constant_propagation.cpp | 3 +-- torch/csrc/jit/pybind_utils.h | 21 +++++++++++++++---- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 6d97b88514ecfa..97ef28630b7836 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1571,6 +1571,26 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) + def test_sparse_tensors_error(self): + def get_sparse(): + return torch.sparse.FloatTensor(2, 3) + + @torch.jit.script + def sparse(input): + output = get_sparse() + return output, input + + with self.assertRaisesRegex(RuntimeError, "sparse tensors not supported"): + sparse(get_sparse()) + + # has a different entry point than calling sparse directly + with self.assertRaisesRegex(RuntimeError, "sparse tensors not supported"): + torch._C._jit_pass_shape_analysis( + sparse.graph, (get_sparse(),), False) + + with self.assertRaisesRegex(RuntimeError, "sparse tensors not supported"): + sparse(torch.tensor([1])) + def test_constant_prop_simple(self): @torch.jit.script def constant_prop(input_tensor): diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index e83971bfaf9dfb..c9cc1b703de100 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -15,9 +15,8 @@ namespace { std::unordered_set skip_list = { prim::If, prim::Loop, //TODO: handle Loop - //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be - //FIXME treated as having side effects but ONNX depends on them being removed prim::Print, + prim::PythonOp, //may have side effects //all the rand functions from native_functions.yaml aten::rand, aten::rand_out, diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 54414534b06215..004bc075161697 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -56,7 +56,11 @@ inline void findErrorInKwargs( inline IValue toIValue(py::handle input) { if (THPVariable_Check(input.ptr())) { - return py::cast(input); + auto ten = py::cast(input); + if (ten.is_sparse()) { + AT_ERROR("sparse tensors not supported"); + } + return ten; } else if (py::isinstance(input)) { py::tuple input_tuple = py::cast(input); Stack s; @@ -89,8 +93,13 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { switch (type->kind()) { case TypeKind::DynamicType: case TypeKind::TensorType: - case TypeKind::CompleteTensorType: - return py::cast(obj); + case TypeKind::CompleteTensorType: { + auto var = py::cast(obj); + if (var.is_sparse()) { + AT_ERROR("sparse tensors not supported"); + } + return var; + } case TypeKind::FloatType: return py::cast(obj); case TypeKind::IntType: @@ -174,7 +183,11 @@ inline py::object toPyObject(IValue&& ivalue) { if (ivalue.isNone()) { return py::none(); } else if (ivalue.isTensor()) { - return py::cast(autograd::Variable(ivalue.toTensor())); + auto tensor = std::move(ivalue).toTensor(); + if (tensor.is_sparse()) { + AT_ERROR("sparse tensors not supported"); + } + return py::cast(autograd::Variable(std::move(tensor))); } else if (ivalue.isDouble()) { return py::cast(ivalue.toDouble()); } else if (ivalue.isInt()) { From 36fc1a0a58ab20f34e3047af41b4d15c92c8e2f7 Mon Sep 
17 00:00:00 2001 From: Christian Puhrsch Date: Thu, 13 Sep 2018 09:25:27 -0700 Subject: [PATCH 114/237] Merge caffe2::/at::Storage Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11637 Reviewed By: gchanan Differential Revision: D9806425 Pulled By: ezyang fbshipit-source-id: e20ec93bff6dc7fb22ca9b7e7348d060b3876b67 --- aten/src/ATen/core/Storage.cpp | 24 ----- aten/src/ATen/core/Storage.h | 158 ++++++++++++++++++++++++++++++--- caffe2/core/storage.h | 128 +------------------------- caffe2/core/tensor.cc | 2 +- 4 files changed, 147 insertions(+), 165 deletions(-) diff --git a/aten/src/ATen/core/Storage.cpp b/aten/src/ATen/core/Storage.cpp index 854a68876e2e5d..aca4bb75d2c95b 100644 --- a/aten/src/ATen/core/Storage.cpp +++ b/aten/src/ATen/core/Storage.cpp @@ -2,28 +2,4 @@ namespace at { -Storage::Storage( - caffe2::TypeMeta data_type, - size_t size, - Allocator* allocator, - bool resizable) - : storage_impl_(c10::make_intrusive( - data_type, - size, - allocator, - resizable)) {} - -Storage::Storage( - caffe2::TypeMeta data_type, - at::DataPtr data_ptr, - size_t size, - const std::function& deleter, - bool resizable) - : storage_impl_(c10::make_intrusive( - data_type, - size, - std::move(data_ptr), - /* allocator */ nullptr, - resizable)) {} - } // namespace at diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index 656ddb972c7959..23a3327da1cafc 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -11,16 +11,56 @@ struct AT_API Storage { Storage(const c10::intrusive_ptr& ptr) : storage_impl_(ptr) {} Storage(c10::intrusive_ptr&& ptr) : storage_impl_(std::move(ptr)) {} Storage( - caffe2::TypeMeta, + caffe2::TypeMeta data_type, size_t size, Allocator* allocator, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + allocator, + resizable)) {} + Storage( - caffe2::TypeMeta, - at::DataPtr, + caffe2::TypeMeta data_type, + at::DataPtr data_ptr, size_t size, const std::function& deleter, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + resizable)) {} + + Storage(at::DeviceType device_type) + : storage_impl_(c10::make_intrusive(device_type)) {} + Storage(at::DeviceType device_type, caffe2::TypeMeta data_type) + : storage_impl_( + c10::make_intrusive(device_type, data_type)) {} + + Storage( + caffe2::TypeMeta data_type, + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) + : storage_impl_(c10::make_intrusive( + data_type, + numel, + std::move(data_ptr), + allocator, + resizable)) {} + + void reset() { + storage_impl_->reset(); + } + + template + inline bool IsType() const { + return storage_impl_->IsType(); + } template T* data() const { return storage_impl_->data(); } @@ -28,32 +68,122 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->itemsize(); } - ptrdiff_t size() const { return storage_impl_->numel(); } - bool resizable() const { return storage_impl_->resizable(); } + size_t elementSize() const { + return storage_impl_->itemsize(); + } + + inline size_t itemsize() const { + return storage_impl_->itemsize(); + } + + ptrdiff_t size() const { + return storage_impl_->numel(); + } + + int64_t numel() const { + return storage_impl_->numel(); + } + + // TODO: remove later + void set_numel(int64_t numel) { + 
storage_impl_->set_numel(numel); + } + + bool resizable() const { + return storage_impl_->resizable(); + } + + size_t capacity() const { + return storage_impl_->capacity(); + } // get() use here is to get const-correctness - void* data() const { return storage_impl_.get()->data(); } - const caffe2::TypeMeta dtype() const { + + void* data() { + return storage_impl_->data(); + } + + void* data() const { + return storage_impl_.get()->data(); + } + + const caffe2::TypeMeta& dtype() const { return storage_impl_->dtype(); } - const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } - DeviceType device_type() const { return storage_impl_->device_type(); } - at::Allocator* allocator() const { return storage_impl_.get()->allocator(); } - at::Device device() const { return storage_impl_->device(); } + + at::DataPtr& data_ptr() { + return storage_impl_->data_ptr(); + } + + const at::DataPtr& data_ptr() const { + return storage_impl_->data_ptr(); + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + }; + + void set_dtype(const caffe2::TypeMeta& data_type) { + storage_impl_->set_dtype(data_type); + } + + DeviceType device_type() const { + return storage_impl_->device_type(); + } + + at::Allocator* allocator() const { + return storage_impl_.get()->allocator(); + } + + at::Device device() const { + return storage_impl_->device(); + } StorageImpl* unsafeReleaseStorageImpl() { return storage_impl_.release(); } + StorageImpl* unsafeGetStorageImpl() const noexcept { return storage_impl_.get(); } + operator bool() const { return storage_impl_; } + size_t use_count() const { return storage_impl_.use_count(); } + inline bool unique() const { + return storage_impl_.unique(); + } + + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + src, data_type, capacity, d); + } + + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } + protected: c10::intrusive_ptr storage_impl_; }; diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index ce60a7e580f049..35647d7b62d8af 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -20,137 +20,13 @@ #include #include #include +#include #include namespace caffe2 { using StorageImpl = at::StorageImpl; - -class CAFFE2_API Storage { - public: - Storage() {} - Storage(at::DeviceType device_type) - : storage_impl_(c10::make_intrusive(device_type)) {} - Storage(at::DeviceType device_type, TypeMeta data_type) - : storage_impl_( - c10::make_intrusive(device_type, data_type)) {} - - Storage( - TypeMeta data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : storage_impl_(c10::make_intrusive( - data_type, - numel, - std::move(data_ptr), - allocator, - resizable)) {} - - void reset() { - storage_impl_->reset(); - } - - // For debugging purpose only, please don't call it - StorageImpl* unsafeGetStorageImp() const { - return storage_impl_.get(); - } - - 
template - inline bool IsType() const { - return storage_impl_->IsType(); - } - - void* data() const { - return storage_impl_->data(); - } - - void* data() { - return storage_impl_->data(); - } - - at::DataPtr& data_ptr() { - return storage_impl_->data_ptr(); - } - - const at::DataPtr& data_ptr() const { - return storage_impl_->data_ptr(); - } - // Returns the previous data_ptr - at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { - return storage_impl_->set_data_ptr(std::move(data_ptr)); - }; - - void set_dtype(const TypeMeta& data_type) { - storage_impl_->set_dtype(data_type); - } - - const TypeMeta& dtype() const { - return storage_impl_->dtype(); - } - - size_t capacity() const { - return storage_impl_->capacity(); - } - - int64_t numel() const { - return storage_impl_->numel(); - } - - // TODO: remove later - void set_numel(int64_t numel) { - storage_impl_->set_numel(numel); - } - - at::DeviceType device_type() const { - return storage_impl_->device_type(); - } - - const at::Allocator* allocator() const { - return storage_impl_->allocator(); - } - - inline size_t itemsize() const { - return storage_impl_->itemsize(); - } - - inline long use_count() const { - return storage_impl_.use_count(); - } - - inline bool unique() const { - return storage_impl_.unique(); - } - - void UniqueStorageShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - storage_impl_.unique(), - "UniqueStorageShareExternalPointer can only be called when \ - use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( - src, data_type, capacity, d); - } - - void UniqueStorageShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - storage_impl_.unique(), - "UniqueStorageShareExternalPointer can only be called when \ - use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - } - - protected: - c10::intrusive_ptr storage_impl_; -}; +using Storage = at::Storage; } // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index a24f8cb67b7347..15aefc37c74188 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -96,7 +96,7 @@ vector GetTensorInfo( const Tensor* tc = static_cast(c); CHECK(tc); CHECK(tc->unsafeGetTensorImpl()); - CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImp()); + CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()); *capacity = tc->capacity_nbytes(); tc->ExtractDeviceOption(device); return tc->dims(); From 57f149a861bd3e4b0a8925d41cfe9091657794fc Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 13 Sep 2018 09:28:52 -0700 Subject: [PATCH 115/237] Only join pin_memory_thread after it started (#11599) Summary: Same reason as in #11432 . 
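The guard is the same attribute-existence pattern already used for `workers`: register `pin_memory_thread` only once the thread has actually started, and join it only if the attribute exists. A rough stand-in for the shape of the fix (not the real `_DataLoaderIter`):

```python
import threading

class LoaderLike(object):
    def __init__(self, fail_early=False):
        if fail_early:
            # __init__ bails out before the pinning thread is ever created
            raise RuntimeError("worker startup failed")
        t = threading.Thread(target=lambda: None)
        t.daemon = True
        t.start()
        # register the attribute only after the thread is running
        self.pin_memory_thread = t

    def __del__(self):
        # guard on the attribute, not on a config flag, so tearing down a
        # partially constructed object cannot raise AttributeError
        if hasattr(self, 'pin_memory_thread'):
            self.pin_memory_thread.join()
```

The traceback below is what the unguarded `join()` produces.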
Example error: ``` Exception ignored in: Traceback (most recent call last): File "/private/home/ssnl/miniconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 405, in __del__ self._shutdown_workers() File "/private/home/ssnl/miniconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 401, in _shutdown_workers self.pin_memory_thread.join() AttributeError: '_DataLoaderIter' object has no attribute 'pin_memory_thread' ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11599 Differential Revision: D9801143 Pulled By: SsnL fbshipit-source-id: 520590a21f56fa381fcac621457a7544d3fba47e --- torch/utils/data/dataloader.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 951321fcdf1052..9d69ab4daf0fc8 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -289,12 +289,15 @@ def __init__(self, loader): if self.pin_memory: self.data_queue = queue.Queue() - self.pin_memory_thread = threading.Thread( + pin_memory_thread = threading.Thread( target=_pin_memory_loop, args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory, torch.cuda.current_device())) - self.pin_memory_thread.daemon = True - self.pin_memory_thread.start() + pin_memory_thread.daemon = True + pin_memory_thread.start() + # Similar to workers (see comment above), we only register + # pin_memory_thread once it is started. + self.pin_memory_thread = pin_memory_thread else: self.data_queue = self.worker_result_queue @@ -397,7 +400,7 @@ def _shutdown_workers(self): q.put(None) for w in self.workers: w.join() - if self.pin_memory: + if hasattr(self, 'pin_memory_thread'): self.pin_memory_thread.join() def __del__(self): From d4d72b87e3eef9936b6355effb05df546cbdb803 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 13 Sep 2018 10:16:33 -0700 Subject: [PATCH 116/237] Sphinx is case sensitive Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11646 Differential Revision: D9811355 Pulled By: SsnL fbshipit-source-id: d484561baa2ac5b3113870b4ee06fa3560b686e4 --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 31c6c3f743da06..8a0f53975e6098 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,7 +33,7 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs. torch.autograd torch.distributed torch.distributions - torch.jit + torch.jit torch.multiprocessing bottleneck checkpoint From 1f49b879d1b7f2d5e4f5f955fa037f8e54ec457a Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 10:25:57 -0700 Subject: [PATCH 117/237] Add missing include for __half (#11638) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11638 Differential Revision: D9811063 Pulled By: ezyang fbshipit-source-id: dd103bb152485bcdbb0108b4d3de2443c30d5572 --- aten/src/THC/THCStorage.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index 3eedf4a907144f..e8dfc2213076f1 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -9,10 +9,16 @@ #include "ATen/ScalarType.h" +#include +#include +#include + namespace at { +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_HCC__) template <> struct CTypeToScalarType<__half> : public CTypeToScalarType {}; +#endif } From d278344e36dde4170da4e4d0c45953dcb53dcf65 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 13 Sep 2018 10:35:53 -0700 Subject: [PATCH 118/237] Automatic update of fbcode/onnx to 39dd0d4fec5913aa517b71bcfcbf638a427894eb (#11622) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11622 Previous import was bff0b8835870c7df7762ef43498d000d2d8ffb52 Included changes: - **[39dd0d4](https://github.com/onnx/onnx/commit/39dd0d4)**: [build] Add ONNX_API for protos in all cases (#1407) - **[944db4f](https://github.com/onnx/onnx/commit/944db4f)**: cmake (#1401) - **[8ccc8dd](https://github.com/onnx/onnx/commit/8ccc8dd)**: Remove ONNXIFI_CHECK_RESULT from onnxRelease* functions (#1397) - **[df14e74](https://github.com/onnx/onnx/commit/df14e74)**: Change onnxifi test driver classname (#1396) - **[0c885cc](https://github.com/onnx/onnx/commit/0c885cc)**: ONNXIFI cpp test driver (#1290) - **[a557848](https://github.com/onnx/onnx/commit/a557848)**: Coverage Report Tools for Backend Scoreboard (#1301) - **[31fd87f](https://github.com/onnx/onnx/commit/31fd87f)**: fix AvgPool doc. 
add default value for count_include_pad (#1391) - **[8ff08c2](https://github.com/onnx/onnx/commit/8ff08c2)**: Do not export onnx symbols in the python extension (#1388) Reviewed By: orionr Differential Revision: D9806635 fbshipit-source-id: f61c052b6bd14e0c80ace19c1a5f0ba659030c6f --- third_party/onnx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/onnx b/third_party/onnx index bff0b8835870c7..39dd0d4fec5913 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit bff0b8835870c7df7762ef43498d000d2d8ffb52 +Subproject commit 39dd0d4fec5913aa517b71bcfcbf638a427894eb From a861573e36860471a3997cc1cec4a0dd1fdf9746 Mon Sep 17 00:00:00 2001 From: Roy Li Date: Thu, 13 Sep 2018 10:59:11 -0700 Subject: [PATCH 119/237] fix tensor export bug in IR export (#11613) Differential Revision: D9811094 Pulled By: li-roy fbshipit-source-id: 012792dbedc70bd3fa242fdf2e39da0b21ce158d --- test/test_jit.py | 3 ++- torch/csrc/jit/export.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 97ef28630b7836..20ec276981428a 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -5115,10 +5115,11 @@ def __init__(self): self.param1 = torch.nn.Parameter(torch.rand(5, 5)) self.param2 = torch.nn.Parameter(self.param1[3]) self.param3 = torch.nn.Parameter(torch.rand(5, 5)) + self.param4 = torch.nn.Parameter(torch.rand(11, 5)[1:6]) @torch.jit.script_method def foo(self): - return self.param1 + self.param2 + self.param3 + return self.param1 + self.param2 + self.param3 + self.param4 m_orig = M() m_import = self.getExportImportCopy(m_orig) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 0d304a91583e8d..0740d2e88a2c8a 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -685,7 +685,7 @@ void ModuleEncoder::EncodeTensor( } auto record_number = file_writer_.writeRecord( - static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.numel()); + static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.storage().size()); tensor_proto->add_int64_data(record_number); storage_dedup_map_[storage_ptr] = record_number; } From 5bc90b8554a9f978e7d87ed120fa13040a983cac Mon Sep 17 00:00:00 2001 From: Roger-luo Date: Thu, 13 Sep 2018 11:04:41 -0700 Subject: [PATCH 120/237] support conversion and dispatch of complex numbers (#11603) Summary: - Just a simple fix to support `fill_` - And a fix for indexing in `pytorch-complex` Differential Revision: D9804061 Pulled By: ezyang fbshipit-source-id: 631129b3fa220a9670770b3766f14a8e03633bdf --- aten/src/ATen/Dispatch.h | 49 ++++++++++++++++++++++++++ aten/src/ATen/core/Scalar.h | 2 +- aten/src/ATen/native/Scalar.cpp | 2 +- torch/csrc/utils/python_arg_parser.cpp | 4 +++ torch/csrc/utils/python_arg_parser.h | 4 +++ 5 files changed, 59 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index c598901b2b943f..64f181d4dccb3c 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -79,3 +79,52 @@ AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ } \ }() + +#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index 35c4b538336aeb..de01a56ce33748 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -99,6 +99,6 @@ template<> \ inline T Scalar::to() { \ return to##name(); \ } -AT_FORALL_SCALAR_TYPES(DEFINE_TO) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO) #undef DEFINE_TO } diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 6ffb891bf7d777..975ae8c1ff9c47 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -18,7 +18,7 @@ Scalar _local_scalar(const Tensor& self) { Scalar _local_scalar_dense_cpu(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_HALF( + AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX( self.type(), "_local_scalar_dense_cpu", [&] { scalar_t value = *self.data(); r = Scalar(value); diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index d36f70e460d793..5dbc83eddc59b9 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -103,6 +103,10 @@ bool FunctionParameter::check(PyObject* obj) { return THPVariable_Check(obj) || (allow_numbers_as_tensors && THPUtils_checkDouble(obj)); } case ParameterType::SCALAR: + if (PyComplex_Check(obj)) { + return 
true; + } + // fallthrough case ParameterType::DOUBLE: { if (THPUtils_checkDouble(obj)) { return true; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 86453a79e30dc2..294819a5a11550 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -232,6 +232,10 @@ inline at::Scalar PythonArgs::scalarWithDefault(int i, at::Scalar default_scalar if (THPUtils_checkLong(args[i])) { return at::Scalar(static_cast(THPUtils_unpackLong(args[i]))); } + + if (PyComplex_Check(args[i])) { + return at::Scalar(THPUtils_unpackComplexDouble(args[i])); + } return at::Scalar(THPUtils_unpackDouble(args[i])); } From ab3a2d25fbf695a80bae4e9e269ee0e532cdb533 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Thu, 13 Sep 2018 11:05:09 -0700 Subject: [PATCH 121/237] Improve error messages when trying to use nested lists. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11606 Differential Revision: D9806949 Pulled By: zdevito fbshipit-source-id: c38abc4ce745a63d26a64f6aa1b41350e4b1acd5 --- test/test_jit.py | 17 +++++++++++++++-- torch/csrc/jit/script/compiler.cpp | 23 ++++++++++++++++++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 20ec276981428a..5a7ea169d8d598 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2529,6 +2529,19 @@ def stuff3(x): return torch.ones(x), x self.checkScript(stuff3, ([3, 2],)) + def test_nested_list_error(self): + with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + @torch.jit.script + def foo(x): + # type: (Tuple[List[List[int]]]) -> int + return 4 + + def test_nested_list_construct_error(self): + with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + @torch.jit.script + def foo(x): + return [[4]] + def test_script_cu(self): cu = torch.jit.CompilationUnit(''' def foo(a): @@ -5507,10 +5520,10 @@ def f3(a): def f4(a): torch.cat(a) - with self.assertRaisesRegex(RuntimeError, 'argument \'tensors\' but found Tensor[][]'): + with self.assertRaisesRegex(RuntimeError, 'argument \'tensors\' but found int\[\]'): @torch.jit.script def f5(a): - torch.cat([[a]]) + torch.cat([3]) with self.assertRaisesRegex(RuntimeError, 'Lists must contain only a single type'): @torch.jit.script diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 5efa0668d91296..d49a653194b1eb 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -721,6 +721,24 @@ std::shared_ptr BuiltinFunction::call( emitBuiltinCall(loc, *m.graph(), symbol, inputs, attributes, true)); } +inline bool isSupportedListElementType(TypePtr type) { + return type->isSubtypeOf(DynamicType::get()) || + type->isSubtypeOf(NumberType::get()); +} + +// guard for List types we do not currently have operations for +inline void ensureLegalType(const SourceRange& range, TypePtr ptr) { + if(TupleTypePtr tt = ptr->cast()) { + for(auto elem : tt->elements()) { + ensureLegalType(range, elem); + } + } else if(ListTypePtr lt = ptr->cast()) { + if(!isSupportedListElementType(lt->getElementType())) { + throw ErrorReport(range) << "Lists can only contain numbers or Tensors, but found " << lt->getElementType()->str(); + } + } +} + struct to_ir { to_ir( Def def, @@ -771,6 +789,7 @@ struct to_ir { // Record the type for the schema and set the Type on the Value* arguments.push_back(schema.arguments.at(arg_annotation_idx++)); new_input->setType(arguments.back().type); + 
ensureLegalType((*it).ident().range(), arguments.back().type); } // body auto stmts = def.statements(); @@ -1550,8 +1569,10 @@ struct to_ir { << *elem_type << " but found " << *v->type() << " instead"; } } - return graph->insertNode(graph->createList(elem_type, values)) + Value* result = graph->insertNode(graph->createList(elem_type, values)) ->output(); + ensureLegalType(tree->range(), result->type()); + return result; } break; case TK_TUPLE_LITERAL: { auto ll = TupleLiteral(tree); From 6f53b4efea8746d4ff296cc8c5ddd35fd4975998 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Thu, 13 Sep 2018 11:10:00 -0700 Subject: [PATCH 122/237] Remove implicit bool casts (#11503) Summary: In order to comply with Python's rules on implicit casting of non-booleans to booleans, this PR removes implicit casting in favor of explicit casts via `bool()` cc zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/11503 Differential Revision: D9780869 Pulled By: driazati fbshipit-source-id: c753acaca27f4e79dddf424c6b04674f44a6aad9 --- test/cpp/api/jit.cpp | 2 +- test/test_jit.py | 71 ++++++++++++++++-------------- torch/csrc/jit/script/compiler.cpp | 14 +++--- torch/csrc/jit/test_jit.cpp | 8 ++-- torch/jit/batchop.py | 20 ++++----- 5 files changed, 63 insertions(+), 52 deletions(-) diff --git a/test/cpp/api/jit.cpp b/test/cpp/api/jit.cpp index 8879d6f0007fdc..c46868cdbf6989 100644 --- a/test/cpp/api/jit.cpp +++ b/test/cpp/api/jit.cpp @@ -13,7 +13,7 @@ TEST_CASE("torch script") { def test_relu(a, b): return torch.relu(a + b) def test_while(a, i): - while i < 10: + while bool(i < 10): a += a i += 1 return a diff --git a/test/test_jit.py b/test/test_jit.py index 5a7ea169d8d598..7bbc2455bf4a77 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -909,7 +909,7 @@ def test_recursive_cse(self): def fn(x, y): z = x - if x + y > x: + if bool(x + y > x): z = x + y return z @@ -1609,12 +1609,11 @@ def test_constant_prop_nested(self): @torch.jit.script def constant_prop(a): b = 2 + 1 - if a < 2: + if bool(a < 2): c = b + 2 else: c = b - 2 return c - out_ref = constant_prop(torch.tensor(2)) self.run_pass('constant_propagation', constant_prop.graph) out_test = constant_prop(torch.tensor(2)) @@ -1648,8 +1647,8 @@ def constant_prop(a, b): c0 = 1 c1 = 1 c2 = 1 - if a: # -> c0, c1 - if b: # -> c0 + if bool(a): # -> c0, c1 + if bool(b): # -> c0 if True: # -> c0 c0 = c0 + 1 if False: @@ -2098,7 +2097,7 @@ def batch_sum(a): def test_if_else(self): def single_if(a, b): - if a > b: + if bool(a > b): a = a + b else: a = a - b @@ -2118,7 +2117,7 @@ def single_if(a, b): def test_if_else_with_scalar(self): def single_if(a, b): - if a > 0.1: + if bool(a > 0.1): a = a + b else: a = a - b @@ -2138,7 +2137,7 @@ def single_if(a, b): def test_if_noelse(self): def single_if(a, b): - if a > b: + if bool(a > b): a = a + b return a @@ -2156,7 +2155,7 @@ def single_if(a, b): def test_if_noelse_with_scalar(self): def single_if(a, b): - if a > 0.1: + if bool(a > 0.1): a = a + b return a @@ -2174,7 +2173,7 @@ def single_if(a, b): def test_while(self): def single_while(a, b): - while a > b: + while bool(a > b): a = a - b return a @@ -2262,7 +2261,7 @@ def test_greedy_search(self): def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num): iter_count = torch.zeros_like(iter_num) - while(iter_count < iter_num): + while bool(iter_count < iter_num): iter_count += 1 # LSTM Cell i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i @@ -2328,7 +2327,7 @@ def beam(x, h, c, embed, 
w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, vocab_size = embed.size(1) iter_count = torch.zeros_like(iter_num) max_len = idx.size(2) - while(iter_count < iter_num): + while bool(iter_count < iter_num): iter_count += 1 # LSTM Cell i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i @@ -3454,7 +3453,7 @@ def func(a): def test_while(self): def func(a, b, max): - while a < max: + while bool(a < max): a = a + 1 b = b + 1 c = a + b @@ -3471,7 +3470,7 @@ def func(lim): somenum = 5 dontmutateme = 3 third = 0 - while i < lim: + while bool(i < lim): third = first + second first = second second = third @@ -3493,7 +3492,7 @@ def test_if(self): def func(a, b): # type: (int, int) -> int d = 3 - if a > 10: + if bool(a > 10): a = 3 + d else: b = 3 + d @@ -3509,7 +3508,7 @@ def func(a, b): # type: (int, int) -> int d = 3 for _ in range(20): - if a > 10: + if bool(a > 10): a = 3 + d else: b = 3 + d @@ -3521,7 +3520,7 @@ def func(a, b): def test_if_noelse(self): def func(a, b): - if a > 10: + if bool(a > 10): a = 3 + b c = a + b return c @@ -3529,11 +3528,19 @@ def func(a, b): inputs = self._make_scalar_vars([-1, 1], torch.int64) self.checkScript(func, inputs, optimize=True) + def test_explicit_bool_cast(self): + with self.assertRaisesRegex(RuntimeError, "expected an integer"): + @torch.jit.script + def test_bool_cast(a): + if a: + return a + 2 + return a + 1 + def test_while_nonexistent_value(self): with self.assertRaisesRegex(RuntimeError, "undefined value x"): torch.jit.CompilationUnit(''' def test_while(a, b): - while a < 10: + while bool(a < 10): a = a + x b = b + 1 return a + b @@ -3551,7 +3558,7 @@ def test_while(a, b): def test_while_write_outer_then_read(self): def func(a, b): - while a < 10: + while bool(a < 10): a = a + 1 b = a + 1 return a + b @@ -3700,7 +3707,7 @@ def test_script_bool_constant(): def test_ternary(self): def func(a, b): c = 3 - c = a + b if a > 3 else b + c = a + b if bool(a > 3) else b return c inputs_true = self._make_scalar_vars([5, 2], torch.int64) @@ -3723,18 +3730,18 @@ def test_logical_short_circuit(self): @torch.jit.script def testNoThrows(t): c1 = 1 - if (False and t[1]) or (True or t[1]): + if (False and bool(t[1])) or (True or bool(t[1])): c1 = 0 return c1 @torch.jit.script def throwsOr(t): - c0 = False or t[1] + c0 = False or bool(t[1]) print(c0) @torch.jit.script def throwsAnd(t): - c0 = True and t[1] + c0 = True and bool(t[1]) print(c0) t = torch.randn(0) @@ -3968,7 +3975,7 @@ def test_call_python(a): step = 1 while i < 10: b = pyfunc(b) - if b > 3.0: + if bool(b > 3.0): b = pyfunc(b) i = 11 return b @@ -4966,7 +4973,7 @@ def test_trace_of_script(self): @torch.jit.script def foo(a, c): b = 0.0 - if a == 0.0: + if bool(a == 0.0): b = 1.0 return b + c @@ -4985,7 +4992,7 @@ def use(b): def test_if_define(self): @torch.jit.script def foo(a): - if a == 0: + if bool(a == 0): b = 1 else: b = 0 @@ -4994,14 +5001,14 @@ def foo(a): @torch.jit.script def foo2(a): b = 0 - if a == 0: + if bool(a == 0): b = 1 return b + 1 @torch.jit.script def foo3(a): b = 1 - if a == 0: + if bool(a == 0): c = 4 else: b = 0 @@ -5287,7 +5294,7 @@ def __init__(self): @torch.jit.script_method def forward(self, x): - if torch.sum(x) > 0: + if bool(torch.sum(x) > 0): x = torch.neg(x) return x @@ -5402,8 +5409,8 @@ def forward(self, x): # we cannot use `True` as the condition. Constant prop # would remove the `if` statements. 
c = sum(x) > 4 - if c: - if c: + if bool(c): + if bool(c): y = self.m(x) else: y = self.m(x) @@ -5836,7 +5843,7 @@ def test_return_stmt_not_at_end(self): with self.assertRaisesRegex(RuntimeError, 'return statements can appear only at the end of the function body'): @torch.jit.script def return_stmt_wrong(x): - if x > 3: + if bool(x > 3): return 3 else: return x diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index d49a653194b1eb..fc7bf068f20998 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -988,11 +988,15 @@ struct to_ir { Value* emitCond(Expr cond) { Value* v = emitExpr(cond); - if(v->type()->isSubtypeOf(DynamicType::get())) { - v = typeCast(cond.range(), v, IntType::get()); - } - if(!v->type()->isSubtypeOf(IntType::get())) { - throw ErrorReport(cond) << "expected a tensor or integer expression for condition but found " << v->type()->str(); + if (!v->type()->isSubtypeOf(IntType::get())) { + ErrorReport error(cond); + error << "expected an integer expression for condition but found " + << v->type()->str(); + if (v->type()->isSubtypeOf(DynamicType::get())) { + error << ", to use a tensor in a boolean" + << " expression, explicitly cast it with `bool()`"; + } + throw error; } return v; } diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index bec6b0459a0814..4ffd16ff24c95a 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -844,25 +844,25 @@ const static auto cf_examples = R"JIT( # FIXME: use 0 instead of a. # c = 0 c = a - if a < b: + if bool(a < b): c = b else: c = a return c def if_one(a, b): c = b - if a < b: + if bool(a < b): c = a return c def while_test(a, i): - while i < 3: + while bool(i < 3): a *= a i += 1 return a )JIT"; void testControlFlow() { script::Module cu; - script::defineMethodsInModule(cu, cf_examples, torch::jit::script::Resolver(), nullptr); + script::defineMethodsInModule(cu, cf_examples, torch::jit::script::nativeResolver, nullptr); auto run = [&](const std::string & name, std::vector stack) { auto graph = cu.get_method(name).graph(); Code code(graph); diff --git a/torch/jit/batchop.py b/torch/jit/batchop.py index cc022a37da2e3f..229cafbb94119d 100644 --- a/torch/jit/batchop.py +++ b/torch/jit/batchop.py @@ -140,7 +140,7 @@ def batch_select(data, mask, dims, dim_, index_): # if dim == 0: # raise ValueError("Cannot select 0 dim in BatchTensor") data = data.select(dim, index) - if dims[dim - 1]: + if bool(dims[dim - 1]): mask = mask.select(dim, index) else: mask = mask.select(dim, 0) @@ -171,7 +171,7 @@ def batch_index_select(data, mask, dims, dim_, index_data, index_mask, index_dim res_mask = torch.zeros([0]) for i in range(batch_size): d = data[i].index_select(dim - 1, index_data[i]).unsqueeze(0) - if dims[dim - 1]: + if bool(dims[dim - 1]): m = mask[i].index_select(dim - 1, index_data[i]).unsqueeze(0) else: m = mask[i].unsqueeze(0) @@ -310,7 +310,7 @@ def batch_argmax(data, mask, dims, dim_, keepdim_): batch_size = data.size(0) res_data = torch.zeros([0]) for i in range(batch_size): - if dims[dim - 1]: + if bool(dims[dim - 1]): if dim - 1 != 0: m = mask[i].transpose(0, dim - 1) else: @@ -346,7 +346,7 @@ def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): res_data = torch.zeros([0]) res_index = torch.zeros([0]) for i in range(batch_size): - if dims[dim - 1]: + if bool(dims[dim - 1]): if dim - 1 != 0: m = mask[i].transpose(0, dim - 1) else: @@ -364,7 +364,7 @@ def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): else: 
res_data = torch.cat([res_data, d], 0) res_index = torch.cat([res_index, idx], 0) - if dims[dim - 1]: + if bool(dims[dim - 1]): mask = mask.narrow(dim, 0, k) return res_data, mask, dims, res_index, mask, dims @@ -378,7 +378,7 @@ def batch_softmax(data, mask, dims, dim_): max_len = data.size(dim) res_data = torch.zeros([0]) for i in range(batch_size): - if dims[dim - 1]: + if bool(dims[dim - 1]): if dim - 1 != 0: m = mask[i].transpose(0, dim - 1) else: @@ -417,7 +417,7 @@ def batch_view(data, mask, dims, sizes): res_dims = data_sizes_.narrow(0, 0, 1) for i_ in range(sizes.size(0) - 1): i = i_ + 1 - if(sizes[i] == -1): + if bool(sizes[i] == -1): cur_size_ = mask.size(i) cur_dim = 1 else: @@ -434,7 +434,7 @@ def batch_view(data, mask, dims, sizes): def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): dim = int(dim_) data = torch.cat([data1, data2], dim) - if(dims1[dim - 1]): + if bool(dims1[dim - 1]): mask = torch.cat([mask1, mask2], dim) else: mask = mask1 @@ -445,7 +445,7 @@ def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): def batch_cat3(data1, mask1, dims1, data2, mask2, dims2, data3, mask3, dims3, dim_): dim = int(dim_) data = torch.cat([data1, data2, data3], dim) - if(dims1[dim - 1]): + if bool(dims1[dim - 1]): mask = torch.cat([mask1, mask2, mask3], dim) else: mask = mask1 @@ -460,7 +460,7 @@ def batch_narrow(data, mask, dims, dimension_, start_, length_): # if dimension == 0: # raise ValueError("cannot do narrow along batch_dim") data = data.narrow(dimension, start, length) - if dims[dimension - 1]: + if bool(dims[dimension - 1]): mask = mask.narrow(dimension, start, length) else: mask = mask.narrow(dimension, 0, 1) From 9abc66674559ac214dad31ca42bc0d981d5260db Mon Sep 17 00:00:00 2001 From: Roy Li Date: Thu, 13 Sep 2018 11:51:17 -0700 Subject: [PATCH 123/237] stop allowing extra positional args in arg parser (#10499) Summary: Arg parser allowed additional positional args to be parsed into keyword-only params. Fixes a couple cases: - The positional argument happens to be of the right type, and it just works silently. Now, we fail as expected. - The positional argument fails later down the line. Now, we fail at the appropriate time and get a better error message. 
Pre-fix: ``` >>> torch.cuda.LongTensor((6, 0), 1, 1, 0) tensor([6, 0], device='cuda:1') ``` Post-fix: ``` >>> torch.cuda.LongTensor((6, 0), 1, 1, 0) Traceback (most recent call last): File "", line 1, in TypeError: new() received an invalid combination of arguments - got (tuple, int, int, int), but expected one of: * (torch.device device) * (torch.Storage storage) * (Tensor other) * (tuple of ints size, torch.device device) * (object data, torch.device device) ``` Pre-fix: ``` >>> a = torch.tensor(5) >>> a.new_zeros((5,5), 0) Traceback (most recent call last): File "", line 1, in TypeError: new_zeros(): argument 'dtype' (position 2) must be torch.dtype, not int ``` Post-fix: ``` >>> a = torch.tensor(5) >>> a.new_zeros((5,5), 0) Traceback (most recent call last): File "", line 1, in TypeError: new_zeros() takes 1 positional argument but 2 were given ``` fixes #8351 Pull Request resolved: https://github.com/pytorch/pytorch/pull/10499 Differential Revision: D9811093 Pulled By: li-roy fbshipit-source-id: ce946270fd11b264ff1b09765db3300879491f76 --- test/test_torch.py | 8 ++++++++ torch/csrc/utils/python_arg_parser.cpp | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/test/test_torch.py b/test/test_torch.py index dce509cb7c5b0e..837d7715f3d9e1 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -7427,6 +7427,14 @@ def test_parsing_intlist(self): self.assertRaises(TypeError, lambda: torch.ones(np.array(3, 3))) self.assertRaises(TypeError, lambda: torch.ones((np.array(3, 3)))) + # fail parse with additional positional args after intlist arg + self.assertRaisesRegex(TypeError, + "received an invalid combination of arguments", + lambda: torch.LongTensor((6, 0), 1, 1, 0)) + self.assertRaisesRegex(TypeError, + "missing 1 required positional arguments", + lambda: torch.tensor().new_zeros((5, 5), 0)) + def _test_serialization_data(self): a = [torch.randn(5, 5).float() for i in range(2)] b = [a[i % 2] for i in range(4)] # 0-3 diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 5dbc83eddc59b9..b1118481c4520a 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -450,6 +450,13 @@ bool FunctionSignature::parse(PyObject* args, PyObject* kwargs, PyObject* dst[], PyObject* obj = nullptr; bool is_kwd = false; if (arg_pos < nargs) { + // extra positional args given after single positional IntList arg + if (param.keyword_only) { + if (raise_exception) { + extra_args(*this, nargs); + } + return false; + } obj = PyTuple_GET_ITEM(args, arg_pos); } else if (kwargs) { obj = PyDict_GetItem(kwargs, param.python_name); From 45e9ee096e64c572cc6c1411aff6c6adc698c0bf Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 13 Sep 2018 12:07:22 -0700 Subject: [PATCH 124/237] Fix test_mnist_training_leaks_no_memory_cuda warning (#11639) Summary: Before this PR it would warn that "dropout is non deterministic and can cause problems when checking trace", so I disabled the trace checking. 
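The escape hatch is the `check_trace=False` argument to `torch.jit.trace`, which skips re-running the trace against the original module. A minimal sketch for any nondeterministic module (the module and shapes here are arbitrary, not MnistNet):

```python
import torch

# Dropout in training mode produces different outputs on every call, so the
# default trace check would report spurious divergence between runs.
drop = torch.nn.Dropout(p=0.5).train()
x = torch.randn(4, 4)
traced = torch.jit.trace(drop, [x], check_trace=False)  # skip the check
```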
cc zdevito apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11639 Differential Revision: D9812493 Pulled By: zou3519 fbshipit-source-id: fab86928a5fba8b218b47543533aaf7c82a10b4a --- test/test_jit.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index 7bbc2455bf4a77..61a6b94db9935c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -7084,7 +7084,9 @@ def test_mnist_cuda(self): @unittest.skipIf(not RUN_CUDA, "no CUDA") def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() - traced_net = torch.jit.trace(net, [torch.randn(5, 1, 28, 28, device='cuda')]) + # MnistNet uses dropout, don't check its trace + traced_net = torch.jit.trace(net, [torch.randn(5, 1, 28, 28, device='cuda')], + check_trace=False) def train(iters): for _ in range(iters): From 912d3626c8f834dc7d66ed6fd2d3b27ec77e39ed Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 12:19:16 -0700 Subject: [PATCH 125/237] Split tensor.h into tensor_impl.h and tensor.h (#11642) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11642 This is just a preparatory change to help with future refactoring: - I want to reduce the number of includes that tensor_impl.h depends on, but - I need to keep tensor.h providing all Caffe2 headers, because users may be relying on tensor.h transitively providing those headers. Introducing a level of indirection lets me do both at the same time. Reviewed By: jerryzh168 Differential Revision: D9810823 fbshipit-source-id: 8dfaac4b8768051a22898be8fcaf787ecc57eb13 --- caffe2/core/tensor.cc | 12 - caffe2/core/tensor.h | 894 +----------------------------------- caffe2/core/tensor_impl.cc | 14 + caffe2/core/tensor_impl.h | 902 +++++++++++++++++++++++++++++++++++++ 4 files changed, 917 insertions(+), 905 deletions(-) create mode 100644 caffe2/core/tensor_impl.cc create mode 100644 caffe2/core/tensor_impl.h diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 15aefc37c74188..e88ed360e8b92b 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -1,18 +1,6 @@ #include "caffe2/core/tensor.h" #include "caffe2/core/blob_stats.h" -#include "caffe2/core/flags.h" - -CAFFE2_DEFINE_bool( - caffe2_keep_on_shrink, - true, - "If set, keeps memory when a tensor is shrinking its size."); - -CAFFE2_DEFINE_int64( - caffe2_max_keep_on_shrink_memory, - LLONG_MAX, - "The maximum memory in bytes to keep on shrink, if the difference between " - "tensor sizes is bigger than this then tensor will be reset."); namespace caffe2 { diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 6fd35133811cd9..5564034e545837 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -2,904 +2,12 @@ #define CAFFE2_CORE_TENSOR_H_ #include "caffe2/core/storage.h" +#include "caffe2/core/tensor_impl.h" #include -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. -CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); - namespace caffe2 { -/** - * A utility function to convert vector to vector. 
- */ -inline vector ToVectorTIndex(const std::vector& src) { - return vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline TIndex size_from_dim_(int k, const vector& dims) { - TIndex r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - TIndex r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - TIndex r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. - * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const vector& dims, - const vector& values, - BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. 
- * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } - - /** - * @brief Delete the copy constructor and use Clone explicitly - */ - TensorImpl(const TensorImpl& src) = delete; - - TensorImpl(TensorImpl&& src) noexcept { - swap(src); - } - - TensorImpl& operator=(TensorImpl&&) = default; - // Note(jiayq): possibly a rule-of-three violation, but we explicitly - // discourage the use of = for Tensors. - TensorImpl& operator=(const TensorImpl& src) = delete; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - BaseStaticContext* GetStaticContext() const { - return get_static_context(GetDeviceType()); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. - */ - void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (storage_.dtype() != src.meta()) { - storage_ = Storage(GetDeviceType(), src.meta()); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - storage_.reset(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (storage_.dtype().copy()) { - CAFFE_ENFORCE( - GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->GetDevicetype() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of `num`. 
- */ - void ExtendTo(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. - */ - void Extend(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(storage_.dtype()); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - storage_.dtype(), oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(TIndex outer_dim) { - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. - * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(storage_.dtype()); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. 
- * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). - * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - numel_ * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const vector& dims) { - TIndex new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const vector& dims) { - Reshape(ToVectorTIndex(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = Storage(storage_.device_type(), storage_.dtype()); - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << storage_.dtype().name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - void swap(TensorImpl& other) noexcept { - std::swap(dims_, other.dims_); - std::swap(numel_, other.numel_); - std::swap(storage_, other.storage_); - } - - /** - * @brief Shares the data with another tensor. - * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. 
The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. - */ - template - void - ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { - ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { - ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); - } - - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - ShareExternalPointer( - at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); - } - - void ShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - } else { - int64_t numel = capacity / data_type.itemsize(); - // Create a new Storage - storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. 
- */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return storage_.data(); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. " - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - ", while tensor contains ", - storage_.dtype().name(), - ". "); - return static_cast(storage_.data()); - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { - return storage_.data(); - } else { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - bool had_special_dtor = storage_.dtype().dtor() != nullptr; - if (storage_.unique()) { - storage_.set_dtype(meta); - } else { - if (storage_.dtype() != meta) { - storage_ = Storage(storage_.device_type(), meta); - } - } - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.numel() >= numel_)) { - return storage_.data(); - } - const at::Allocator* allocator = storage_.allocator(); - // TODO: Get rid of StaticContext - CAFFE_ENFORCE( - allocator == nullptr, - "Allocator is not used within Caffe2 functions, please use StaticContext instead."); - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = storage_.dtype().dtor(); - void* ptr; - at::DeleterFnPtr deleter; - auto ptr_and_deleter = GetStaticContext()->New( - numel_ * storage_.itemsize()); // Removing this can get rid of - // InefficientStdFunctionContext - ptr = ptr_and_deleter.first; - deleter = ptr_and_deleter.second; - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr, - [size, dtor, deleter](void* local_ptr) -> void { - dtor(local_ptr, size); - deleter(local_ptr); - }, - at::Device(storage_.device_type()))); - storage_.dtype().ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. 
- auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr_and_deleter.first, - ptr_and_deleter.second, - at::Device(storage_.device_type()))); - } - storage_.set_numel(numel_); - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - storage_.dtype().id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(storage_.dtype()); - } - - /** - * Returns a typed pointer of the underlying storage. - * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. - */ - template - inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()); - } - // Check it here statically - otherwise TypeMeta would throw the runtime - // error in attempt to invoke TypeMeta::ctor() - static_assert( - std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); - return static_cast(raw_mutable_data(TypeMeta::Make())); - } - - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return dims_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline TIndex size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - - inline size_t capacity_nbytes() const { - return storage_.capacity(); - } - /** - * Returns the dimensions of the tensor as a vector. - */ - inline const vector& dims() const { - return dims_; - } - - inline TIndex size_from_dim(int k) const { - return size_from_dim_(k, dims_); - } - - inline TIndex size_to_dim(int k) const { - return size_to_dim_(k, dims_); - } - - inline TIndex size_between_dim(int k, int l) const { - return size_between_dim_(k, l, dims_); - } - - /** - * Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); - } - - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const TypeMeta& meta() const { - return storage_.dtype(); - } - - /** - * Returns the i-th dimension of the tensor in int. 
- * - * This function returns an int value instead of TIndex, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); - return static_cast(dims_[i]); - } - - /** - * Returns the i-th dimension of the tensor. Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. - */ - inline TIndex dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return dims_[i]; - } - - void ExtractDeviceOption(DeviceOption* device) const { - GetStaticContext()->ExtractDeviceOption(device, raw_data()); - } - - const Storage& storage() { - return storage_; - } - - const Storage& storage() const { - return storage_; - } - - protected: - using DimVector = std::vector; - DimVector dims_; // sizes_ - TIndex numel_ = -1; // numel_ - // we decide to keep reserved_ and it will - // live in Tensor after the split - // The logic is that if Extend() or ReserveSpace() were ever called, - // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - Storage storage_; - // int64_t storage_offset_; - - private: - template < - typename T, - typename = typename std::enable_if::value>::type> - bool SetDims(const vector& src) { - auto old_numel = numel_; - dims_.resize(src.size()); - TIndex new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { - new_numel *= src[i]; - dims_[i] = src[i]; - } - numel_ = new_numel; - return numel_ != old_numel; - } - - bool SetDims() { - auto old_numel = numel_; - dims_.resize(0); - numel_ = 1; - return numel_ != old_numel; - } - - // TODO(jiayq): maybe rewrite the following functions with initializer list. - // NVCC does not play well with initializer lists last time, but worth - // another shot. 
- bool SetDims(const TIndex d0) { - auto old_numel = numel_; - dims_.resize(1); - dims_[0] = d0; - numel_ = d0; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1) { - auto old_numel = numel_; - dims_.resize(2); - dims_[0] = d0; - dims_[1] = d1; - numel_ = d0 * d1; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { - auto old_numel = numel_; - dims_.resize(3); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - numel_ = d0 * d1 * d2; - return numel_ != old_numel; - } - - bool - SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { - auto old_numel = numel_; - dims_.resize(4); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - dims_[3] = d3; - numel_ = d0 * d1 * d2 * d3; - return numel_ != old_numel; - } -}; - class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { UndefinedTensorImpl() : TensorImpl(CPU){}; diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc new file mode 100644 index 00000000000000..cff98c6101ea5d --- /dev/null +++ b/caffe2/core/tensor_impl.cc @@ -0,0 +1,14 @@ +#include "caffe2/core/tensor_impl.h" + +#include "caffe2/core/flags.h" + +CAFFE2_DEFINE_bool( + caffe2_keep_on_shrink, + true, + "If set, keeps memory when a tensor is shrinking its size."); + +CAFFE2_DEFINE_int64( + caffe2_max_keep_on_shrink_memory, + LLONG_MAX, + "The maximum memory in bytes to keep on shrink, if the difference between " + "tensor sizes is bigger than this then tensor will be reset."); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h new file mode 100644 index 00000000000000..ffa3ef61c4cc27 --- /dev/null +++ b/caffe2/core/tensor_impl.h @@ -0,0 +1,902 @@ +#pragma once + +#include "caffe2/core/storage.h" + +#include + +// A global boolean variable to control whether we free memory when a Tensor +// is shrinked to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. +CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + +namespace caffe2 { + +/** + * A utility function to convert vector to vector. 
+ */ +inline vector ToVectorTIndex(const std::vector& src) { + return vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline TIndex size_from_dim_(int k, const vector& dims) { + TIndex r = 1; + for (size_t i = k; i < dims.size(); ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline TIndex size_to_dim_(int k, const vector& dims) { + CAFFE_ENFORCE((unsigned)k <= dims.size()); + TIndex r = 1; + for (int i = 0; i < k; ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline TIndex size_between_dim_(int k, int l, const vector& dims) { + CAFFE_ENFORCE((unsigned)l < dims.size()); + TIndex r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +inline int canonical_axis_index_(int axis_index, int ndims) { + CAFFE_ENFORCE_GE(axis_index, -ndims); + CAFFE_ENFORCE_LT(axis_index, ndims); + if (axis_index < 0) { + return axis_index + ndims; + } + return axis_index; +} + +/** + * @brief TensorImpl is the implementation of a tensor and the basic class + * in Caffe2 that stores a contiguous memory with its shape information. + * + * The TensorImpl class is essentially a wrapper around a device-specific memory + * (the device is specified by the Context template argument), and deals with + * the allocation and de-allocation of such memory. We make a simplified + * assumption that the memory is always contiguous. + */ +class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { + public: + TensorImpl() = delete; + explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} + + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? + explicit TensorImpl(const vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + explicit TensorImpl(const vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + /* Now we require that context_for_copy has the same device type as src since + * template is removed + */ + TensorImpl( + const TensorImpl& src, + BaseContext* context_for_copy, + at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src, context_for_copy); + } + + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ + TensorImpl(const TensorImpl& src, at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src); + } + + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ + template + TensorImpl( + const vector& dims, + const vector& values, + BaseContext* context) + : storage_(context->GetDevicetype(), TypeMeta::Make()) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, values.data(), mutable_data()); + } + + /** + * @brief Creates a scalar tensor, and fills its content with the given value. 
+ * The type of tensor will be decided by the context parameter + */ + template < + typename T, + typename = typename std::enable_if::value>::type> + TensorImpl(const T& value, BaseContext* context) + : storage_(context->GetDevicetype(), TypeMeta::Make()) { + Resize(vector{}); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, &value, mutable_data()); + } + + /** + * @brief Delete the copy constructor and use Clone explicitly + */ + TensorImpl(const TensorImpl& src) = delete; + + TensorImpl(TensorImpl&& src) noexcept { + swap(src); + } + + TensorImpl& operator=(TensorImpl&&) = default; + // Note(jiayq): possibly a rule-of-three violation, but we explicitly + // discourage the use of = for Tensors. + TensorImpl& operator=(const TensorImpl& src) = delete; + + virtual ~TensorImpl() noexcept {} + + /* + * Since we removed template from tensor, we now store a static + * context pointer in tensor, which indicates the type of the tensor. + */ + BaseStaticContext* GetStaticContext() const { + return get_static_context(GetDeviceType()); + } + + /* @brief + * Create a context that has the same device_type + * as the tensor. + * Note that this doesn't support passing in argument + * TODO(jerryzh): move this to a global registry + * that can create context for us + */ + std::unique_ptr CreateContext() const { + return GetStaticContext()->CreateContext(); + } + + at::DeviceType GetDeviceType() const { + return storage_.device_type(); + } + + /** + * @brief Copies the data from a source tensor, with a contex provided to + * carry out the underlying memcpy operation. + */ + void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (storage_.dtype() != src.meta()) { + storage_ = Storage(GetDeviceType(), src.meta()); + } + if (src.size() == -1) { + dims_.clear(); + numel_ = -1; + storage_.reset(); + return; + } + Resize(src.dims()); + if (size() > 0) { + if (storage_.dtype().copy()) { + CAFFE_ENFORCE( + GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { + if (!context) { + src.CreateContext()->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } else { + CAFFE_ENFORCE( + context->GetDevicetype() == src.GetDeviceType(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext()->CopyBytesFromCPU( + nbytes(), src.raw_data(), raw_mutable_data()); + } + } + } + } + + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. 
+ */ + void ExtendTo(TIndex num, float growthPct, BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - dims_[0], growthPct, context); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. + */ + void Extend(TIndex num, float growthPct, BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + auto newDims = dims_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + dims_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = dims_; + newCapacity[0] = std::max( + newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + auto* newData = raw_mutable_data(storage_.dtype()); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + storage_.dtype(), oldSize, oldData.get(), newData); + reserved_ = true; + dims_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shurnk tensor is maintained. + */ + void ShrinkTo(TIndex outer_dim) { + CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= dims_[0], + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + storage_.unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + dims_[0] = outer_dim; + numel_ = std::accumulate( + dims_.begin(), + dims_.end(), + static_cast(1), + std::multiplies()); + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = dims_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(storage_.dtype()); + dims_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. 
+ * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). + * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. + bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || + !FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - numel_ * storage_.itemsize() > + FLAGS_caffe2_max_keep_on_shrink_memory; + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + */ + inline void ResizeLike(const TensorImpl& src_tensor) { + // Note: need casting for different context types. + if (static_cast(this) != static_cast(&src_tensor)) { + Resize(src_tensor.dims()); + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const vector& dims) { + TIndex new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + dims_ = dims; + } + + inline void Reshape(const vector& dims) { + Reshape(ToVectorTIndex(dims)); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = Storage(storage_.device_type(), storage_.dtype()); + } + + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. + */ + string DebugString() const { + std::stringstream ss; + ss << "A Tensor of item size " << storage_.itemsize() << " and type " + << storage_.dtype().name() << " and dimension ("; + for (int d : dims_) { + ss << d << ","; + } + ss << ")."; + return ss.str(); + } + + void swap(TensorImpl& other) noexcept { + std::swap(dims_, other.dims_); + std::swap(numel_, other.numel_); + std::swap(storage_, other.storage_); + } + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. 
The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. + /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + } + + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. + */ + template + void + ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); + } + + void ShareExternalPointer( + void* src, + const TypeMeta& data_type, + size_t capacity = 0, + MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); + } + } + + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. 
+ */ + inline const void* raw_data() const { + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); + return storage_.data(); + } + + /** + * Returns a typed pointer of the underlying storage. mutable_data() or + * raw_mutable_data() must have been called prior to this function call, and + * the data type must be of the correct type. If you want to get a void* + * pointer instead, use raw_data(). + */ + template + inline const T* data() const { + CAFFE_ENFORCE_WITH_CALLER( + storage_.data() || numel_ == 0, + "The tensor is of non-zero shape, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + CAFFE_ENFORCE_WITH_CALLER( + IsType(), + "Tensor type mismatch, caller expects elements to be ", + TypeMeta::TypeName(), + ", while tensor contains ", + storage_.dtype().name(), + ". "); + return static_cast(storage_.data()); + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { + return storage_.data(); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = storage_.dtype().dtor() != nullptr; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (storage_.dtype() != meta) { + storage_ = Storage(storage_.device_type(), meta); + } + } + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = storage_.dtype().dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + storage_.dtype().ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. 
+ auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + return storage_.data(); + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data() { + CAFFE_ENFORCE_WITH_CALLER( + storage_.dtype().id() != TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(storage_.dtype()); + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. + */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && IsType()) { + return static_cast(storage_.data()); + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(TypeMeta::Make())); + } + + /** + * Returns the number of dimensions of the data. + */ + inline int ndim() const { + return dims_.size(); + } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ + inline TIndex size() const { + return numel_; + } + /** + * Return the number of bytes each item takes in the tensor. + */ + inline size_t itemsize() const { + return storage_.itemsize(); + } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). + */ + inline size_t nbytes() const { + return numel_ * itemsize(); + ; + } + + inline size_t capacity_nbytes() const { + return storage_.capacity(); + } + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const vector& dims() const { + return dims_; + } + + inline TIndex size_from_dim(int k) const { + return size_from_dim_(k, dims_); + } + + inline TIndex size_to_dim(int k) const { + return size_to_dim_(k, dims_); + } + + inline TIndex size_between_dim(int k, int l) const { + return size_between_dim_(k, l, dims_); + } + + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < ndim(), return index. + * If -ndim <= index <= -1, return (ndim() - (-index)), + * e.g., the last axis index (ndim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int canonical_axis_index(int axis_index) const { + return canonical_axis_index_(axis_index, ndim()); + } + + /** + * Checks if the tensor content is of the given data type. + */ + template + inline bool IsType() const { + return storage_.IsType(); + } + /** + * Returns the TypeMeta object associated with the current data type. + */ + inline const TypeMeta& meta() const { + return storage_.dtype(); + } + + /** + * Returns the i-th dimension of the tensor in int. 
+ * + * This function returns an int value instead of TIndex, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. + */ + inline int dim32(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); + return static_cast(dims_[i]); + } + + /** + * Returns the i-th dimension of the tensor. Note that the passed in index + * must be between 0 (inclusive) and the number of dimensions, otherwise + * this function will produce a fatal message. + */ + inline TIndex dim(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + return dims_[i]; + } + + void ExtractDeviceOption(DeviceOption* device) const { + GetStaticContext()->ExtractDeviceOption(device, raw_data()); + } + + const Storage& storage() { + return storage_; + } + + const Storage& storage() const { + return storage_; + } + + protected: + using DimVector = std::vector; + DimVector dims_; // sizes_ + TIndex numel_ = -1; // numel_ + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ = false; + Storage storage_; + // int64_t storage_offset_; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const vector& src) { + auto old_numel = numel_; + dims_.resize(src.size()); + TIndex new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + dims_[i] = src[i]; + } + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + dims_.resize(0); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. + bool SetDims(const TIndex d0) { + auto old_numel = numel_; + dims_.resize(1); + dims_[0] = d0; + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1) { + auto old_numel = numel_; + dims_.resize(2); + dims_[0] = d0; + dims_[1] = d1; + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { + auto old_numel = numel_; + dims_.resize(3); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { + auto old_numel = numel_; + dims_.resize(4); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + dims_[3] = d3; + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } +}; + +} From 75f49befeb922a577a1e0e38f26e963b57c1f8ea Mon Sep 17 00:00:00 2001 From: Roy Li Date: Thu, 13 Sep 2018 12:25:20 -0700 Subject: [PATCH 126/237] move instance_norm to aten (#10792) Summary: This also removes the usage of torch.onnx.symbolic_override in instance_norm. Fixes #8439. 
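As a rough sketch (not part of this patch; the input shape and affine parameters are arbitrary), the Python entry point looks the same to callers and simply forwards to the new ATen op:

```
# Hypothetical usage: F.instance_norm now dispatches to torch.instance_norm,
# the ATen kernel added in this patch, instead of reshaping and calling
# batch_norm from Python.
import torch
import torch.nn.functional as F

x = torch.randn(5, 3, 64, 64)   # (N, C, H, W)
weight = torch.ones(3)          # optional per-channel scale
bias = torch.zeros(3)           # optional per-channel shift

out = F.instance_norm(x, weight=weight, bias=bias,
                      use_input_stats=True, momentum=0.1, eps=1e-5)
assert out.shape == x.shape
```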
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10792 Differential Revision: D9800643 Pulled By: li-roy fbshipit-source-id: fa13a57de5a31fbfa2d4d02639d214c867b9e1f1 --- aten/src/ATen/native/Normalization.cpp | 39 +++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 3 ++ test/test_jit.py | 15 ++++++-- torch/nn/functional.py | 44 ++-------------------- torch/onnx/symbolic.py | 32 ++++++++-------- 5 files changed, 73 insertions(+), 60 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 750e42568715d7..ed0a94ae496718 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -14,6 +14,13 @@ namespace { AT_CHECK(actual == expected, arg_name, " should contain ", expected, " elements not ", actual); } + + static inline Tensor repeat_if_defined(const Tensor& t, int64_t repeat) { + if (t.defined()) { + return t.repeat(repeat); + } + return t; + } } Tensor batch_norm( @@ -80,6 +87,38 @@ Tensor batch_norm( running_mean, running_var, training, momentum, eps); } +Tensor instance_norm( + const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, + const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, + bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { + AT_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), + "Expected running_mean and running_var to be defined when use_input_stats is false"); + std::vector shape = input.sizes().vec(); + int64_t b = input.size(0); + int64_t c = input.size(1); + shape[1] = b * c; + shape[0] = 1; + + Tensor weight_ = repeat_if_defined(weight, b); + Tensor bias_ = repeat_if_defined(bias, b); + Tensor running_mean_ = repeat_if_defined(running_mean, b); + Tensor running_var_ = repeat_if_defined(running_var, b); + + auto input_reshaped = input.contiguous().view(shape); + auto out = at::batch_norm(input_reshaped, weight_, bias_, running_mean_, running_var_, + use_input_stats, momentum, eps, cudnn_enabled); + + // we alias running_mean and running_var because they are const but we want to modify their data + if (running_mean.defined()) { + at::alias(running_mean).copy_(running_mean_.view({ b, c }).mean(0, false)); + } + if (running_var.defined()) { + at::alias(running_var).copy_(running_var_.view({ b, c }).mean(0, false)); + } + + return out.view(input.sizes()); +} + Tensor layer_norm(const Tensor& input, IntList normalized_shape, const Tensor& weight /* optional */, const Tensor& bias /* optional */, double eps, bool cudnn_enabled) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index af58e45ebe0ebd..c15aefa2f671d6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -873,6 +873,9 @@ - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor variants: function, method +- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? 
running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) -> Tensor + variants: function + - func: inverse(Tensor self) -> Tensor variants: function, method diff --git a/test/test_jit.py b/test/test_jit.py index 61a6b94db9935c..f6a4347eae86cb 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6966,9 +6966,8 @@ def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) - # XXX: When this is fixed, write a CUDA test for this. - @unittest.skip('https://github.com/pytorch/pytorch/issues/8439 InstanceNormalization bug') - def test_neural_style(self): + @staticmethod + def _test_neural_style(self, device, check_export_import=True): class TransformerNet(torch.nn.Module): def __init__(self): super(TransformerNet, self).__init__() @@ -7065,7 +7064,15 @@ def forward(self, x): out = self.conv2d(out) return out - self.checkTrace(TransformerNet(), (torch.rand(5, 3, 224, 224),)) + self.checkTrace(TransformerNet(), (torch.rand(5, 3, 64, 64),), export_import=check_export_import) + + def test_neural_style(self): + self._test_neural_style(self, device='cpu') + + @unittest.skipIf(not RUN_CUDA, "no CUDA") + def test_neural_style_cuda(self): + # XXX: export_import on CUDA modules doesn't work (#11480) + self._test_neural_style(self, device='cuda', check_export_import=False) @staticmethod def _test_mnist(self, device, check_export_import=True): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 7823d445a0c13a..690112424db560 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1357,46 +1357,10 @@ def instance_norm(input, running_mean=None, running_var=None, weight=None, See :class:`~torch.nn.InstanceNorm1d`, :class:`~torch.nn.InstanceNorm2d`, :class:`~torch.nn.InstanceNorm3d` for details. 
""" - if not use_input_stats and (running_mean is None or running_var is None): - raise ValueError('Expected running_mean and running_var to be not None when use_input_stats=False') - - b, c = input.size(0), input.size(1) - if weight is not None: - weight = weight.repeat(b) - if bias is not None: - bias = bias.repeat(b) - - import torch.onnx.symbolic - - @torch.onnx.symbolic_override(torch.onnx.symbolic.instance_norm) - def _instance_norm(input, running_mean=None, running_var=None, weight=None, - bias=None, use_input_stats=None, momentum=None, eps=None): - # Repeat stored stats and affine transform params if necessary - if running_mean is not None: - running_mean_orig = running_mean - running_mean = running_mean_orig.repeat(b) - if running_var is not None: - running_var_orig = running_var - running_var = running_var_orig.repeat(b) - - # Apply instance norm - input_reshaped = input.contiguous().view(1, b * c, *input.size()[2:]) - - out = batch_norm( - input_reshaped, running_mean, running_var, weight=weight, bias=bias, - training=use_input_stats, momentum=momentum, eps=eps) - - # Reshape and copy back - if running_mean is not None: - running_mean_orig.copy_(running_mean.view(b, c).mean(0, keepdim=False)) - if running_var is not None: - running_var_orig.copy_(running_var.view(b, c).mean(0, keepdim=False)) - - return out.view(b, c, *input.size()[2:]) - return _instance_norm(input, running_mean=running_mean, - running_var=running_var, weight=weight, bias=bias, - use_input_stats=use_input_stats, momentum=momentum, - eps=eps) + return torch.instance_norm( + input, weight, bias, running_mean, running_var, + use_input_stats, momentum, eps, torch.backends.cudnn.enabled + ) def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5): diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index d9c5964109a0c4..30e8672be601fd 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -716,6 +716,22 @@ def batch_norm(g, input, weight, bias, running_mean, running_var, training, mome return res +@parse_args('v', 'v', 'v', 'v', 'v', 'i', 'f', 'f', 'i') +def instance_norm(g, input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled): + input_sizes = input.type().sizes() + if weight is None or weight.node().kind() == "prim::Undefined": + assert len(input_sizes) > 1 + weight_value = torch.tensor([1.] * input_sizes[1]).type( + 'torch.' + input.type().scalarType() + 'Tensor') + weight = g.op("Constant", value_t=weight_value) + if bias is None or bias.node().kind() == "prim::Undefined": + assert len(input_sizes) > 1 + bias_value = torch.tensor([0.] * input_sizes[1]).type( + 'torch.' 
+ input.type().scalarType() + 'Tensor') + bias = g.op("Constant", value_t=bias_value) + return g.op("InstanceNormalization", input, weight, bias, epsilon_f=eps) + + @parse_args('v', 'i', 'i', 'i') def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) @@ -1027,22 +1043,6 @@ def repeat(g, self, repeats): return g.op("Tile", self, repeats) -def instance_norm(g, input, **kwargs): - input_type = input.type().scalarType() - weight = kwargs.get("weight", None) - bias = kwargs.get("bias", None) - eps = kwargs.get("eps", 1e-5) - if weight is None: - weight = g.constant(1.0, [input.type().sizes()[1]], input_type) - else: - weight = g.op('Constant', value_t=weight) - if bias is None: - bias = g.constant(0.0, [input.type().sizes()[1]], input_type) - else: - bias = g.op('Constant', value_t=bias) - return g.op("InstanceNormalization", input, weight, bias, epsilon_f=eps) - - def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases, num_layers, dropout, train, bidirectional, batch_first=None, batch_sizes=None): weights_per_layer = 4 if has_biases else 2 From acb6f18bab4bf7c801e445cf9b438cec827829ae Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Thu, 13 Sep 2018 12:26:58 -0700 Subject: [PATCH 127/237] fix generate_code.py caching (#11644) Summary: Currently, because of some setup.py logic, `ninja` caching of the `generate_code.py` build step was broken. This resulted in `generate_code.py` running every single time builds were happening, regardless of whether inputs changed. This updated logic fixes the input caching Pull Request resolved: https://github.com/pytorch/pytorch/pull/11644 Reviewed By: orionr Differential Revision: D9814348 Pulled By: soumith fbshipit-source-id: 2012960908d0f600488d410094095cfd72adc34f --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 03e7b745a83539..533b6dee4fcc53 100644 --- a/setup.py +++ b/setup.py @@ -123,6 +123,7 @@ import distutils.command.build import distutils.command.clean import distutils.sysconfig +import filecmp import platform import subprocess import shutil @@ -457,9 +458,14 @@ def check_file(f): sym_files = ['tools/shared/cwrap_common.py', 'tools/shared/_utils_internal.py'] orig_files = ['aten/src/ATen/common_with_cwrap.py', 'torch/_utils_internal.py'] for sym_file, orig_file in zip(sym_files, orig_files): + same = False if os.path.exists(sym_file): - os.remove(sym_file) - shutil.copyfile(orig_file, sym_file) + if filecmp.cmp(sym_file, orig_file): + same = True + else: + os.remove(sym_file) + if not same: + shutil.copyfile(orig_file, sym_file) # Copy headers necessary to compile C++ extensions. # From 0f1ca569ceae07b800f037805aa60691b8a2e801 Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 13 Sep 2018 12:32:41 -0700 Subject: [PATCH 128/237] End-to-end dynamic slicing with ONNX DynamicSlice experimental operator (#11255) Summary: Requires https://github.com/onnx/onnx/pull/1377 This PR makes it so that slices with dynamic boundary values can be exported from pytorch and run in caffe2 via ONNX. 
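As a concrete illustration, a toy module along the lines of the new tests in this patch, whose slice bounds depend on the input's runtime shape, can now be exported; the shape-dependent bounds are lowered to the experimental DynamicSlice op instead of a static Slice:

    import io
    import torch

    class DynamicSliceModule(torch.nn.Module):
        def forward(self, x):
            # Bounds depend on x's runtime shape, so they cannot be baked
            # into the static starts/ends attributes of an ONNX Slice node.
            return x[: x.size(0) - 1, 1 : x.size(2)]

    x = torch.rand(5, 5, 5)
    f = io.BytesIO()
    torch.onnx.export(DynamicSliceModule(), (x,), f)

On the caffe2 import side, DynamicSlice is mapped back onto the existing Slice operator after fully expanding the starts/ends index tensors (see Caffe2Backend::CreateDynamicSlice in the backend.cc diff below).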
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11255 Differential Revision: D9790216 Pulled By: jamesr66a fbshipit-source-id: 6adfcddc5788df4d34d7ca98341077140402a3e2 --- caffe2/onnx/backend.cc | 169 +++++++++++++++- caffe2/onnx/backend.h | 13 +- caffe2/operators/slice_op.cc | 4 +- caffe2/operators/slice_op.cu | 180 ++++++++++++------ caffe2/operators/slice_op.h | 49 +++-- caffe2/python/onnx/backend.py | 2 +- ...estScript.test_export_dynamic_slice.expect | 40 ++++ .../expect/TestOperators.test_acos.expect | 2 +- .../TestOperators.test_add_broadcast.expect | 2 +- ...stOperators.test_add_left_broadcast.expect | 2 +- ...tOperators.test_add_size1_broadcast.expect | 2 +- ...tors.test_add_size1_right_broadcast.expect | 2 +- ....test_add_size1_singleton_broadcast.expect | 2 +- .../TestOperators.test_addconstant.expect | 2 +- .../expect/TestOperators.test_addmm.expect | 2 +- .../expect/TestOperators.test_asin.expect | 2 +- .../expect/TestOperators.test_at_op.expect | 2 +- .../expect/TestOperators.test_atan.expect | 2 +- .../expect/TestOperators.test_basic.expect | 2 +- .../TestOperators.test_batchnorm.expect | 2 +- .../TestOperators.test_batchnorm_1d.expect | 2 +- ...stOperators.test_batchnorm_noaffine.expect | 2 +- ...stOperators.test_batchnorm_training.expect | 2 +- .../expect/TestOperators.test_chunk.expect | 2 +- .../expect/TestOperators.test_clip.expect | 2 +- .../expect/TestOperators.test_clip_max.expect | 2 +- .../expect/TestOperators.test_clip_min.expect | 2 +- .../expect/TestOperators.test_concat2.expect | 2 +- .../expect/TestOperators.test_conv.expect | 2 +- .../TestOperators.test_convtranspose.expect | 2 +- .../onnx/expect/TestOperators.test_cos.expect | 2 +- .../onnx/expect/TestOperators.test_elu.expect | 2 +- .../expect/TestOperators.test_equal.expect | 2 +- .../onnx/expect/TestOperators.test_exp.expect | 2 +- .../expect/TestOperators.test_flatten.expect | 2 +- test/onnx/expect/TestOperators.test_ge.expect | 2 +- test/onnx/expect/TestOperators.test_gt.expect | 2 +- .../expect/TestOperators.test_hardtanh.expect | 2 +- .../expect/TestOperators.test_index.expect | 2 +- test/onnx/expect/TestOperators.test_le.expect | 2 +- .../TestOperators.test_logsoftmax.expect | 2 +- test/onnx/expect/TestOperators.test_lt.expect | 2 +- .../onnx/expect/TestOperators.test_max.expect | 2 +- .../expect/TestOperators.test_maxpool.expect | 2 +- .../expect/TestOperators.test_mean.expect | 2 +- .../onnx/expect/TestOperators.test_min.expect | 2 +- test/onnx/expect/TestOperators.test_mm.expect | 2 +- .../expect/TestOperators.test_norm.expect | 2 +- .../onnx/expect/TestOperators.test_pad.expect | 2 +- .../expect/TestOperators.test_params.expect | 2 +- .../expect/TestOperators.test_permute2.expect | 2 +- .../onnx/expect/TestOperators.test_pow.expect | 2 +- .../expect/TestOperators.test_prod.expect | 2 +- .../TestOperators.test_reduced_mean.expect | 2 +- ...Operators.test_reduced_mean_keepdim.expect | 2 +- .../TestOperators.test_reduced_prod.expect | 2 +- ...Operators.test_reduced_prod_keepdim.expect | 2 +- .../TestOperators.test_reduced_sum.expect | 2 +- ...tOperators.test_reduced_sum_keepdim.expect | 2 +- .../expect/TestOperators.test_repeat.expect | 2 +- ...tOperators.test_repeat_dim_overflow.expect | 2 +- .../expect/TestOperators.test_rsub.expect | 2 +- .../expect/TestOperators.test_selu.expect | 2 +- .../onnx/expect/TestOperators.test_sin.expect | 2 +- .../expect/TestOperators.test_slice.expect | 2 +- .../expect/TestOperators.test_sqrt.expect | 2 +- .../onnx/expect/TestOperators.test_sum.expect 
| 2 +- ...estOperators.test_symbolic_override.expect | 2 +- ...ators.test_symbolic_override_nested.expect | 2 +- .../onnx/expect/TestOperators.test_tan.expect | 2 +- .../TestOperators.test_transpose.expect | 2 +- .../expect/TestOperators.test_type_as.expect | 2 +- .../TestOperators.test_unsqueeze.expect | 2 +- .../expect/TestOperators.test_upsample.expect | 2 +- .../expect/TestOperators.test_view.expect | 2 +- test/onnx/test_pytorch_onnx_caffe2.py | 50 +++-- test/test_jit.py | 19 ++ torch/onnx/symbolic.py | 16 +- 78 files changed, 503 insertions(+), 175 deletions(-) create mode 100644 test/expect/TestScript.test_export_dynamic_slice.expect diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 5ccea70926fb6f..2350910febff27 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -301,7 +301,8 @@ Caffe2Backend::get_renamed_operators() const { {"Less", "LT"}, {"Greater", "GT"}, {"Unsqueeze", "ExpandDims"}, - {"Tile", "NumpyTile"}}; + {"Tile", "NumpyTile"}, + {"DynamicSlice", "Slice"}}; return kRenamedOperators; } @@ -356,7 +357,8 @@ Caffe2Backend::get_special_operators() const { {"MatMul", &Caffe2Backend::CreateMatMul}, {"Upsample", &Caffe2Backend::CreateUpsample}, {"Dropout", &Caffe2Backend::CreateDropout}, - {"LRN", &Caffe2Backend::CreateLRN}}; + {"LRN", &Caffe2Backend::CreateLRN}, + {"DynamicSlice", &Caffe2Backend::CreateDynamicSlice}}; return kSpecialOperators; } @@ -899,7 +901,6 @@ Caffe2Ops Caffe2Backend::CreateSlice( auto starts_vals_tensor = dummy_->NewDummyName(); auto starts_tensor = dummy_->NewDummyName(); - auto casted_starts_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_starts; @@ -936,12 +937,9 @@ Caffe2Ops Caffe2Backend::CreateSlice( caffe2::Argument to; to.set_name("to"); to.set_i(static_cast(caffe2::TensorProto::INT32)); - c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {starts_tensor}, {casted_starts_tensor}, {to}); auto ends_vals_tensor = dummy_->NewDummyName(); auto ends_tensor = dummy_->NewDummyName(); - auto casted_ends_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_ends; @@ -965,17 +963,168 @@ Caffe2Ops Caffe2Backend::CreateSlice( "ScatterAssign", {ends_tensor, axes_tensor, ends_vals_tensor}, {ends_tensor}); - // Slice only accepts ends as int + + // attach the original op at the end + c2_op = ret.ops.Add(); + c2_op->CopyFrom(*op); + c2_op->mutable_input()->Clear(); + c2_op->add_input(data); + c2_op->add_input(starts_tensor); + c2_op->add_input(ends_tensor); + c2_op->mutable_arg()->Clear(); + for (const auto& kv : args) { + c2_op->add_arg()->CopyFrom(*kv.second); + } + + return ret; +} + +// Do the following: +// for a given index tensor (i.e. `starts` or `ends`): +// 1) Hilariously subtract 1 from the value if it is negative. This due to +// the behavior of Caffe2's slice operator not matching that of ONNX's slice +// 2) Fully expand the index tensor out to the rank of the data tensor. 
+// pseudocode: indices_full = zeros(rank); indices_full[axes] = indices.int() +std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value) { + auto indices_tensor_full = dummy_->NewDummyName(); + + { + caffe2::Argument value; + value.set_name("value"); + value.set_i(default_value); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument input_as_shape; + input_as_shape.set_name("input_as_shape"); + input_as_shape.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ConstantFill", {rank_tensor}, {indices_tensor_full}, + {value, dtype, input_as_shape}); + } + + // Subtract 1 from each element of the indices tensor that is negative + auto lt_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "LT", {indices_tensor, zero_tensor}, {lt_tensor}, {broadcast}); + } + + auto sub_one_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Sub", {indices_tensor, one_tensor}, {sub_one_tensor}, {broadcast}); + } + + auto indices_tensor_adjusted = dummy_->NewDummyName(); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Conditional", {lt_tensor, sub_one_tensor, indices_tensor}, {indices_tensor_adjusted}, {}); + + // Fill in values specified from the partially-specified ONNX indices tensor + c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ScatterAssign", + {indices_tensor_full, axes_tensor, indices_tensor_adjusted}, + {indices_tensor_full}); + + return indices_tensor_full; +} + +Caffe2Ops Caffe2Backend::CreateDynamicSlice( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); + CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1); + auto* op = op_tmp.ops.Mutable(0); + std::unordered_map args; + for (auto& arg : *op->mutable_arg()) { + args.emplace(arg.name(), &arg); + } + + CAFFE_ENFORCE_GE(op->input_size(), 1); + auto data = op->input(0); + Caffe2Ops ret; + + // First get the shape of the input tensor + auto* c2_op = ret.ops.Add(); + auto size_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {data}, {size_tensor}); + + // Now get the rank of the tensor by getting the shape of the shape of + // the input tensor c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {ends_tensor}, {casted_ends_tensor}, {to}); + auto rank_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {size_tensor}, {rank_tensor}); + + // Axes tensor will be used to populate the fully-specified starts and ends + // arguments to the caffe2 Slice operator. 
+ std::string axes_tensor; + if (onnx_node->node.input_size() > 2) { + axes_tensor = onnx_node->node.input(3); + } else { + axes_tensor = dummy_->NewDummyName(); + auto* c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Range", {rank_tensor}, {axes_tensor}, {}); + } + + // Useful int tensors + auto define_integer_constant = [this, &ret](int val) { + caffe2::Argument value; + value.set_name("value"); + value.set_i(val); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument shape; + shape.set_name("shape"); + shape.add_ints(1); + auto c2_op = ret.ops.Add(); + auto name = dummy_->NewDummyName(); + BuildOperator(c2_op, "ConstantFill", {}, {name}, + {value, dtype, shape}); + return name; + }; + + auto zero_tensor = define_integer_constant(0); + auto one_tensor = define_integer_constant(1); + + auto starts_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(1), // starts + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + 0); + + auto ends_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(2), // ends + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + -1); // attach the original op at the end c2_op = ret.ops.Add(); c2_op->CopyFrom(*op); c2_op->mutable_input()->Clear(); c2_op->add_input(data); - c2_op->add_input(casted_starts_tensor); - c2_op->add_input(casted_ends_tensor); + c2_op->add_input(starts_tensor_full); + c2_op->add_input(ends_tensor_full); c2_op->mutable_arg()->Clear(); for (const auto& kv : args) { c2_op->add_arg()->CopyFrom(*kv.second); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index eab0e2f7e1f131..2b74dec1e3ccea 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,7 +11,7 @@ #include #include -constexpr int kKnownOpsetVersion = 7; +constexpr int kKnownOpsetVersion = 9; namespace caffe2 { namespace onnx { @@ -212,6 +212,17 @@ class CAFFE2_API Caffe2Backend { Caffe2Ops CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + std::string PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value); + + Caffe2Ops CreateDynamicSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx); Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx); diff --git a/caffe2/operators/slice_op.cc b/caffe2/operators/slice_op.cc index 529394b9092555..93c039c965f448 100644 --- a/caffe2/operators/slice_op.cc +++ b/caffe2/operators/slice_op.cc @@ -3,8 +3,8 @@ namespace caffe2 { -REGISTER_CPU_OPERATOR(Slice, SliceOp); -REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); +REGISTER_CPU_OPERATOR(Slice, SliceOp); +REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); OPERATOR_SCHEMA(Slice) .NumInputs(1, 3) diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index e2523ad7cbf3fc..5de302814ba2aa 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -231,79 +231,133 @@ bool SliceImplGpu( } // namespace -template <> -bool SliceOp::RunOnDevice() { - auto* output = Output(0); - auto& data = Input(0); +template<> +class SliceOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template 
GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + bool RunOnDevice() override { + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() > 1) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * ends_.size()); - statically_inited_ = true; + template + bool DoRunWithType() { + auto* output = Output(0); + auto& data = Input(0); + + if (InputSize() > 1) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + statically_inited_ = true; + } } + + return SliceImplGpu( + output, data, starts_host_, ends_host_, &context_); } + private: + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; - return SliceImplGpu( - output, data, starts_host_, ends_host_, &context_); -} +}; // class SliceOp -REGISTER_CUDA_OPERATOR(Slice, SliceOp); +REGISTER_CUDA_OPERATOR(Slice, SliceOp); template <> -bool SliceGradientOp::RunOnDevice() { - auto* gdata = Output(0); - auto& data = Input(0); +class SliceGradientOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() == 4) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); + template + bool DoRunWithType() { + auto* gdata = Output(0); + auto& data = Input(0); - auto& go = Input(3); + if (InputSize() == 4) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * ends_.size()); - - statically_inited_ = true; - } - auto& go = Input(1); + auto& go = Input(3); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, 
&context_, gdata, &go); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + + statically_inited_ = true; + } + auto& go = Input(1); + + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + } } -} -REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); + private: + + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; +}; // class SliceGradientOp +REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); } // namespace caffe2 diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 8f291affb8e8d0..aa8d4e50f0f9d9 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -198,22 +198,26 @@ bool SliceImpl( } // namespace -template +template class SliceOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} bool RunOnDevice() override { - return RunOnDeviceImpl(Input(0), Output(0)); + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } } - protected: - bool RunOnDeviceImpl(const Tensor& data, Tensor* output) { + template + bool DoRunWithType() { if (InputSize() > 1) { starts_host_.CopyFrom(Input(1)); ends_host_.CopyFrom(Input(2)); @@ -238,31 +242,45 @@ class SliceOp : public Operator { } } + auto data = Input(0); + auto output = Output(0); + return SliceImpl( output, data, starts_host_, ends_host_, &context_); } AT_DISABLE_COPY_AND_ASSIGN(SliceOp); - private: - std::vector starts_; - std::vector ends_; + protected: + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; }; -template +template class SliceGradientOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } + + template + bool DoRunWithType() { auto* gdata = Output(0); auto& data = Input(0); @@ -301,11 +319,10 @@ class SliceGradientOp : public Operator { } } - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); - private: - std::vector starts_; - std::vector ends_; + + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 79dadb091488e6..3d9239c8b5c92b 100644 --- 
a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -140,7 +140,7 @@ class Caffe2Backend(Backend): # If you increase this, make SURE you cross-reference all BC-breaking # changes from one version to the next, and any that you did not # implement, mark as broken in _broken_operators - _known_opset_version = 7 + _known_opset_version = 9 # This dictionary will record operators which are KNOWN to be # broken, so we give a good error message rather than do something diff --git a/test/expect/TestScript.test_export_dynamic_slice.expect b/test/expect/TestScript.test_export_dynamic_slice.expect new file mode 100644 index 00000000000000..7182fec515ae74 --- /dev/null +++ b/test/expect/TestScript.test_export_dynamic_slice.expect @@ -0,0 +1,40 @@ +ModelProto { + producer_name: "pytorch" + domain: "" + doc_string: "" + graph: + GraphProto { + name: "torch-jit-export" + inputs: [{name: "x", type:Tensor dims: 3 4 5}] + outputs: [{name: "7", type:Tensor dims: 4 5}] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Gather", inputs: [x,2], outputs: [3], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Shape", inputs: [x], outputs: [4], attributes: []}, + Node {type: "Gather", inputs: [4,1], outputs: [5], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Loop", inputs: [5,6,3], outputs: [7], attributes: [{ name: 'body', type: graph, value: + GraphProto { + name: "torch-jit-export1" + inputs: [{name: "i", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "10", type:Tensor dims: }] + outputs: [{name: "18", type:Tensor dims: },{name: "17", type:Tensor dims: }] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [11], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Unsqueeze", inputs: [2], outputs: [12], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [i], outputs: [13], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [11], outputs: [14], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "DynamicSlice", inputs: [x,12,13,14], outputs: [15], attributes: []}, + Node {type: "ReduceSum", inputs: [15], outputs: [16], attributes: [{ name: 'axes', type: ints, values: [0]},{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Add", inputs: [10,16], outputs: [17], attributes: []}, + Node {type: "Constant", inputs: [], outputs: [18], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]} + ] + } + + }]} + ] + } + opset_import: [OperatorSetIdProto { domain: }], +} diff --git a/test/onnx/expect/TestOperators.test_acos.expect b/test/onnx/expect/TestOperators.test_acos.expect index 219f74f6b2fcb1..acecfeba820a25 100644 --- a/test/onnx/expect/TestOperators.test_acos.expect +++ b/test/onnx/expect/TestOperators.test_acos.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_broadcast.expect b/test/onnx/expect/TestOperators.test_add_broadcast.expect index 097cd600278781..4062b9ed9fe87b 100644 --- 
a/test/onnx/expect/TestOperators.test_add_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_broadcast.expect @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect index 5b6ade0101acfc..b78cf56b5d4723 100644 --- a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect index edad6530a8083f..181800f7bba55e 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect index 097cd600278781..4062b9ed9fe87b 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect index 832e076c2e3ad4..19557dc56aff51 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_addconstant.expect b/test/onnx/expect/TestOperators.test_addconstant.expect index 6aa04d81a0cbe0..5657e671e47653 100644 --- a/test/onnx/expect/TestOperators.test_addconstant.expect +++ b/test/onnx/expect/TestOperators.test_addconstant.expect @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_addmm.expect b/test/onnx/expect/TestOperators.test_addmm.expect index 972e2b07217a7d..deb017b257f321 100644 --- a/test/onnx/expect/TestOperators.test_addmm.expect +++ b/test/onnx/expect/TestOperators.test_addmm.expect @@ -100,5 +100,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_asin.expect b/test/onnx/expect/TestOperators.test_asin.expect index 2f9438488521da..230374f5c9b857 100644 --- a/test/onnx/expect/TestOperators.test_asin.expect +++ b/test/onnx/expect/TestOperators.test_asin.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_at_op.expect b/test/onnx/expect/TestOperators.test_at_op.expect index 61478b558dcfaf..b3e06b0be8fb9b 100644 --- a/test/onnx/expect/TestOperators.test_at_op.expect +++ b/test/onnx/expect/TestOperators.test_at_op.expect @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_atan.expect b/test/onnx/expect/TestOperators.test_atan.expect index b265c17a1a4a5b..6b5f71786aea5b 100644 --- a/test/onnx/expect/TestOperators.test_atan.expect +++ b/test/onnx/expect/TestOperators.test_atan.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 
} diff --git a/test/onnx/expect/TestOperators.test_basic.expect b/test/onnx/expect/TestOperators.test_basic.expect index 94b569e8f1b925..be9eb4647779a6 100644 --- a/test/onnx/expect/TestOperators.test_basic.expect +++ b/test/onnx/expect/TestOperators.test_basic.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm.expect b/test/onnx/expect/TestOperators.test_batchnorm.expect index 18d9e5d56ac6c2..0ddd04cd86141e 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm.expect @@ -159,5 +159,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect index c4be699a7111ac..88af65817b710b 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect @@ -167,5 +167,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect index d807b30800d398..a96638c7bf4c8f 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect @@ -147,5 +147,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_training.expect b/test/onnx/expect/TestOperators.test_batchnorm_training.expect index 79211b2b3cc3fa..1c53cfe0aa4a93 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_training.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_training.expect @@ -163,5 +163,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_chunk.expect b/test/onnx/expect/TestOperators.test_chunk.expect index ddd8ba28ec6517..cb31cfcbeb7cb9 100644 --- a/test/onnx/expect/TestOperators.test_chunk.expect +++ b/test/onnx/expect/TestOperators.test_chunk.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_clip.expect b/test/onnx/expect/TestOperators.test_clip.expect index 9581da849754a0..6a2d3e544b1e44 100644 --- a/test/onnx/expect/TestOperators.test_clip.expect +++ b/test/onnx/expect/TestOperators.test_clip.expect @@ -52,5 +52,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_clip_max.expect b/test/onnx/expect/TestOperators.test_clip_max.expect index 30160f26432abe..48f3ddc37a4e69 100644 --- a/test/onnx/expect/TestOperators.test_clip_max.expect +++ b/test/onnx/expect/TestOperators.test_clip_max.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_clip_min.expect b/test/onnx/expect/TestOperators.test_clip_min.expect index 015c474ebed9f4..55516c3f8f1aba 100644 --- a/test/onnx/expect/TestOperators.test_clip_min.expect +++ b/test/onnx/expect/TestOperators.test_clip_min.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_concat2.expect b/test/onnx/expect/TestOperators.test_concat2.expect index af84517fcdd867..3efe305e422abd 100644 --- a/test/onnx/expect/TestOperators.test_concat2.expect +++ b/test/onnx/expect/TestOperators.test_concat2.expect @@ -64,5 +64,5 @@ graph { } } 
opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_conv.expect b/test/onnx/expect/TestOperators.test_conv.expect index 9686a55318160e..bc07077b8a9535 100644 --- a/test/onnx/expect/TestOperators.test_conv.expect +++ b/test/onnx/expect/TestOperators.test_conv.expect @@ -117,5 +117,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_convtranspose.expect b/test/onnx/expect/TestOperators.test_convtranspose.expect index cdaf80988335a2..323dd426a4c510 100644 --- a/test/onnx/expect/TestOperators.test_convtranspose.expect +++ b/test/onnx/expect/TestOperators.test_convtranspose.expect @@ -123,5 +123,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_cos.expect b/test/onnx/expect/TestOperators.test_cos.expect index fc4d799ec0e87d..db4ddebecf2e6a 100644 --- a/test/onnx/expect/TestOperators.test_cos.expect +++ b/test/onnx/expect/TestOperators.test_cos.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect index 6bb3b42496c5c8..f80dabe1cdbb56 100644 --- a/test/onnx/expect/TestOperators.test_elu.expect +++ b/test/onnx/expect/TestOperators.test_elu.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index eb66a1149599f6..1577dae0fdd20d 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_exp.expect b/test/onnx/expect/TestOperators.test_exp.expect index 326273b565ac7a..5536e52b300402 100644 --- a/test/onnx/expect/TestOperators.test_exp.expect +++ b/test/onnx/expect/TestOperators.test_exp.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_flatten.expect b/test/onnx/expect/TestOperators.test_flatten.expect index 87075f3aeb1134..f127eebbbee288 100644 --- a/test/onnx/expect/TestOperators.test_flatten.expect +++ b/test/onnx/expect/TestOperators.test_flatten.expect @@ -192,5 +192,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 9e02d7d92e1bd6..63f1f3cc563951 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index a1958aeb7d1f1a..ab107f9e0233c5 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_hardtanh.expect b/test/onnx/expect/TestOperators.test_hardtanh.expect index 9581da849754a0..6a2d3e544b1e44 100644 --- a/test/onnx/expect/TestOperators.test_hardtanh.expect +++ b/test/onnx/expect/TestOperators.test_hardtanh.expect @@ -52,5 +52,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_index.expect 
b/test/onnx/expect/TestOperators.test_index.expect index 6d2a95332cbb94..577244747d9aef 100644 --- a/test/onnx/expect/TestOperators.test_index.expect +++ b/test/onnx/expect/TestOperators.test_index.expect @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index 881c199d949f3c..fb36f3449f2664 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_logsoftmax.expect b/test/onnx/expect/TestOperators.test_logsoftmax.expect index 650662a76685af..fcfa594b8ea881 100644 --- a/test/onnx/expect/TestOperators.test_logsoftmax.expect +++ b/test/onnx/expect/TestOperators.test_logsoftmax.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index 4edf94b2aa4e6c..901aa61666c7df 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_max.expect b/test/onnx/expect/TestOperators.test_max.expect index fda649076cb375..62378fbae6afe6 100644 --- a/test/onnx/expect/TestOperators.test_max.expect +++ b/test/onnx/expect/TestOperators.test_max.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_maxpool.expect b/test/onnx/expect/TestOperators.test_maxpool.expect index e788300b089586..3e7ce68b8d42c8 100644 --- a/test/onnx/expect/TestOperators.test_maxpool.expect +++ b/test/onnx/expect/TestOperators.test_maxpool.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_mean.expect b/test/onnx/expect/TestOperators.test_mean.expect index c575109faa8d86..5476ca2298cba6 100644 --- a/test/onnx/expect/TestOperators.test_mean.expect +++ b/test/onnx/expect/TestOperators.test_mean.expect @@ -47,5 +47,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_min.expect b/test/onnx/expect/TestOperators.test_min.expect index c0a51ca2bb7bf6..7b66f3661bd3d0 100644 --- a/test/onnx/expect/TestOperators.test_min.expect +++ b/test/onnx/expect/TestOperators.test_min.expect @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_mm.expect b/test/onnx/expect/TestOperators.test_mm.expect index 129981ea61a1d6..0dcede4964c410 100644 --- a/test/onnx/expect/TestOperators.test_mm.expect +++ b/test/onnx/expect/TestOperators.test_mm.expect @@ -83,5 +83,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_norm.expect b/test/onnx/expect/TestOperators.test_norm.expect index c53938f94a8fca..6b6ff1846fdeb4 100644 --- a/test/onnx/expect/TestOperators.test_norm.expect +++ b/test/onnx/expect/TestOperators.test_norm.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_pad.expect b/test/onnx/expect/TestOperators.test_pad.expect index f94b67f3616057..343230d4c02a66 100644 --- a/test/onnx/expect/TestOperators.test_pad.expect +++ 
b/test/onnx/expect/TestOperators.test_pad.expect @@ -71,5 +71,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_params.expect b/test/onnx/expect/TestOperators.test_params.expect index c4a718f940944e..91b64310a3006c 100644 --- a/test/onnx/expect/TestOperators.test_params.expect +++ b/test/onnx/expect/TestOperators.test_params.expect @@ -87,5 +87,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_permute2.expect b/test/onnx/expect/TestOperators.test_permute2.expect index 4e2e7e19d27b6f..d20f35b1077e04 100644 --- a/test/onnx/expect/TestOperators.test_permute2.expect +++ b/test/onnx/expect/TestOperators.test_permute2.expect @@ -76,5 +76,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_pow.expect b/test/onnx/expect/TestOperators.test_pow.expect index 595c51dae740d8..49bec073317701 100644 --- a/test/onnx/expect/TestOperators.test_pow.expect +++ b/test/onnx/expect/TestOperators.test_pow.expect @@ -77,5 +77,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_prod.expect b/test/onnx/expect/TestOperators.test_prod.expect index 2a01bf76bfbe1b..235dde0d4462de 100644 --- a/test/onnx/expect/TestOperators.test_prod.expect +++ b/test/onnx/expect/TestOperators.test_prod.expect @@ -47,5 +47,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean.expect b/test/onnx/expect/TestOperators.test_reduced_mean.expect index 7449a2a53c6fbb..6c846801887ee8 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect index 45d4eaa4663882..dd4cbb90efbba7 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod.expect b/test/onnx/expect/TestOperators.test_reduced_prod.expect index d835c77b86c209..3dbabb1fcb0890 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect index 138e39ec9b5134..fe612211c7c8cb 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum.expect b/test/onnx/expect/TestOperators.test_reduced_sum.expect index b602edbb2fc451..b42d577db7f534 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum.expect @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect index 
0e4bf5da96be3a..9e238ad24da393 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect @@ -64,5 +64,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_repeat.expect b/test/onnx/expect/TestOperators.test_repeat.expect index eb95f268857c5d..d8e9337890f3f1 100644 --- a/test/onnx/expect/TestOperators.test_repeat.expect +++ b/test/onnx/expect/TestOperators.test_repeat.expect @@ -68,5 +68,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index 33b15851531c09..235a5a13d42d36 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -81,5 +81,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_rsub.expect b/test/onnx/expect/TestOperators.test_rsub.expect index 9e6c112c8768c4..49fa976a5e1bd6 100644 --- a/test/onnx/expect/TestOperators.test_rsub.expect +++ b/test/onnx/expect/TestOperators.test_rsub.expect @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_selu.expect b/test/onnx/expect/TestOperators.test_selu.expect index 92af508521a538..117d25a0d19192 100644 --- a/test/onnx/expect/TestOperators.test_selu.expect +++ b/test/onnx/expect/TestOperators.test_selu.expect @@ -54,5 +54,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_sin.expect b/test/onnx/expect/TestOperators.test_sin.expect index a6d733d7b7e031..32ceff124dde7d 100644 --- a/test/onnx/expect/TestOperators.test_sin.expect +++ b/test/onnx/expect/TestOperators.test_sin.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_slice.expect b/test/onnx/expect/TestOperators.test_slice.expect index 5965137e8de6ec..0e24cafa6e5c97 100644 --- a/test/onnx/expect/TestOperators.test_slice.expect +++ b/test/onnx/expect/TestOperators.test_slice.expect @@ -77,5 +77,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_sqrt.expect b/test/onnx/expect/TestOperators.test_sqrt.expect index 5b56bea7eb24b3..5344a0aff20c46 100644 --- a/test/onnx/expect/TestOperators.test_sqrt.expect +++ b/test/onnx/expect/TestOperators.test_sqrt.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_sum.expect b/test/onnx/expect/TestOperators.test_sum.expect index dff021eb26af16..caba3c16ba70a0 100644 --- a/test/onnx/expect/TestOperators.test_sum.expect +++ b/test/onnx/expect/TestOperators.test_sum.expect @@ -47,5 +47,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_symbolic_override.expect b/test/onnx/expect/TestOperators.test_symbolic_override.expect index f4e6965a6cafbb..6ccb4ff03f8d5d 100644 --- a/test/onnx/expect/TestOperators.test_symbolic_override.expect +++ b/test/onnx/expect/TestOperators.test_symbolic_override.expect @@ -99,5 +99,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect b/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect index 
757609e63740f2..6bec0381a55fa0 100644 --- a/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect +++ b/test/onnx/expect/TestOperators.test_symbolic_override_nested.expect @@ -100,5 +100,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_tan.expect b/test/onnx/expect/TestOperators.test_tan.expect index 58b7ff689b9930..e36c718bb00bc4 100644 --- a/test/onnx/expect/TestOperators.test_tan.expect +++ b/test/onnx/expect/TestOperators.test_tan.expect @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_transpose.expect b/test/onnx/expect/TestOperators.test_transpose.expect index f2c634daebf589..394bb253f4e3f6 100644 --- a/test/onnx/expect/TestOperators.test_transpose.expect +++ b/test/onnx/expect/TestOperators.test_transpose.expect @@ -37,5 +37,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_type_as.expect b/test/onnx/expect/TestOperators.test_type_as.expect index 5107617395e3cb..bdc6dd44a3f97b 100644 --- a/test/onnx/expect/TestOperators.test_type_as.expect +++ b/test/onnx/expect/TestOperators.test_type_as.expect @@ -31,5 +31,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect index 3a8e01092f8d0b..8ecb4314d347f0 100644 --- a/test/onnx/expect/TestOperators.test_unsqueeze.expect +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -50,5 +50,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_upsample.expect b/test/onnx/expect/TestOperators.test_upsample.expect index 8e7a6a8fa1f7a7..7cc176ff57b8dc 100644 --- a/test/onnx/expect/TestOperators.test_upsample.expect +++ b/test/onnx/expect/TestOperators.test_upsample.expect @@ -67,5 +67,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/expect/TestOperators.test_view.expect b/test/onnx/expect/TestOperators.test_view.expect index 1354b4271067af..be69ffb4cdab95 100644 --- a/test/onnx/expect/TestOperators.test_view.expect +++ b/test/onnx/expect/TestOperators.test_view.expect @@ -44,5 +44,5 @@ graph { } } opset_import { - version: 7 + version: 9 } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 1a9e9cdbff1442..a889e7b1fa0f89 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -127,13 +127,14 @@ def convert_cuda(self, model, input): return cuda_model, cuda_input def run_debug_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True): + input=None, use_gpu=True, example_outputs=None): """ # TODO: remove this from the final release version This test is for our debugging only for the case where embed_params=False """ - model.train(train) + if not isinstance(model, torch.jit.ScriptModule): + model.train(train) if state_dict is not None: model.load_state_dict(state_dict) @@ -144,7 +145,8 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, if use_gpu: model, input = self.convert_cuda(model, input) - onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False) + onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False, + example_outputs=example_outputs) if isinstance(torch_out, torch.autograd.Variable): torch_out = (torch_out,) @@ -153,12 +155,14 
@@ def run_debug_test(self, model, train, batch_size, state_dict=None, np.testing.assert_almost_equal(x.data.cpu().numpy(), y, decimal=3) def run_actual_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7): + input=None, use_gpu=True, rtol=0.001, atol=1e-7, + example_outputs=None): """ This is what the user facing version will look like """ # set the training/test mode for the model - model.train(train) + if not isinstance(model, torch.jit.ScriptModule): + model.train(train) # use the pre-trained model params if available if state_dict is not None: model.load_state_dict(state_dict) @@ -175,14 +179,16 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, verify.verify(model, input, c2, rtol=rtol, atol=atol) def run_model_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7): + input=None, use_gpu=True, rtol=0.001, atol=1e-7, + example_outputs=None): use_gpu_ = torch.cuda.is_available() and use_gpu if self.embed_params: self.run_actual_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, rtol=rtol, atol=atol) + use_gpu=use_gpu_, rtol=rtol, atol=atol, + example_outputs=example_outputs) else: self.run_debug_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_) + use_gpu=use_gpu_, example_outputs=example_outputs) def test_linear(self): model = nn.Linear(1, 1) @@ -346,11 +352,11 @@ def test_rnn_init_predict_split(self): mp = onnx.ModelProto.FromString(do_export(model, input, export_params=self.embed_params)[0]) prepared = c2.prepare(mp, device='CPU') if self.embed_params: - assert len(prepared.init_net.op) == 1019 - assert len(prepared.predict_net.op) == 142 + assert len(prepared.init_net.op) == 875 + assert len(prepared.predict_net.op) == 130 else: assert len(prepared.init_net.op) == 8 - assert len(prepared.predict_net.op) == 1153 + assert len(prepared.predict_net.op) == 997 def test_alexnet(self): state_dict = model_zoo.load_url(model_urls['alexnet'], progress=False) @@ -891,6 +897,28 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + def test_dynamic_slice(self): + class DynamicSliceExportMod(torch.nn.Module): + def forward(self, x): + results = [] + for i in range(4): + results.append(x[:x.size(0) - i, i:x.size(2), i:3]) + return tuple(results) + + x = torch.rand(5, 5, 5) + self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + + def test_dynamic_slice_to_the_end(self): + class DynamicSliceExportMod(torch.nn.Module): + def forward(self, x): + results = [] + for i in range(4): + results.append(x[:, i:, x.size(2) - 5]) + return tuple(results) + + x = torch.rand(5, 5, 5) + self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + # a bit of metaprogramming to set up all the rnn tests diff --git a/test/test_jit.py b/test/test_jit.py index f6a4347eae86cb..54bc33e8542e43 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6870,6 +6870,25 @@ def foo(x): self.assertEqual(foo(input), input) + def test_export_dynamic_slice(self): + class DynamicSliceExportMod(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + retval = x[0] + for i in range(x.size(1)): + retval += torch.sum(x[0:i], dim=0) + return retval + + mod = DynamicSliceExportMod() + + input = torch.rand(3, 4, 5) + example_outs = mod(input) + + f = 
io.BytesIO() + exported = torch.onnx.export_to_pretty_string( + DynamicSliceExportMod(), (input,), f, example_outputs=example_outs) + self.assertExpected(exported) + class MnistNet(nn.Module): def __init__(self): diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 30e8672be601fd..d5b586c3846d6f 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -151,7 +151,7 @@ def _unimplemented(op, msg): # increasing this number. This includes symbolic definitions NOT in this # file, so grep for "OpName" (with quotes) -_onnx_opset_version = 7 +_onnx_opset_version = 9 # --------------------------------------------------------------------- @@ -981,11 +981,21 @@ def full_like(g, input, fill_value): return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1))) -@parse_args('v', 'i', 'i', 'i', 'i') +@parse_args('v', 'v', 'v', 'v', 'i') def slice(g, self, dim, start, end, step): if step != 1: _unimplemented("slice", "step!=1 is currently not supported") - return g.op("Slice", self, axes_i=[dim], starts_i=[start], ends_i=[end]) + if start.node().kind() != 'onnx::Constant' or \ + end.node().kind() != 'onnx::Constant' or dim.node().kind() != 'onnx::Constant': + start_unsqueezed = g.op("Unsqueeze", start, axes_i=[0]) + end_unsqueezed = g.op("Unsqueeze", end, axes_i=[0]) + dim_unsqueezed = g.op("Unsqueeze", dim, axes_i=[0]) + return g.op("DynamicSlice", self, start_unsqueezed, end_unsqueezed, dim_unsqueezed) + else: + start = _parse_arg(start, 'i') + end = _parse_arg(end, 'i') + dim = _parse_arg(dim, 'i') + return g.op("Slice", self, axes_i=[dim], starts_i=[start], ends_i=[end]) @parse_args('v', 'f', 'f') From 90537289a04ef5d572496240e2ac3a881be518d2 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 13 Sep 2018 12:41:11 -0700 Subject: [PATCH 129/237] Constexpr std::move / std::forward for C++11 (#11396) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11396 std::move and std::forward in C++11 aren't constexpr (they are in C++14). This caused a build issue orionr was working on. 
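As an aside, a minimal sketch (not part of this patch; the helper name below is illustrative, the patch's real implementations live under c10::guts) of the C++11 limitation being worked around:

```
// C++11: std::move is not declared constexpr, so a constexpr function that
// calls it cannot be used in a constant expression. A hand-rolled cast-based
// "move" sidesteps that; the patch adds such helpers under c10::guts.
#include <type_traits>

template <class T>
constexpr typename std::remove_reference<T>::type&& constexpr_move(T&& t) noexcept {
  return static_cast<typename std::remove_reference<T>::type&&>(t);
}

// Usable in constant expressions even under -std=c++11.
constexpr int pass_through(int x) {
  return constexpr_move(x);
}

static_assert(pass_through(42) == 42, "evaluated at compile time");
```

This is also why the Array_test.cpp hunk below can turn the gtest ASSERT_EQ checks for prepend into static_asserts.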
It should be fixed by this diff Reviewed By: orionr Differential Revision: D9724805 fbshipit-source-id: 0d9047dce611385d659cc71a6c04cc7a6a40a5ae --- aten/src/ATen/core/C++17.h | 82 ++++++++++++++++++++++++------------- caffe2/utils/Array.h | 6 +-- caffe2/utils/Array_test.cpp | 8 ++-- 3 files changed, 60 insertions(+), 36 deletions(-) diff --git a/aten/src/ATen/core/C++17.h b/aten/src/ATen/core/C++17.h index 5964a9dea36b2f..ac5b3022ed5ff1 100644 --- a/aten/src/ATen/core/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -12,6 +12,53 @@ namespace c10 { namespace guts { + + +#ifdef __cpp_lib_transformation_trait_aliases +template using conditional_t = std::conditional_t; +template using enable_if_t = std::enable_if_t; +template using add_lvalue_reference_t = std::add_lvalue_reference_t; +template using remove_reference_t = std::remove_reference_t; +template using remove_cv_t = std::remove_cv_t; +template using result_of_t = std::result_of_t; +template using decay_t = std::decay_t; +template using remove_const_t = std::remove_const_t; +template using remove_pointer_t = std::remove_pointer_t; +#else +template using conditional_t = typename std::conditional::type; +template using enable_if_t = typename std::enable_if::type; +template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; +template using remove_reference_t = typename std::remove_reference::type; +template using remove_cv_t = typename std::remove_cv::type; +template using result_of_t = typename std::result_of::type; +template using decay_t = typename std::decay::type; +template using remove_const_t = typename std::remove_const::type; +template using remove_pointer_t = typename std::remove_pointer::type; +#endif + + + + +// C++11 doesn't have constexpr std::move / std::forward. +// Implementation taken from libc++. +template +constexpr inline guts::remove_reference_t&& move(T&& t) noexcept { + return static_cast&&>(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t& t) noexcept { + return static_cast(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, + "can not forward an rvalue as an lvalue."); + return static_cast(t); +} + + + + #if __cplusplus >= 201402L || defined(__cpp_lib_make_unique) && __cpp_lib_make_unique >= 201304L || \ (defined(__ANDROID__) && __ANDROID__ && __cplusplus >= 201300L) || defined(_MSC_VER) && _MSC_VER >= 1900 @@ -23,7 +70,7 @@ namespace c10 { namespace guts { template typename std::enable_if::value, std::unique_ptr>::type make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); + return std::unique_ptr(new T(forward(args)...)); } // Allows 'make_unique(10)'. (N3690 s20.9.1.4 p3-4) template @@ -39,6 +86,7 @@ make_unique(Args&&...) 
= delete; #endif + #ifdef __cpp_lib_integer_sequence template using integer_sequence = std::integer_sequence; @@ -73,28 +121,6 @@ template using index_sequence_for = make_index_sequence using conditional_t = std::conditional_t; -template using enable_if_t = std::enable_if_t; -template using add_lvalue_reference_t = std::add_lvalue_reference_t; -template using remove_reference_t = std::remove_reference_t; -template using remove_cv_t = std::remove_cv_t; -template using result_of_t = std::result_of_t; -template using decay_t = std::decay_t; -template using remove_const_t = std::remove_const_t; -template using remove_pointer_t = std::remove_pointer_t; -#else -template using conditional_t = typename std::conditional::type; -template using enable_if_t = typename std::enable_if::type; -template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; -template using remove_reference_t = typename std::remove_reference::type; -template using remove_cv_t = typename std::remove_cv::type; -template using result_of_t = typename std::result_of::type; -template using decay_t = typename std::decay::type; -template using remove_const_t = typename std::remove_const::type; -template using remove_pointer_t = typename std::remove_pointer::type; -#endif - #ifdef __cpp_lib_logical_traits @@ -155,7 +181,7 @@ template using void_t = typename make_void::type; template inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(std::forward(f), std::forward(t)); + return std::apply(forward(f), forward(t)); } #else @@ -164,19 +190,19 @@ inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { // TODO This is an incomplete implementation of std::apply, not working for member functions. namespace detail { template -constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(std::forward(f)(std::get(std::forward(t))...)) +constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(forward(f)(std::get(forward(t))...)) { - return std::forward(f)(std::get(std::forward(t))...); + return forward(f)(std::get(forward(t))...); } } // namespace detail template constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{})) { return detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{}); } diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 55c4a6a6e50af7..3217198dd8cdf3 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -259,7 +259,7 @@ template constexpr _Tp&& get(array<_Tp, _Nm>&& __arr) noexcept { static_assert(_Int < _Nm, "array index is within bounds"); - return std::move(get<_Int>(__arr)); + return guts::move(get<_Int>(__arr)); } template @@ -292,12 +292,12 @@ constexpr inline array tail(const array& arg) { namespace detail { template constexpr inline array prepend_(T&& head, const array& tail, guts::index_sequence) { - return {{std::forward(head), get(tail)...}}; + return {{guts::forward(head), get(tail)...}}; } } template constexpr inline array prepend(T&& head, const array& tail) { - return detail::prepend_(std::forward(head), tail, guts::make_index_sequence()); + return detail::prepend_(guts::forward(head), tail, guts::make_index_sequence()); } /** diff --git a/caffe2/utils/Array_test.cpp b/caffe2/utils/Array_test.cpp index 1d8c290b8a2249..1f3171ebe88eb0 100644 --- a/caffe2/utils/Array_test.cpp +++ b/caffe2/utils/Array_test.cpp @@ -78,11 +78,9 @@ namespace 
test_tail { static_assert(array < int, 0 > {{}} == tail(array < int, 1 > {{3}}), ""); } -TEST(ArrayTest, TestPrepend) { - // Some compilers can't handle move results as constexpr, so use - // gtest assert for this test - ASSERT_EQ((array {{2, 3, 4}}), (prepend(2, array {{3, 4}}))); - ASSERT_EQ((array {{3}}), (prepend(3, array {{}}))); +namespace test_prepend { + static_assert(array < int, 3 > {{2, 3, 4}} == prepend(2, array < int, 2 > {{3, 4}}), ""); + static_assert(array < int, 1 > {{3}} == prepend(3, array < int, 0 > {{}}), ""); } namespace test_to_std_array { From f129da1a47274615bce094e949b77c8791de9a3b Mon Sep 17 00:00:00 2001 From: Marc Ferradou Date: Thu, 13 Sep 2018 14:38:17 -0700 Subject: [PATCH 130/237] Add max to the ValueError for EmbeddingBag mode check (#11655) Summary: Related to #11624 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11655 Differential Revision: D9815454 Pulled By: SsnL fbshipit-source-id: 8dd82e0c0aa68362e12b301e095a85af7d7fd71a --- torch/nn/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 690112424db560..243974e152bb27 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1316,7 +1316,7 @@ def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2, raise ValueError("max mode does not support sparse weights") else: - raise ValueError("mode has to be one of sum or mean") + raise ValueError("mode has to be one of sum, mean or max") if max_norm is not None: with torch.no_grad(): From 29e29ca6ee82ae40da6142faeba3bb6f78a17b63 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Thu, 13 Sep 2018 14:53:23 -0700 Subject: [PATCH 131/237] Use MPI_Isend/MPI_Irecv to back send/recv (#11630) Summary: The isCompleted function is changed to be non-const to accommodate setting some internal status on the work object in the case of completion. Previously, it was only checking a member field, but for the MPI backend it calls MPI_Test to poll for completion of an asynchronous request. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11630 Reviewed By: SsnL Differential Revision: D9808008 Pulled By: pietern fbshipit-source-id: 18b70825b1fb4d561a552fa75e9475a522852cd4 --- torch/lib/c10d/ProcessGroup.hpp | 2 +- torch/lib/c10d/ProcessGroupGloo.cpp | 6 +- torch/lib/c10d/ProcessGroupGloo.hpp | 6 +- torch/lib/c10d/ProcessGroupMPI.cpp | 175 ++++++++++++++++++++-------- torch/lib/c10d/ProcessGroupMPI.hpp | 24 +++- torch/lib/c10d/ProcessGroupNCCL.cpp | 2 +- torch/lib/c10d/ProcessGroupNCCL.hpp | 2 +- 7 files changed, 156 insertions(+), 61 deletions(-) diff --git a/torch/lib/c10d/ProcessGroup.hpp b/torch/lib/c10d/ProcessGroup.hpp index f312ed779e43ae..2ac2948457a7f6 100644 --- a/torch/lib/c10d/ProcessGroup.hpp +++ b/torch/lib/c10d/ProcessGroup.hpp @@ -38,7 +38,7 @@ class ProcessGroup { virtual ~Work(); // Checks if request has completed. Non-blocking operation. - virtual bool isCompleted() const = 0; + virtual bool isCompleted() = 0; // Returns if the work completed successfully. // If false, the exception function can be called to get details.
diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 124b4d4a9d0cc4..e110a78001c134 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -145,7 +145,7 @@ ProcessGroupGloo::WorkGloo::WorkGloo() ProcessGroupGloo::WorkGloo::~WorkGloo() {} -bool ProcessGroupGloo::WorkGloo::isCompleted() const { +bool ProcessGroupGloo::WorkGloo::isCompleted() { return completed_; } @@ -222,7 +222,7 @@ ProcessGroupGloo::SendWork::SendWork( std::unique_ptr<::gloo::transport::UnboundBuffer> buffer) : tensor_(tensor), buffer_(std::move(buffer)) {} -bool ProcessGroupGloo::SendWork::isCompleted() const { +bool ProcessGroupGloo::SendWork::isCompleted() { // No way to poll for completion yet return true; } @@ -252,7 +252,7 @@ ProcessGroupGloo::RecvWork::RecvWork( int* srcRank) : tensor_(tensor), buffer_(std::move(buffer)), srcRank_(srcRank) {} -bool ProcessGroupGloo::RecvWork::isCompleted() const { +bool ProcessGroupGloo::RecvWork::isCompleted() { // No way to poll for completion yet return true; } diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 0fe69250b31528..e78b1e36f8b94e 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -177,7 +177,7 @@ class ProcessGroupGloo : public ProcessGroup { explicit WorkGloo(); virtual ~WorkGloo(); - bool isCompleted() const override; + bool isCompleted() override; bool isSuccess() const override; void synchronize() override; bool wait() override; @@ -235,7 +235,7 @@ class ProcessGroupGloo : public ProcessGroup { virtual ~SendWork() = default; - bool isCompleted() const override; + bool isCompleted() override; bool isSuccess() const override; @@ -259,7 +259,7 @@ class ProcessGroupGloo : public ProcessGroup { virtual ~RecvWork() = default; - bool isCompleted() const override; + bool isCompleted() override; bool isSuccess() const override; diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 31578089677acd..39e5e5b609c1e8 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -93,7 +93,7 @@ ProcessGroupMPI::WorkMPI::WorkMPI() : completed_(false) {} ProcessGroupMPI::WorkMPI::~WorkMPI() {} -bool ProcessGroupMPI::WorkMPI::isCompleted() const { +bool ProcessGroupMPI::WorkMPI::isCompleted() { return completed_; } @@ -137,6 +137,78 @@ const std::exception& ProcessGroupMPI::WorkMPI::exception() const { } } +ProcessGroupMPI::AsyncWork::AsyncWork( + at::Tensor tensor, + MPI_Request request, + int* srcRank) + : tensor_(std::move(tensor)), request_(request), srcRank_(srcRank) { + memset(&status_, 0, sizeof(status_)); +} + +ProcessGroupMPI::AsyncWork::~AsyncWork() { + if (request_ != MPI_REQUEST_NULL) { + throw std::runtime_error( + "Attempted destruction of AsyncWork before work has completed"); + } +} + +bool ProcessGroupMPI::AsyncWork::isCompleted() { + if (request_ == MPI_REQUEST_NULL) { + return true; + } + + std::unique_lock globalLock(pgGlobalMutex_); + int flag = 0; + MPI_CHECK(MPI_Test(&request_, &flag, &status_)); + if (request_ != MPI_REQUEST_NULL) { + return false; + } + + // request_ == MPI_REQUEST_NULL; the work has completed + if (srcRank_ != nullptr) { + *srcRank_ = status_.MPI_SOURCE; + } + + return true; +} + +bool ProcessGroupMPI::AsyncWork::isSuccess() const { + if (request_ != MPI_REQUEST_NULL) { + throw std::runtime_error( + "Invalid call to AsyncWork::isSuccess before work has completed"); + } + + return status_.MPI_ERROR == MPI_SUCCESS; 
+} + +void ProcessGroupMPI::AsyncWork::synchronize() {} + +bool ProcessGroupMPI::AsyncWork::wait() { + if (request_ == MPI_REQUEST_NULL) { + return true; + } + + std::unique_lock globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Wait(&request_, &status_)); + if (srcRank_ != nullptr && status_.MPI_ERROR == MPI_SUCCESS) { + *srcRank_ = status_.MPI_SOURCE; + } + + return status_.MPI_ERROR == MPI_SUCCESS; +} + +const std::exception& ProcessGroupMPI::AsyncWork::exception() const { + if (request_ != MPI_REQUEST_NULL) { + throw std::runtime_error( + "Invalid call to AsyncWork::exception before work has completed"); + } + + std::array buf; + int len = buf.size(); + MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len)); + return std::runtime_error(std::string(buf.data(), len)); +} + // Static global states int ProcessGroupMPI::numProcessGroups_ = 0; int ProcessGroupMPI::mpiThreadSupport_ = 0; @@ -581,23 +653,25 @@ std::shared_ptr ProcessGroupMPI::send( if (pgComm_ == MPI_COMM_NULL) { return nullptr; } + checkSingleTensor(tensors); - std::function&)> runFunc = - [dstRank, this](std::unique_ptr& entry) { - auto data = (entry->src)[0]; - std::unique_lock globalLock(pgGlobalMutex_); - MPI_CHECK(MPI_Send( - data.data_ptr(), - data.numel(), - mpiDatatype.at(data.type().scalarType()), - dstRank, - 0, - pgComm_)); - }; - auto entry = std::unique_ptr( - new WorkEntry(&tensors, nullptr, std::move(runFunc))); - return enqueue(std::move(entry)); + auto& tensor = tensors[0]; + MPI_Request request = MPI_REQUEST_NULL; + + { + std::unique_lock globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Isend( + tensor.data_ptr(), + tensor.numel(), + mpiDatatype.at(tensor.type().scalarType()), + dstRank, + 0, + pgComm_, + &request)); + } + + return std::make_shared(tensor, request); } std::shared_ptr ProcessGroupMPI::recv( @@ -606,24 +680,25 @@ std::shared_ptr ProcessGroupMPI::recv( if (pgComm_ == MPI_COMM_NULL) { return nullptr; } + checkSingleTensor(tensors); - std::function&)> runFunc = - [srcRank, this](std::unique_ptr& entry) { - auto data = (entry->src)[0]; - std::unique_lock globalLock(pgGlobalMutex_); - MPI_CHECK(MPI_Recv( - data.data_ptr(), - data.numel(), - mpiDatatype.at(data.type().scalarType()), - srcRank, - 0, - pgComm_, - MPI_STATUS_IGNORE)); - }; - auto entry = std::unique_ptr( - new WorkEntry(&tensors, nullptr, std::move(runFunc))); - return enqueue(std::move(entry)); + auto& tensor = tensors[0]; + MPI_Request request = MPI_REQUEST_NULL; + + { + std::unique_lock globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Irecv( + tensor.data_ptr(), + tensor.numel(), + mpiDatatype.at(tensor.type().scalarType()), + srcRank, + 0, + pgComm_, + &request)); + } + + return std::make_shared(tensor, request); } std::shared_ptr ProcessGroupMPI::recvAnysource( @@ -632,27 +707,25 @@ std::shared_ptr ProcessGroupMPI::recvAnysource( if (pgComm_ == MPI_COMM_NULL) { return nullptr; } + checkSingleTensor(tensors); - std::function&)> runFunc = - [srcRank, this](std::unique_ptr& entry) { - auto data = (entry->src)[0]; - MPI_Status status; - std::unique_lock globalLock(pgGlobalMutex_); - MPI_CHECK(MPI_Recv( - data.data_ptr(), - data.numel(), - mpiDatatype.at(data.type().scalarType()), - MPI_ANY_SOURCE, - 0, - pgComm_, - &status)); - *(entry->srcRank) = status.MPI_SOURCE; - }; - auto entry = std::unique_ptr( - new WorkEntry(&tensors, nullptr, std::move(runFunc))); - entry->srcRank = srcRank; - return enqueue(std::move(entry)); + auto& tensor = tensors[0]; + MPI_Request request = MPI_REQUEST_NULL; + + { + std::unique_lock 
globalLock(pgGlobalMutex_); + MPI_CHECK(MPI_Irecv( + tensor.data_ptr(), + tensor.numel(), + mpiDatatype.at(tensor.type().scalarType()), + MPI_ANY_SOURCE, + 0, + pgComm_, + &request)); + } + + return std::make_shared(tensor, request, srcRank); } std::shared_ptr ProcessGroupMPI::barrier() { diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index b7989aeba69e32..4a72570759ce91 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -78,7 +78,7 @@ class ProcessGroupMPI : public ProcessGroup { virtual ~WorkMPI(); // Checks if request has completed. Non-blocking operation. - bool isCompleted() const override; + bool isCompleted() override; // Returns if the work completed successfully // if false, the exception function can be called to get details. @@ -107,6 +107,28 @@ class ProcessGroupMPI : public ProcessGroup { friend class ProcessGroupMPI; }; + class AsyncWork : public ProcessGroup::Work { + public: + AsyncWork(at::Tensor tensor, MPI_Request request, int* srcRank = nullptr); + virtual ~AsyncWork(); + + bool isCompleted() override; + + bool isSuccess() const override; + + void synchronize() override; + + bool wait() override; + + const std::exception& exception() const override; + + protected: + at::Tensor tensor_; + MPI_Request request_; + int* const srcRank_; + MPI_Status status_; + }; + // Constructor will spawn up the worker thread loop explicit ProcessGroupMPI(int rank, int size, MPI_Comm pgComm); diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index a28757ede24c3e..157a9917f6803e 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -100,7 +100,7 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const std::vector& devices) ProcessGroupNCCL::WorkNCCL::~WorkNCCL() {} // Check if the NCCL kernels are queued on the GPUs -bool ProcessGroupNCCL::WorkNCCL::isCompleted() const { +bool ProcessGroupNCCL::WorkNCCL::isCompleted() { return true; } diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 282c679f0dd1e8..d3862fc044321d 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -66,7 +66,7 @@ class ProcessGroupNCCL : public ProcessGroup { // Checks if request has completed. In this specific case of NCCL, it checks // if the NCCL operation has completed on the GPU in its own NCCL stream. // Non-blocking operation. - bool isCompleted() const override; + bool isCompleted() override; // Let current THC stream wait on the completing of the NCCL work // always return true and will throw if there are exceptions From 05e06f7de2c84b8dd770c2f5008f5b7620e97d39 Mon Sep 17 00:00:00 2001 From: Jeff Smith Date: Thu, 13 Sep 2018 15:04:43 -0700 Subject: [PATCH 132/237] migrating deprecated calls without abc module for containers (#11515) Summary: Implementing #10540. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11515 Reviewed By: apaszke Differential Revision: D9771045 Pulled By: jeffreyksmithjr fbshipit-source-id: 85ea39abaa9b465805a969f122b626b11fc85ef6 --- caffe2/python/compatibility.py | 8 ++++++++ caffe2/python/onnx/backend.py | 3 ++- caffe2/python/onnx/frontend.py | 4 ++-- caffe2/python/utils.py | 4 ++-- test/test_legacy_nn.py | 4 ++-- torch/_six.py | 7 +++++++ torch/autograd/gradcheck.py | 8 ++++---- torch/jit/__init__.py | 2 +- torch/nn/modules/container.py | 19 ++++++++++--------- torch/nn/modules/utils.py | 4 ++-- torch/onnx/utils.py | 4 ++-- torch/optim/optimizer.py | 5 +++-- torch/utils/data/dataloader.py | 10 +++++----- 13 files changed, 50 insertions(+), 32 deletions(-) create mode 100644 caffe2/python/compatibility.py diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py new file mode 100644 index 00000000000000..9d615a30833371 --- /dev/null +++ b/caffe2/python/compatibility.py @@ -0,0 +1,8 @@ +from six import PY2, PY3 + +if PY2: + import collections + container_abcs = collections +elif PY3: + import collections.abc + container_abcs = collections.abc diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 3d9239c8b5c92b..7eacaf327ad264 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -25,6 +25,7 @@ import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell +from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils @@ -778,7 +779,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, collections.Iterable): + if not isinstance(ops, container_abcs.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 5fd470c932ac59..379ef65af904a6 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -12,11 +12,11 @@ from __future__ import unicode_literals import itertools -import collections import logging import re from caffe2.python import core as caffe2_core +from caffe2.python.compatibility import container_abcs from caffe2.proto import caffe2_legacy_pb2 from enum import Enum from onnx import (defs, checker, helper, numpy_helper, mapping, @@ -156,7 +156,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, collections.Iterable): + if not isinstance(nodes, container_abcs.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 75124add41cecd..5e87df8058e017 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,13 +6,13 @@ from __future__ import unicode_literals from caffe2.proto import caffe2_pb2 +from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys import copy -import collections import functools import numpy as np from six import integer_types, binary_type, text_type, string_types @@ -120,7 +120,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name 
= key - iterable = isinstance(value, collections.Iterable) + iterable = isinstance(value, container_abcs.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index b446920c4fec65..f3a807a0a6d640 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -1,10 +1,10 @@ import math import random import unittest -import collections from copy import deepcopy import torch +from torch._six import container_abcs import torch.legacy.nn as nn from common import to_gpu, freeze_rng_state, run_tests, skipIfRocm, TEST_WITH_ROCM from common_nn import NNTestCase, ModuleTest, CriterionTest, iter_tensors, \ @@ -701,7 +701,7 @@ def require_grad(input): input = input.detach() input.requires_grad = True return input - elif isinstance(input, collections.Iterable): + elif isinstance(input, container_abcs.Iterable): return type(input)(require_grad(e) for e in input) return input diff --git a/torch/_six.py b/torch/_six.py index 1d70df51830d5e..84ba9a464891bb 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -108,3 +108,10 @@ def exec_(_code_, _globs_=None, _locs_=None): else: def raise_from(value, from_value): raise value + +if PY2: + import collections + container_abcs = collections +elif PY3: + import collections.abc + container_abcs = collections.abc diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 2cc4ebbfacd4a3..26dc9daf4a7350 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -1,5 +1,5 @@ import torch -from collections import Iterable +from torch._six import container_abcs import torch.testing import sys from itertools import product @@ -11,7 +11,7 @@ def zero_gradients(x): if x.grad is not None: x.grad.detach_() x.grad.data.zero_() - elif isinstance(x, Iterable): + elif isinstance(x, container_abcs.Iterable): for elem in x: zero_gradients(elem) @@ -23,7 +23,7 @@ def make_jacobian(input, num_out): if not input.requires_grad: return None return torch.zeros(input.nelement(), num_out, dtype=input.dtype) - elif isinstance(input, Iterable): + elif isinstance(input, container_abcs.Iterable): jacobians = list(filter( lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input))) if not jacobians: @@ -37,7 +37,7 @@ def iter_tensors(x, only_requiring_grad=False): if isinstance(x, torch.Tensor): if x.requires_grad or not only_requiring_grad: yield x - elif isinstance(x, Iterable): + elif isinstance(x, container_abcs.Iterable): for elem in x: for result in iter_tensors(elem, only_requiring_grad): yield result diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index b32edf715a6c00..e4543a6be12b87 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -6,7 +6,7 @@ import torch.jit.annotations from torch._six import raise_from, with_metaclass import torch.testing -from collections import defaultdict, OrderedDict, namedtuple, Iterable +from collections import defaultdict, OrderedDict, namedtuple import sys import warnings import itertools diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index ef79b1ffb6a289..01e12e621ba4ac 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -1,5 +1,6 @@ import warnings -from collections import OrderedDict, Iterable, Mapping +from collections import OrderedDict +from torch._six import container_abcs from itertools import islice import operator @@ -178,7 +179,7 @@ def 
extend(self, modules): Arguments: modules (iterable): iterable of modules to append """ - if not isinstance(modules, Iterable): + if not isinstance(modules, container_abcs.Iterable): raise TypeError("ModuleList.extend should be called with an " "iterable, but got " + type(modules).__name__) offset = len(self) @@ -278,12 +279,12 @@ def update(self, modules): modules (iterable): a mapping (dictionary) of (string: :class:`~torch.nn.Module``) or an iterable of key/value pairs of type (string, :class:`~torch.nn.Module``) """ - if not isinstance(modules, Iterable): + if not isinstance(modules, container_abcs.Iterable): raise TypeError("ModuleDict.update should be called with an " "iterable of key/value pairs, but got " + type(modules).__name__) - if isinstance(modules, Mapping): + if isinstance(modules, container_abcs.Mapping): if isinstance(modules, OrderedDict): for key, module in modules.items(): self[key] = module @@ -292,7 +293,7 @@ def update(self, modules): self[key] = module else: for j, m in enumerate(modules): - if not isinstance(m, Iterable): + if not isinstance(m, container_abcs.Iterable): raise TypeError("ModuleDict update sequence element " "#" + str(j) + " should be Iterable; is" + type(m).__name__) @@ -375,7 +376,7 @@ def extend(self, parameters): Arguments: parameters (iterable): iterable of parameters to append """ - if not isinstance(parameters, Iterable): + if not isinstance(parameters, container_abcs.Iterable): raise TypeError("ParameterList.extend should be called with an " "iterable, but got " + type(parameters).__name__) offset = len(self) @@ -483,12 +484,12 @@ def update(self, parameters): (string : :class:`~torch.nn.Parameter`) or an iterable of key/value pairs of type (string, :class:`~torch.nn.Parameter`) """ - if not isinstance(parameters, Iterable): + if not isinstance(parameters, container_abcs.Iterable): raise TypeError("ParametersDict.update should be called with an " "iterable of key/value pairs, but got " + type(parameters).__name__) - if isinstance(parameters, Mapping): + if isinstance(parameters, container_abcs.Mapping): if isinstance(parameters, OrderedDict): for key, parameter in parameters.items(): self[key] = parameter @@ -497,7 +498,7 @@ def update(self, parameters): self[key] = parameter else: for j, p in enumerate(parameters): - if not isinstance(p, Iterable): + if not isinstance(p, container_abcs.Iterable): raise TypeError("ParameterDict update sequence element " "#" + str(j) + " should be Iterable; is" + type(p).__name__) diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index 3cff6a9e9ffba9..2b8ebd642b000a 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -1,10 +1,10 @@ -import collections +from torch._six import container_abcs from itertools import repeat def _ntuple(n): def parse(x): - if isinstance(x, collections.Iterable): + if isinstance(x, container_abcs.Iterable): return x return tuple(repeat(x, n)) return parse diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 963e0bc9591255..d027267053052a 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -9,7 +9,7 @@ import torch.autograd import torch.serialization import re -import collections +from torch._six import container_abcs import contextlib import numbers import warnings @@ -354,7 +354,7 @@ def _run_symbolic_method(op_name, symbolic_fn, args): def _is_onnx_list(value): if not isinstance(value, string_classes) and \ not isinstance(value, torch.Tensor) and \ - isinstance(value, collections.Iterable): + isinstance(value, 
container_abcs.Iterable): return True return False diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 21f69bb82804d3..41c1e916f4d8d1 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -1,4 +1,5 @@ -from collections import defaultdict, Iterable +from collections import defaultdict +from torch._six import container_abcs import torch from copy import deepcopy @@ -123,7 +124,7 @@ def cast(param, value): return value elif isinstance(value, dict): return {k: cast(param, v) for k, v in value.items()} - elif isinstance(value, Iterable): + elif isinstance(value, container_abcs.Iterable): return type(value)(cast(param, v) for v in value) else: return value diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 9d69ab4daf0fc8..0874fd1185028d 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -6,7 +6,7 @@ from . import SequentialSampler, RandomSampler, BatchSampler import signal import functools -import collections +from torch._six import container_abcs import re import sys import threading @@ -187,9 +187,9 @@ def default_collate(batch): return torch.DoubleTensor(batch) elif isinstance(batch[0], string_classes): return batch - elif isinstance(batch[0], collections.Mapping): + elif isinstance(batch[0], container_abcs.Mapping): return {key: default_collate([d[key] for d in batch]) for key in batch[0]} - elif isinstance(batch[0], collections.Sequence): + elif isinstance(batch[0], container_abcs.Sequence): transposed = zip(*batch) return [default_collate(samples) for samples in transposed] @@ -201,9 +201,9 @@ def pin_memory_batch(batch): return batch.pin_memory() elif isinstance(batch, string_classes): return batch - elif isinstance(batch, collections.Mapping): + elif isinstance(batch, container_abcs.Mapping): return {k: pin_memory_batch(sample) for k, sample in batch.items()} - elif isinstance(batch, collections.Sequence): + elif isinstance(batch, container_abcs.Sequence): return [pin_memory_batch(sample) for sample in batch] else: return batch From 4672280b550928e9d4c41adc8a7cecf652b0b0c4 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 13 Sep 2018 15:13:05 -0700 Subject: [PATCH 133/237] Pass Storage by value (#11546) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11546 - Reviewed By: ezyang Differential Revision: D9778043 fbshipit-source-id: b1381cd60a826055ce8771d6c67eac4cc375b3b4 --- aten/src/ATen/core/TensorImpl.cpp | 36 +++++++++++++++++-------------- aten/src/ATen/core/TensorImpl.h | 5 +---- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 1bf6802c7d4d8e..577959c57a65c2 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -17,28 +17,32 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, scalar_type, is_variable) { +namespace { +Storage createStorage_(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator) { // UndefinedTensors and SparseTensors don't have storages. 
if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); + return Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); + } else { + return Storage(); } } +} + +TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) + : TensorImpl(createStorage_(type_id, scalar_type, allocator), type_id, is_variable) { +} -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} - -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) - : storage_(std::move(storage)), - storage_offset_(0), - sizes_{0}, - strides_{1}, - is_contiguous_(true), - numel_(0), - type_id_(type_id), - scalar_type_(scalar_type), - is_variable_(is_variable) {} +TensorImpl::TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable) + :storage_(std::move(storage)), + storage_offset_(0), + sizes_{0}, + strides_{1}, + is_contiguous_(true), + numel_(0), + type_id_(type_id), + scalar_type_(storage_ ? dataTypeToScalarType(storage_.dtype().id()) : ScalarType::Undefined), + is_variable_(is_variable) {} IntList TensorImpl::sizes() const { return sizes_; diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 831c53e23b6d49..acc8efbf86c986 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -23,7 +23,7 @@ namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; @@ -209,8 +209,5 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { ScalarType scalar_type_; bool is_variable_ = false; bool is_wrapped_number_ = false; - - private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); }; } // namespace at From 85ff72348d92c9e4b38dd00bbe42f0ca3e66667f Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Thu, 13 Sep 2018 16:15:56 -0700 Subject: [PATCH 134/237] Only involve tensor device in CUDA -> CPU copy, not current device. (#11592) Summary: This also unifies the device usage between the async and sync case. Fixes https://github.com/pytorch/pytorch/issues/10832. 
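As an aside, a rough sketch (not from this patch; the helper name is made up and error checking is omitted) of the rule the fix enforces: issue the device-to-host copy with the source tensor's device active, not whatever device happens to be current, then restore the caller's device. The real change is the THCTensorCopy.cpp hunk below, and the new test_cuda_primary_ctx.py test asserts that copying a tensor living on cuda:1 to the CPU does not create a CUDA primary context on device 0.

```
#include <cuda_runtime.h>
#include <cstddef>

// Sketch only: save/restore the current device around a D2H copy, keyed off
// the device the source tensor lives on.
void copy_to_host(void* dst, const void* src, size_t nbytes,
                  int tensor_device, cudaStream_t stream) {
  int current_device = 0;
  cudaGetDevice(&current_device);
  if (current_device != tensor_device) {
    cudaSetDevice(tensor_device);   // switch to the device the tensor lives on
  }
  cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);    // keep the synchronous copy semantics
  if (current_device != tensor_device) {
    cudaSetDevice(current_device);  // restore the previously current device
  }
}
```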
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11592 Differential Revision: D9797355 Pulled By: gchanan fbshipit-source-id: e496cd371111cfaf9a6c664167967b395e3d72e9 --- aten/src/THC/generic/THCTensorCopy.cpp | 11 +++++ test/run_test.py | 1 + test/test_cuda_primary_ctx.py | 58 ++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 test/test_cuda_primary_ctx.py diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp index 96ab307182639c..0c20edfbd9fd36 100644 --- a/aten/src/THC/generic/THCTensorCopy.cpp +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -58,6 +58,13 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) { THTensor *selfc = THTensor_(newContiguous)(self); + int tensorDevice = THCTensor_(getDevice)(state, src); + int currentDevice; + THCudaCheck(cudaGetDevice(&currentDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } src = THCTensor_(newContiguous)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -68,6 +75,10 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) stream)); THCudaCheck(cudaStreamSynchronize(stream)); + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } + THCTensor_(free)(state, src); THTensor_(freeCopyTo)(selfc, self); } diff --git a/test/run_test.py b/test/run_test.py index d7af8e47ab8767..1e3c2f60e1df34 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -21,6 +21,7 @@ 'cpp_extensions', 'c10d', 'cuda', + 'cuda_primary_ctx', 'dataloader', 'distributed', 'distributions', diff --git a/test/test_cuda_primary_ctx.py b/test/test_cuda_primary_ctx.py new file mode 100644 index 00000000000000..2006b340aa22de --- /dev/null +++ b/test/test_cuda_primary_ctx.py @@ -0,0 +1,58 @@ +import ctypes +import torch +from common import TestCase, run_tests, skipIfRocm +import unittest + +# NOTE: this needs to be run in a brand new process + +# We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here, +# because if we do that, the TEST_CUDNN line from common_cuda will be executed +# multiple times as well during the execution of this test suite, and it will +# cause CUDA OOM error on Windows.
+TEST_CUDA = torch.cuda.is_available() +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 + +if not TEST_CUDA: + print('CUDA not available, skipping tests') + TestCase = object # noqa: F811 + + +def get_is_primary_context_created(device): + flags = ctypes.cast((ctypes.c_uint * 1)(), ctypes.POINTER(ctypes.c_uint)) + active = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int)) + result = torch.cuda.cudart().cuDevicePrimaryCtxGetState(ctypes.c_int(device), flags, active) + assert result == 0, 'cuDevicePrimaryCtxGetState failed' + return bool(active[0]) + + +class TestCudaPrimaryCtx(TestCase): + @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @skipIfRocm + def test_cuda_primary_ctx(self): + # Ensure context has not been created beforehand + self.assertFalse(get_is_primary_context_created(0)) + self.assertFalse(get_is_primary_context_created(1)) + + x = torch.randn(1, device='cuda:1') + + # We should have only created context on 'cuda:1' + self.assertFalse(get_is_primary_context_created(0)) + self.assertTrue(get_is_primary_context_created(1)) + + print(x) + + # We should still have only created context on 'cuda:1' + self.assertFalse(get_is_primary_context_created(0)) + self.assertTrue(get_is_primary_context_created(1)) + + y = torch.randn(1, device='cpu') + y.copy_(x) + + # We should still have only created context on 'cuda:1' + self.assertFalse(get_is_primary_context_created(0)) + self.assertTrue(get_is_primary_context_created(1)) + + # DO NOT ADD ANY OTHER TESTS HERE! ABOVE TEST REQUIRES FRESH PROCESS + +if __name__ == '__main__': + run_tests() From 8402fde27946d56bf2236f9415864f99b989936c Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:26:34 -0700 Subject: [PATCH 135/237] Revert D9778043: Pass Storage by value Differential Revision: D9778043 Original commit changeset: b1381cd60a82 fbshipit-source-id: 40f1de67e939cb41605978d632105a48a91e7629 --- aten/src/ATen/core/TensorImpl.cpp | 36 ++++++++++++++----------------- aten/src/ATen/core/TensorImpl.h | 5 ++++- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 577959c57a65c2..1bf6802c7d4d8e 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -17,32 +17,28 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -namespace { -Storage createStorage_(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator) { +TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) + : TensorImpl({}, type_id, scalar_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - return Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); - } else { - return Storage(); + storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); } } -} - -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl(createStorage_(type_id, scalar_type, allocator), type_id, is_variable) { -} -TensorImpl::TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable) - :storage_(std::move(storage)), - storage_offset_(0), - sizes_{0}, - strides_{1}, - is_contiguous_(true), - numel_(0), - type_id_(type_id), - scalar_type_(storage_ ? 
dataTypeToScalarType(storage_.dtype().id()) : ScalarType::Undefined), - is_variable_(is_variable) {} +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) + : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} + +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) + : storage_(std::move(storage)), + storage_offset_(0), + sizes_{0}, + strides_{1}, + is_contiguous_(true), + numel_(0), + type_id_(type_id), + scalar_type_(scalar_type), + is_variable_(is_variable) {} IntList TensorImpl::sizes() const { return sizes_; diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index acc8efbf86c986..831c53e23b6d49 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -23,7 +23,7 @@ namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage storage, TensorTypeId type_id, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; @@ -209,5 +209,8 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { ScalarType scalar_type_; bool is_variable_ = false; bool is_wrapped_number_ = false; + + private: + TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); }; } // namespace at From c18510463b36b2c825ae29a120f2481b097f06ed Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:02 -0700 Subject: [PATCH 136/237] Reduce includes in tensor_impl.h (#11643) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11643 - Reduce the tensor_impl.h includes to the bare minimum necessary - Explicitly namespace std:: Reviewed By: jerryzh168 Differential Revision: D9811028 fbshipit-source-id: 44e32720962b35c12a7b2c93605721b9f6c5b254 --- caffe2/core/tensor_impl.h | 70 +++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index ffa3ef61c4cc27..2b07537d43774b 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -1,8 +1,14 @@ #pragma once -#include "caffe2/core/storage.h" +#include -#include +#include + +#include "caffe2/core/allocator.h" +#include "caffe2/core/common.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/context_base.h" // A global boolean variable to control whether we free memory when a Tensor // is shrinked to a smaller size. As a result, a Tensor is always going to @@ -19,14 +25,14 @@ namespace caffe2 { /** * A utility function to convert vector to vector. 
*/ -inline vector ToVectorTIndex(const std::vector& src) { - return vector(src.begin(), src.end()); +inline std::vector ToVectorTIndex(const std::vector& src) { + return std::vector(src.begin(), src.end()); } /** * Return product of all dimensions starting from k */ -inline TIndex size_from_dim_(int k, const vector& dims) { +inline TIndex size_from_dim_(int k, const std::vector& dims) { TIndex r = 1; for (size_t i = k; i < dims.size(); ++i) { r *= dims[i]; @@ -35,7 +41,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { } // Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const vector& dims) { +inline TIndex size_to_dim_(int k, const std::vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; for (int i = 0; i < k; ++i) { @@ -45,7 +51,7 @@ inline TIndex size_to_dim_(int k, const vector& dims) { } // Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const vector& dims) { +inline TIndex size_between_dim_(int k, int l, const std::vector& dims) { CAFFE_ENFORCE((unsigned)l < dims.size()); TIndex r = 1; if (k < l) { @@ -94,12 +100,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // and immediately discard it in Resize() since // reset_tensor will be true and FreeMemory will be called, // we might want to avoid creating Storage twice? - explicit TensorImpl(const vector& dims, at::DeviceType device_type) + explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) : storage_(device_type) { Resize(dims); } - explicit TensorImpl(const vector& dims, at::DeviceType device_type) + explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) : storage_(device_type) { Resize(dims); } @@ -109,7 +115,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ TensorImpl( const TensorImpl& src, - BaseContext* context_for_copy, + at::BaseContext* context_for_copy, at::DeviceType device_type) : storage_(device_type) { CopyFrom(src, context_for_copy); @@ -130,9 +136,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ template TensorImpl( - const vector& dims, - const vector& values, - BaseContext* context) + const std::vector& dims, + const std::vector& values, + at::BaseContext* context) : storage_(context->GetDevicetype(), TypeMeta::Make()) { Resize(dims); CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); @@ -147,9 +153,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template < typename T, typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, BaseContext* context) + TensorImpl(const T& value, at::BaseContext* context) : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(vector{}); + Resize(std::vector{}); context->CopyItemsFromCPU( storage_.dtype(), numel_, &value, mutable_data()); } @@ -174,7 +180,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Since we removed template from tensor, we now store a static * context pointer in tensor, which indicates the type of the tensor. 
*/ - BaseStaticContext* GetStaticContext() const { + at::BaseStaticContext* GetStaticContext() const { return get_static_context(GetDeviceType()); } @@ -185,7 +191,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * TODO(jerryzh): move this to a global registry * that can create context for us */ - std::unique_ptr CreateContext() const { + std::unique_ptr CreateContext() const { return GetStaticContext()->CreateContext(); } @@ -197,12 +203,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * @brief Copies the data from a source tensor, with a contex provided to * carry out the underlying memcpy operation. */ - void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { if ((void*)&src == (void*)this) { return; } if (storage_.dtype() != src.meta()) { - storage_ = Storage(GetDeviceType(), src.meta()); + storage_ = at::Storage(GetDeviceType(), src.meta()); } if (src.size() == -1) { dims_.clear(); @@ -250,7 +256,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * @brief Extend the outer-most dimension of this tensor * to dimension of `num`. */ - void ExtendTo(TIndex num, float growthPct, BaseContext* context) { + void ExtendTo(TIndex num, float growthPct, at::BaseContext* context) { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); CAFFE_ENFORCE(context != nullptr, "Context must be provided."); @@ -266,7 +272,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * growthPct. This ensures that Extend runs on an amortized O(1) time * complexity. */ - void Extend(TIndex num, float growthPct, BaseContext* context) { + void Extend(TIndex num, float growthPct, at::BaseContext* context) { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); @@ -412,7 +418,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Resizes the tensor without touching underlying storage. * This requires the total size of the tensor to remains constant. */ - inline void Reshape(const vector& dims) { + inline void Reshape(const std::vector& dims) { TIndex new_size = 1; for (auto d : dims) { CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); @@ -430,7 +436,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_ = dims; } - inline void Reshape(const vector& dims) { + inline void Reshape(const std::vector& dims) { Reshape(ToVectorTIndex(dims)); } @@ -441,7 +447,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void FreeMemory() { // We'll detach from the old Storage and create a new one - storage_ = Storage(storage_.device_type(), storage_.dtype()); + storage_ = at::Storage(storage_.device_type(), storage_.dtype()); } /** @@ -449,7 +455,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * is very slow since it involves quite some string operations, so do not use * it in your performance-critical code. 
*/ - string DebugString() const { + std::string DebugString() const { std::stringstream ss; ss << "A Tensor of item size " << storage_.itemsize() << " and type " << storage_.dtype().name() << " and dimension ("; @@ -552,7 +558,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } else { int64_t numel = capacity / data_type.itemsize(); // Create a new Storage - storage_ = Storage(data_type, numel, std::move(data_ptr), nullptr, true); + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); } } @@ -613,7 +619,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { storage_.set_dtype(meta); } else { if (storage_.dtype() != meta) { - storage_ = Storage(storage_.device_type(), meta); + storage_ = at::Storage(storage_.device_type(), meta); } } @@ -735,7 +741,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { /** * Returns the dimensions of the tensor as a vector. */ - inline const vector& dims() const { + inline const std::vector& dims() const { return dims_; } @@ -813,11 +819,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { GetStaticContext()->ExtractDeviceOption(device, raw_data()); } - const Storage& storage() { + const at::Storage& storage() { return storage_; } - const Storage& storage() const { + const at::Storage& storage() const { return storage_; } @@ -830,14 +836,14 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // The logic is that if Extend() or ReserveSpace() were ever called, // then subsequent Resize()s will not free up Storage. bool reserved_ = false; - Storage storage_; + at::Storage storage_; // int64_t storage_offset_; private: template < typename T, typename = typename std::enable_if::value>::type> - bool SetDims(const vector& src) { + bool SetDims(const std::vector& src) { auto old_numel = numel_; dims_.resize(src.size()); TIndex new_numel = 1; From 7607b495383397cb89920f7cd794933dfe9a89fa Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:04 -0700 Subject: [PATCH 137/237] s/GetDevicetype/device_type/ (#11656) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11656 The mis-capitalization really sticks up my craw. I know why (we already have a static function named GetDeviceType), but let's name it differently. ``` codemod -d . 
--extensions cc,cpp,cu,cuh,h,py,hpp,TARGETS GetDevicetype device_type ``` Reviewed By: jerryzh168 Differential Revision: D9813544 fbshipit-source-id: fe462f4bc40b03e74921f8cf5ebd9cfc52e7e636 --- aten/src/ATen/core/context_base.h | 6 +++--- caffe2/core/context.h | 2 +- caffe2/core/context_gpu.h | 2 +- caffe2/core/hip/context_hip.h | 2 +- caffe2/core/tensor_impl.h | 6 +++--- caffe2/ideep/utils/ideep_context.h | 2 +- caffe2/mkl/utils/mkl_context.h | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 2ca9a7f6851102..7cf1b7cc174980 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -61,7 +61,7 @@ class AT_CORE_API BaseContext { virtual BaseStaticContext* GetStaticContext() const = 0; /* Sorry for the naming, will get rid of this in future diff */ - virtual DeviceType GetDevicetype() const = 0; + virtual DeviceType device_type() const = 0; virtual void SwitchToDevice(int /*stream_id*/) = 0; @@ -96,13 +96,13 @@ class AT_CORE_API BaseContext { DeviceType type) { if (type == DeviceType::CPU) { CopyBytesToCPU(nbytes, src, dst); - } else if (type == GetDevicetype()) { + } else if (type == device_type()) { CopyBytesSameDevice(nbytes, src, dst); } else { AT_ERROR( "CopyBytesToDevice can only copy to CPU or between same " "device. Can't copy from: ", - GetDevicetype(), + device_type(), " to", type); } diff --git a/caffe2/core/context.h b/caffe2/core/context.h index 4faaea93c6da12..a4e87e8110dc83 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -153,7 +153,7 @@ class CAFFE2_API CPUContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CPU; } diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index cca1b8be6500a8..987c9ffe35299d 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -285,7 +285,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { return cudaStreamQuery(stream) == cudaSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CUDA; } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index cd309e6473be4f..5a7613cf934fd0 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -269,7 +269,7 @@ class HIPContext final : public BaseContext { return hipStreamQuery(stream) == hipSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return HIP; } diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 2b07537d43774b..4abe8c910e3361 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -139,7 +139,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const std::vector& dims, const std::vector& values, at::BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { + : storage_(context->device_type(), TypeMeta::Make()) { Resize(dims); CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); context->CopyItemsFromCPU( @@ -154,7 +154,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { typename T, typename = typename std::enable_if::value>::type> TensorImpl(const T& value, at::BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { + : storage_(context->device_type(), TypeMeta::Make()) { Resize(std::vector{}); context->CopyItemsFromCPU( storage_.dtype(), 
numel_, &value, mutable_data()); @@ -236,7 +236,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); } else { CAFFE_ENFORCE( - context->GetDevicetype() == src.GetDeviceType(), + context->device_type() == src.GetDeviceType(), "Type for provided context does not match the type of source"); context->CopyBytesToDevice( nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index c8657728c57e76..f50a4f34c66789 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -119,7 +119,7 @@ class IDEEPContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return IDEEP; } diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 636ebf2217eac8..0a7b5808a446be 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -127,7 +127,7 @@ class MKLContext : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return MKLDNN; } From 02980d7f8ca707be9f111d5b34d7654dafdefb03 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:06 -0700 Subject: [PATCH 138/237] Refactor Tensor/TensorImpl constructors. (#11657) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11657 Previously, we had a constructor in TensorImpl for every constructor in Tensor. This was unnecessary and wordy: Tensor is the user-visible class, so it deserves the constructors, but TensorImpl is internal and doesn't need it. So I replaced TensorImpl with a single, Storage accepting constructor, and then rewrote Tensor to use that constructor. Reviewed By: jerryzh168 Differential Revision: D9813742 fbshipit-source-id: 7501b54fe5f39180f1bc07573fd7c1640b0f4e89 --- caffe2/core/tensor.h | 76 ++++++++++++++++++++++++++++----------- caffe2/core/tensor_impl.h | 71 +----------------------------------- 2 files changed, 56 insertions(+), 91 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 5564034e545837..f3ecfae765a615 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -50,45 +50,75 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(DeviceType type) - : impl_(c10::make_intrusive(type)) {} - + explicit Tensor(Storage storage) + : impl_(c10::make_intrusive(std::move(storage))) {} + + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? 
+ Resize(dims); + } explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + Resize(dims); + } + /** + * context_for_copy is required to have the same DeviceType as src + */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - context_for_copy, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src, context_for_copy); + } + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ Tensor(const Tensor& src, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src); + } + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : impl_(c10::make_intrusive( - dims, - values, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); + context->CopyItemsFromCPU( + storage().dtype(), size(), values.data(), mutable_data()); + } + /** + * @brief Creates a scalar tensor, and fills its content with the given value. + * The type of tensor will be decided by the context parameter + */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : impl_(c10::make_intrusive( - value, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage().dtype(), size(), &value, mutable_data()); + } Tensor Clone() const { Tensor x(GetDeviceType()); @@ -275,6 +305,10 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } + + const Storage& storage() { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 4abe8c910e3361..6771612022302b 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -88,77 +88,8 @@ inline int canonical_axis_index_(int axis_index, int ndims) { class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? 
- explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - at::BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const std::vector& dims, - const std::vector& values, - at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. - * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(std::vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} /** * @brief Delete the copy constructor and use Clone explicitly From e1cd220b901c7e1bb37bf822c356ba3f8e6b3134 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 16:29:09 -0700 Subject: [PATCH 139/237] Reimplement swap() using default move constructor. (#11659) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11659 This is less error-prone and less code. Reviewed By: smessmer Differential Revision: D9814536 fbshipit-source-id: 028510e31e2fa7a9fa11c1398b0743c5cd085dd5 --- caffe2/core/tensor.h | 3 ++- caffe2/core/tensor_impl.h | 21 +++------------------ 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index f3ecfae765a615..a167658060e7fc 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -188,7 +188,8 @@ class CAFFE2_API Tensor final { // swap method swaps the CONTENTS of the tensors, while std::swap // swaps the POINTERS. void swap(const Tensor& other) const noexcept { - impl_.get()->swap(*other.impl_.get()); + // NB: use get() to get a non-const pointer! 
+ std::swap(*impl_.get(), *other.impl_.get()); } void ShareData(const Tensor& src) const { diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 6771612022302b..4c146e0d257530 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -91,19 +91,10 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} - /** - * @brief Delete the copy constructor and use Clone explicitly - */ - TensorImpl(const TensorImpl& src) = delete; - - TensorImpl(TensorImpl&& src) noexcept { - swap(src); - } - + TensorImpl(const TensorImpl&) = default; + TensorImpl& operator=(const TensorImpl&) = default; + TensorImpl(TensorImpl&&) = default; TensorImpl& operator=(TensorImpl&&) = default; - // Note(jiayq): possibly a rule-of-three violation, but we explicitly - // discourage the use of = for Tensors. - TensorImpl& operator=(const TensorImpl& src) = delete; virtual ~TensorImpl() noexcept {} @@ -397,12 +388,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return ss.str(); } - void swap(TensorImpl& other) noexcept { - std::swap(dims_, other.dims_); - std::swap(numel_, other.numel_); - std::swap(storage_, other.storage_); - } - /** * @brief Shares the data with another tensor. * From 760679352e8a6fda08b55c7e0cf2c916c7ed6d53 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Thu, 13 Sep 2018 18:19:24 -0700 Subject: [PATCH 140/237] Move Pixel Shuffle to ATen (#9721) Summary: #9692 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9721 Differential Revision: D8955829 Pulled By: SsnL fbshipit-source-id: 4f4d1c7720b6f757fbef9a10f70209ae76f61399 --- aten/src/ATen/native/PixelShuffle.cpp | 34 +++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 2 ++ torch/nn/functional.py | 39 ++++++++-------------- torch/nn/modules/pixelshuffle.py | 16 ++++----- 4 files changed, 57 insertions(+), 34 deletions(-) create mode 100644 aten/src/ATen/native/PixelShuffle.cpp diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp new file mode 100644 index 00000000000000..1f93ecbc8235ab --- /dev/null +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -0,0 +1,34 @@ +#include "ATen/native/TensorTransformations.h" + +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { + AT_ASSERTM(self.dim() == 4, + "pixel_shuffle expects 4D input, but got input with sizes ",self.sizes()); + int64_t b = self.size(0); + int64_t c = self.size(1); + int64_t h = self.size(2); + int64_t w = self.size(3); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + AT_ASSERTM(c % upscale_factor_squared == 0, + "pixel_shuffle expects input channel to be divisible by square of " + "upscale_factor, but got input with sizes ", self.sizes(), + ", upscale_factor=", upscale_factor, + ", and self.size(1)=", c, " is not divisible by ", upscale_factor_squared); + int64_t oc = c / upscale_factor_squared; + int64_t oh = h * upscale_factor; + int64_t ow = w * upscale_factor; + + auto input_reshaped = self.reshape({b, oc, upscale_factor, upscale_factor, h, w}); + return input_reshaped.permute({0 /* b */, 1 /* oc */, 4 /* h */, 2 /* 1st upscale_factor */, 5 /* w */, 3 /* 2nd upscale_factor */}) + .reshape({b, oc, oh, ow}); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c15aefa2f671d6..5c99d7c97e9b3e 
100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1213,6 +1213,8 @@ - func: permute(Tensor self, IntList dims) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. +- func: pixel_shuffle(Tensor self, int64_t upscale_factor) -> Tensor + - func: pin_memory(Tensor self) -> Tensor variants: function, method diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 243974e152bb27..01d3775d5f0927 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1918,36 +1918,23 @@ def multi_margin_loss(input, target, p=1, margin=1, weight=None, size_average=No return torch._C._nn.multi_margin_loss(input, target, p, margin, weight, reduction) -def pixel_shuffle(input, upscale_factor): - r"""Rearranges elements in a tensor of shape :math:`[*, C*r^2, H, W]` to a - tensor of shape :math:`[C, H*r, W*r]`. +pixel_shuffle = _add_docstr(torch.pixel_shuffle, r""" +Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` to a +tensor of shape :math:`(C, H \times r, W \times r)`. - See :class:`~torch.nn.PixelShuffle` for details. +See :class:`~torch.nn.PixelShuffle` for details. - Args: - input (Tensor): Input - upscale_factor (int): factor to increase spatial resolution by - - Examples:: - - >>> ps = nn.PixelShuffle(3) - >>> input = torch.empty(1, 9, 4, 4) - >>> output = ps(input) - >>> print(output.size()) - torch.Size([1, 1, 12, 12]) - """ - batch_size, channels, in_height, in_width = input.size() - channels //= upscale_factor ** 2 - - out_height = in_height * upscale_factor - out_width = in_width * upscale_factor +Args: + input (Tensor): the input tensor + upscale_factor (int): factor to increase spatial resolution by - input_view = input.contiguous().view( - batch_size, channels, upscale_factor, upscale_factor, - in_height, in_width) +Examples:: - shuffle_out = input_view.permute(0, 1, 4, 2, 5, 3).contiguous() - return shuffle_out.view(batch_size, channels, out_height, out_width) + >>> input = torch.randn(1, 9, 4, 4) + >>> output = torch.nn.functional.pixel_shuffle(input, 3) + >>> print(output.size()) + torch.Size([1, 1, 12, 12]) +""") def upsample(input, size=None, scale_factor=None, mode='nearest', align_corners=None): diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py index e68946d7c7fe90..0e3883120ca352 100644 --- a/torch/nn/modules/pixelshuffle.py +++ b/torch/nn/modules/pixelshuffle.py @@ -3,28 +3,28 @@ class PixelShuffle(Module): - r"""Rearranges elements in a Tensor of shape :math:`(*, r^2C, H, W)` to a - tensor of shape :math:`(C, rH, rW)`. + r"""Rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(C, H \times r, W \times r)`. This is useful for implementing efficient sub-pixel convolution with a stride of :math:`1/r`. Look at the paper: `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ - by Shi et. al (2016) for more details + by Shi et. al (2016) for more details. 
Args: upscale_factor (int): factor to increase spatial resolution by Shape: - - Input: :math:`(N, C * \text{upscale\_factor}^2, H, W)` - - Output: :math:`(N, C, H * \text{upscale\_factor}, W * \text{upscale\_factor})` + - Input: :math:`(N, C \times \text{upscale_factor}^2, H, W)` + - Output: :math:`(N, C, H \times \text{upscale_factor}, W \times \text{upscale_factor})` Examples:: - >>> ps = nn.PixelShuffle(3) - >>> input = torch.tensor(1, 9, 4, 4) - >>> output = ps(input) + >>> pixel_shuffle = nn.PixelShuffle(3) + >>> input = torch.randn(1, 9, 4, 4) + >>> output = pixel_shuffle(input) >>> print(output.size()) torch.Size([1, 1, 12, 12]) From 513fd3dd36f146cc0ed45437e58372a4b46b93b2 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 13 Sep 2018 19:15:15 -0700 Subject: [PATCH 141/237] Improve doc of `torch.nn.functional.pad` (#11623) Summary: I'm reading the doc of `torch.nn.functional.pad` and it looks a bit confusing to me. Hopefully this PR makes it clearer. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11623 Differential Revision: D9818255 Pulled By: soumith fbshipit-source-id: 4f6b17b0211c6927007f44bfdf42df5f84d47536 --- torch/nn/functional.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 01d3775d5f0927..61c0fdce5fe048 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -2227,22 +2227,22 @@ def affine_grid(theta, size): def pad(input, pad, mode='constant', value=0): r"""Pads tensor. - `Nd` constant padding: The number of dimensions to pad is - :math:`\left\lfloor\frac{len(padding)}{2}\right\rfloor` and the dimensions that get padded begins with the - last dimension and moves forward. See below for examples. - - `1D`, `2D` and `3D` "reflect" / "replicate" padding: - for 1D: - 3D input tensor with padding of the form `(padLeft, padRight)` - for 2D: - 4D input tensor with padding of the form `(padLeft, padRight, padTop, padBottom)`. - for 3D: - 5D input tensor with padding of the form - `(padLeft, padRight, padTop, padBottom, padFront, padBack)`. No "reflect" implementation. - - See :class:`torch.nn.ConstantPad2d`, :class:`torch.nn.ReflectionPad2d`, and - :class:`torch.nn.ReplicationPad2d` for concrete examples on how each of the - padding modes works. + Pading size: + The number of dimensions to pad is :math:`\left\lfloor\frac{\text{len(pad)}}{2}\right\rfloor` + and the dimensions that get padded begins with the last dimension and moves forward. + For example, to pad the last dimension of the input tensor, then `pad` has form + `(padLeft, padRight)`; to pad the last 2 dimensions of the input tensor, then use + `(padLeft, padRight, padTop, padBottom)`; to pad the last 3 dimensions, use + `(padLeft, padRight, padTop, padBottom, padFront, padBack)`. + + Padding mode: + See :class:`torch.nn.ConstantPad2d`, :class:`torch.nn.ReflectionPad2d`, and + :class:`torch.nn.ReplicationPad2d` for concrete examples on how each of the + padding modes works. Constant padding is implemented for arbitrary dimensions. + Replicate padding is implemented for padding the last 3 dimensions of 5D input + tensor, or the last 2 dimensions of 4D input tensor, or the last dimension of + 3D input tensor. Reflect padding is only implemented for padding the last 2 + dimensions of 4D input tensor, or the last dimension of 3D input tensor. 
Args: input (Tensor): `Nd` tensor From 98e04db955f0d142a132ab615d3188db78c18608 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Thu, 13 Sep 2018 19:23:06 -0700 Subject: [PATCH 142/237] Implement requires_grad propagation in the JIT (#11586) Summary: Previously, we would pretty much assume that all floating point tensors do require grad, which might result in some unnecessary compute. I don't really like the fact that `TensorType` uses `tensor.is_variable() && tensor.requires_grad()` to infer the value of `requires_grad`, but changing constants to keep variables turns out to be pretty hard. I got halfway there, but it would still need some more work. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11586 Reviewed By: ezyang Differential Revision: D9813648 Pulled By: apaszke fbshipit-source-id: 77f77756d18ff7632fca3aa68ce855e1d7f3bdb8 --- test/expect/TestJit.test_cpp_cuda.expect | 36 ++--- ...pt.test_milstm_fusion_cuda-backward.expect | 91 ++++++------- test/test_jit.py | 6 +- torch/CMakeLists.txt | 1 + torch/csrc/jit/autodiff.cpp | 78 ++++------- torch/csrc/jit/autodiff.h | 4 +- torch/csrc/jit/graph_executor.cpp | 14 +- torch/csrc/jit/init.cpp | 4 +- torch/csrc/jit/ir.h | 3 + .../jit/passes/requires_grad_analysis.cpp | 128 ++++++++++++++++++ .../csrc/jit/passes/requires_grad_analysis.h | 15 ++ torch/csrc/jit/test_jit.cpp | 23 ++-- torch/csrc/jit/type.h | 44 ++++-- 13 files changed, 287 insertions(+), 160 deletions(-) create mode 100644 torch/csrc/jit/passes/requires_grad_analysis.cpp create mode 100644 torch/csrc/jit/passes/requires_grad_analysis.h diff --git a/test/expect/TestJit.test_cpp_cuda.expect b/test/expect/TestJit.test_cpp_cuda.expect index 50d2cbe963833c..451f1f9329601c 100644 --- a/test/expect/TestJit.test_cpp_cuda.expect +++ b/test/expect/TestJit.test_cpp_cuda.expect @@ -129,38 +129,38 @@ graph(%0 : Float(2, 3, 4) } testDifferentiateWithRequiresGrad -graph(%0 : Float(2, 3, 4) - %1 : Float(2, 3, 4)) { - %2 : Float(2, 3, 4) = aten::mul(%1, %1) +graph(%0 : Float(*) + %1 : Float(*)) { + %2 : Float(*) = aten::mul(%1, %1) %3 : int = prim::Constant[value=1]() - %4 : Float(2, 3, 4) = aten::add(%2, %1, %3) + %4 : Float(*) = aten::add(%2, %1, %3) %5 : int = prim::Constant[value=1]() - %6 : Float(2, 3, 4) = aten::add(%4, %0, %5) - %7 : Float(2, 3, 4) = aten::mul(%6, %0) + %6 : Float(*) = aten::add(%4, %0, %5) + %7 : Float(*) = aten::mul(%6, %0) %8 : int = prim::Constant[value=1]() - %9 : Float(2, 3, 4) = aten::add(%7, %1, %8) + %9 : Float(*) = aten::add(%7, %1, %8) return (%4, %9, %6); } -graph(%0 : Float(2, 3, 4) - %1 : Float(2, 3, 4) - %2 : Float(2, 3, 4) - %3 : Float(2, 3, 4)) { +graph(%0 : Float(*) + %1 : Float(*) + %2 : Float(*) + %3 : Float(*)) { %4 : int = prim::Constant[value=1]() - %5 : Float(2, 3, 4), %6 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%0) + %5 : Float(*), %6 : Dynamic = prim::GradOf[name="aten::add"](%0) block0() { - %7 : Float(2, 3, 4) = aten::mul(%0, %4) + %7 : Dynamic = aten::mul(%0, %4) -> (%0, %7) } - %8 : Float(2, 3, 4), %9 : Float(2, 3, 4) = prim::GradOf[name="aten::mul"](%5) + %8 : Dynamic, %9 : Dynamic = prim::GradOf[name="aten::mul"](%5) block0() { - %10 : Float(2, 3, 4) = aten::mul(%5, %2) - %11 : Float(2, 3, 4) = aten::mul(%5, %3) + %10 : Dynamic = aten::mul(%5, %2) + %11 : Dynamic = aten::mul(%5, %3) -> (%10, %11) } %12 : Dynamic = prim::AutogradAdd(%1, %8) - %13 : Float(2, 3, 4), %14 : Float(2, 3, 4) = prim::GradOf[name="aten::add"](%12) + %13 : Dynamic, %14 : Dynamic = prim::GradOf[name="aten::add"](%12) block0() { - %15 : Float(2, 
3, 4) = aten::mul(%12, %4) + %15 : Dynamic = aten::mul(%12, %4) -> (%12, %15) } %16 : Dynamic = prim::AutogradAdd(%9, %14) diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index 5a099e3b3e6903..fb41a574947080 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -26,19 +26,15 @@ graph(%0 : Float(*, *) %cellgate : Float(*, *) %outgate : Float(*, *) %27 : Float(*, *)) { - %28 : Float(*, *), %29 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %17, %1, %27, %0) - %30 : Float(*, *), %31 : Float(*, *), %32 : Float(*, *), %33 : Float(*, *), %34 : Float(*, *), %35 : Float(*, *) = prim::FusionGroup_1[device=0](%14, %15, %Wx, %28, %Uz, %22, %16) - %36 : Float(*, *) = aten::t(%20) - %37 : Float(*, *) = aten::mm(%32, %36) - %38 : Float(*, *) = aten::t(%13) - %39 : Float(*, *) = aten::mm(%38, %32) + %28 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %17, %0, %27, %1) + %29 : Float(*, *), %30 : Float(*, *), %31 : Float(*, *), %32 : Float(*, *), %33 : Float(*, *), %34 : Float(*, *) = prim::FusionGroup_1[device=0](%14, %15, %Wx, %28, %Uz, %22, %16) + %35 : Float(*, *) = aten::t(%13) + %36 : Float(*, *) = aten::mm(%35, %31) + %37 : Float(*, *) = aten::t(%36) + %38 : Float(*, *) = aten::t(%12) + %39 : Float(*, *) = aten::mm(%38, %29) %40 : Float(*, *) = aten::t(%39) - %41 : Float(*, *) = aten::t(%18) - %42 : Float(*, *) = aten::mm(%30, %41) - %43 : Float(*, *) = aten::t(%12) - %44 : Float(*, *) = aten::mm(%43, %30) - %45 : Float(*, *) = aten::t(%44) - return (%45, %42, %40, %37, %31, %33, %34, %35, %29); + return (%40, %37, %30, %32, %33, %34); } with prim::FusionGroup_0 = graph(%0 : Float(*, *) %1 : Float(*, *) @@ -48,45 +44,42 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %5 : Float(*, *) %6 : Float(*, *) %7 : Float(*, *)) { - %8 : Float(*, *) = aten::mul(%5, %3) - %9 : Float(*, *) = aten::mul(%6, %6) - %10 : Float(*, *) = aten::neg(%9) - %11 : int = prim::Constant[value=1]() + %8 : Float(*, *) = aten::mul(%7, %6) + %9 : Float(*, *) = aten::mul(%7, %3) + %10 : Float(*, *) = aten::mul(%6, %6) + %11 : Float(*, *) = aten::neg(%10) %12 : int = prim::Constant[value=1]() - %13 : Float(*, *) = aten::add(%10, %12, %12) - %14 : Float(*, *) = aten::mul(%8, %13) + %13 : Float(*, *) = aten::add(%11, %12, %12) + %14 : Float(*, *) = aten::mul(%9, %13) %15 : int = prim::Constant[value=1]() - %16 : int = prim::Constant[value=1]() - %17 : Float(*, *) = aten::add(%7, %14, %16) - %18 : Float(*, *) = aten::mul(%17, %1) - %19 : Float(*, *) = aten::mul(%5, %6) - %20 : int = prim::Constant[value=1]() - %21 : Float(*, *) = aten::mul(%17, %20) - %22 : Float(*, *) = aten::mul(%21, %2) - %23 : Float(*, *) = aten::mul(%21, %0) - %24 : Float(*, *) = aten::mul(%17, %4) - %25 : Float(*, *) = aten::neg(%3) - %26 : int = prim::Constant[value=1]() - %27 : Float(*, *) = aten::add(%25, %26, %26) - %28 : Float(*, *) = aten::mul(%19, %3) - %29 : Float(*, *) = aten::mul(%28, %27) - %30 : Float(*, *) = aten::mul(%2, %2) - %31 : Float(*, *) = aten::neg(%30) - %32 : int = prim::Constant[value=1]() - %33 : Float(*, *) = aten::add(%31, %32, %32) - %34 : Float(*, *) = aten::mul(%23, %33) - %35 : Float(*, *) = aten::neg(%1) - %36 : int = prim::Constant[value=1]() - %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%24, %1) - %39 : Float(*, *) = 
aten::mul(%38, %37) - %40 : Float(*, *) = aten::neg(%0) - %41 : int = prim::Constant[value=1]() - %42 : Float(*, *) = aten::add(%40, %41, %41) - %43 : Float(*, *) = aten::mul(%22, %0) - %44 : Float(*, *) = aten::mul(%43, %42) - %45 : Float(*, *) = prim::FusedConcat[dim=1](%44, %39, %34, %29) - return (%45, %18); + %16 : Float(*, *) = aten::add(%5, %14, %15) + %17 : int = prim::Constant[value=1]() + %18 : Float(*, *) = aten::mul(%16, %17) + %19 : Float(*, *) = aten::mul(%18, %2) + %20 : Float(*, *) = aten::mul(%18, %0) + %21 : Float(*, *) = aten::mul(%16, %4) + %22 : Float(*, *) = aten::neg(%3) + %23 : int = prim::Constant[value=1]() + %24 : Float(*, *) = aten::add(%22, %23, %23) + %25 : Float(*, *) = aten::mul(%8, %3) + %26 : Float(*, *) = aten::mul(%25, %24) + %27 : Float(*, *) = aten::mul(%2, %2) + %28 : Float(*, *) = aten::neg(%27) + %29 : int = prim::Constant[value=1]() + %30 : Float(*, *) = aten::add(%28, %29, %29) + %31 : Float(*, *) = aten::mul(%20, %30) + %32 : Float(*, *) = aten::neg(%1) + %33 : int = prim::Constant[value=1]() + %34 : Float(*, *) = aten::add(%32, %33, %33) + %35 : Float(*, *) = aten::mul(%21, %1) + %36 : Float(*, *) = aten::mul(%35, %34) + %37 : Float(*, *) = aten::neg(%0) + %38 : int = prim::Constant[value=1]() + %39 : Float(*, *) = aten::add(%37, %38, %38) + %40 : Float(*, *) = aten::mul(%19, %0) + %41 : Float(*, *) = aten::mul(%40, %39) + %42 : Float(*, *) = prim::FusedConcat[dim=1](%41, %36, %31, %26) + return (%42); } with prim::FusionGroup_1 = graph(%0 : Float(*) %1 : Float(*) diff --git a/test/test_jit.py b/test/test_jit.py index 54bc33e8542e43..985a4f760466b7 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -119,9 +119,9 @@ def canonical(graph): def get_lstm_inputs(device, training=False): - input = torch.randn(3, 10, dtype=torch.float, device=device) - hx = torch.randn(3, 20, dtype=torch.float, device=device) - cx = torch.randn(3, 20, dtype=torch.float, device=device) + input = torch.randn(3, 10, dtype=torch.float, device=device, requires_grad=training) + hx = torch.randn(3, 20, dtype=torch.float, device=device, requires_grad=training) + cx = torch.randn(3, 20, dtype=torch.float, device=device, requires_grad=training) module = nn.LSTMCell(10, 20).to(device, torch.float) # Just to allocate weights with correct sizes if training: params = tuple(module.parameters()) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 427270d2a89fa1..04a72819717bad 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -174,6 +174,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 3d53ad3967c8cd..ab6a07cbb1d959 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -72,25 +72,15 @@ bool isDifferentiable(Graph & g) { } -bool outputRequiresGrad(Node* node, std::function requires_grad) { - switch (node->kind()) { - case aten::le: - case aten::ge: - case aten::lt: - case aten::gt: - case aten::ne: - case aten::eq: - return false; - case aten::type_as: - // type_as has two inputs, the second of which (setting type) might require grad, - // but it still won't affect the output of type_as requiring grad. 
- return requires_grad(node->inputs().at(0)); - default: - return std::any_of(node->inputs().begin(), node->inputs().end(), requires_grad); - } -} - static std::vector gradientForNode(Node* node, ArrayRef grad_values) { + static const OperatorSet comparison_ops = { + "aten::lt(Tensor self, Tensor other) -> Tensor", + "aten::le(Tensor self, Tensor other) -> Tensor", + "aten::gt(Tensor self, Tensor other) -> Tensor", + "aten::ge(Tensor self, Tensor other) -> Tensor", + "aten::eq(Tensor self, Tensor other) -> Tensor", + "aten::ne(Tensor self, Tensor other) -> Tensor" + }; const auto build_sym_grad = [node](const std::vector& grads) -> std::vector { auto inputs = fmap(node->inputs()); auto outputs = fmap(node->outputs()); @@ -212,6 +202,8 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val tensor_grads.push_back(nullptr); // for attr::dim return tensor_grads; } + } else if (comparison_ops.find(node)) { + return {nullptr, nullptr}; } else if (node->kind() == prim::Constant) { return {}; @@ -226,28 +218,6 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val return fmap(sym_grads, [](const SymbolicVariable &v) { return v.value(); }); } -static value_set findAllRequiresGradNodes( - Graph& graph, const std::vector& input_requires_grad) { - JIT_ASSERT(graph.inputs().size() == input_requires_grad.size()); - std::unordered_set requires_grad_set; - const auto requires_grad = [&](Value *v) { return requires_grad_set.count(v) > 0; }; - - auto inputs = graph.inputs(); - for (size_t i = 0, num_inputs = inputs.size(); i < num_inputs; ++i) { - if (!input_requires_grad[i]) continue; - requires_grad_set.emplace(inputs[i]); - } - - for (Node * node : graph.nodes()) { - if (!outputRequiresGrad(node, requires_grad)) continue; - for (Value * output : node->outputs()) - requires_grad_set.emplace(output); - } - - return requires_grad_set; -} - - // If we have a function y = f(x) with jacobian J, the backwards of f is dx = J^t dy. // Note that because the backwards always implements this matrix multiply, // we know that it maps an input vector of zeros to an output vector of zero @@ -277,13 +247,11 @@ static std::vector linearGradientForNode(Node* node, ArrayRef gr } struct ReverseDetails { - ReverseDetails(value_map&& grad_map, value_set&& requires_grad_set, Block * reverse_block) + ReverseDetails(value_map&& grad_map, Block * reverse_block) : grad_map(std::move(grad_map)) - , requires_grad_set(std::move(requires_grad_set)) , reverse_block(reverse_block) {} value_map grad_map; - value_set requires_grad_set; Block * reverse_block; }; @@ -305,8 +273,7 @@ static Value* createAutogradAdd(Value* a, Value* b) { // and vjp outputs for all primal inputs that require_grad // - grad_desc has df_input_vjps and df_output_vjps set // (but df_input_vjps will be modified later as well) -static ReverseDetails addReverseInline(Gradient& grad_desc, - const std::vector& input_requires_grad) { +static ReverseDetails addReverseInline(Gradient& grad_desc) { auto & graph = *grad_desc.f; // note: reverse_node is intentionally not inserted to avoid // accidentally acting on it (e.g. 
in elminate dead code), @@ -314,8 +281,6 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, auto reverse_node = graph.create(prim::Reverse, 0); auto reverse_block = reverse_node->addBlock(); WithInsertPoint guard(reverse_block); - auto requires_grad_set = findAllRequiresGradNodes(graph, input_requires_grad); - const auto requires_grad = [&](Value *v) { return requires_grad_set.count(v) > 0; }; value_map grad_map; // x -> dx mapping const auto get_grad = [&](Value* v) -> Value* { @@ -337,7 +302,7 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, auto outputs = graph.outputs(); for (size_t i = 0, num_outputs = outputs.size(); i < num_outputs; ++i) { Value * output = outputs[i]; - if (!requires_grad(output)) + if (!output->requires_grad()) continue; Value * output_grad = reverse_block->addInput()->setType(output->type()); set_grad(output, output_grad); @@ -347,12 +312,15 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, for (auto it = graph.nodes().rbegin(), end = graph.nodes().rend(); it != end; ++it) { Node *node = *it; auto inputs = node->inputs(); - if (!outputRequiresGrad(node, requires_grad)) continue; + auto outputs = node->outputs(); + if (std::all_of(outputs.begin(), outputs.end(), [](Value *v) { return !v->requires_grad(); })) { + continue; + } value_list grad_inputs = linearGradientForNode(node, fmap(node->outputs(), get_grad)); JIT_ASSERT(grad_inputs.size() == node->inputs().size()); for (size_t i = 0, num_inputs = grad_inputs.size(); i < num_inputs; ++i) { - if (!requires_grad(inputs[i])) continue; + if (!inputs[i]->requires_grad()) continue; // NB: Not returning a gradient w.r.t. a value that requires grad is normal if the // input is non-differentiable. This happens e.g. in the aten::type_as case. if (!grad_inputs[i]) continue; @@ -363,7 +331,7 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, auto inputs = graph.inputs(); for (size_t i = 0, num_inputs = inputs.size(); i < num_inputs; ++i) { Value * input = inputs[i]; - if (!requires_grad(input)) + if (!input->requires_grad()) continue; // NB: Not having a gradient defined w.r.t. an input to the graph which requires grad // can happen and is not an error. It might have been used only in non-differentiable @@ -373,7 +341,7 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, reverse_block->registerOutput(get_grad(input)); grad_desc.df_output_vjps.push_back(i); } - return ReverseDetails(std::move(grad_map), std::move(requires_grad_set), reverse_block); + return ReverseDetails(std::move(grad_map), reverse_block); } // Any temporary value from the primal graphs needs to be captured for later use in the @@ -500,7 +468,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { for (size_t i = grad_desc.f_real_outputs; i < graph.outputs().size(); ++i) { Value * tmp = graph.outputs().at(i); // Add VJP inputs only for intermediates that actually required grad. 
- if (rev_info.requires_grad_set.count(tmp) == 0) continue; + if (!tmp->requires_grad()) continue; Value * tmp_vjp_in = reverse_block->addInput()->setType(tmp->type()); Value * tmp_vjp_prev = rev_info.grad_map.at(tmp); // This is quite weird because we can't first make a sum and then replace all uses @@ -536,7 +504,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& graph) { Gradient grad_desc; // Take ownership of the graph JIT_ASSERTM(graph.use_count() == 1, @@ -547,7 +515,7 @@ Gradient differentiate(std::shared_ptr& graph, const std::vector& r WithInsertPoint guard(grad_desc.f->block()); // Fills in df_input_vjps and df_output_vjps - auto rev_info = addReverseInline(grad_desc, requires_grad); + auto rev_info = addReverseInline(grad_desc); // Lift constants captured for the reverse graph into it liftConstants(grad_desc, rev_info); // addReverseInline has to call gradientForNode if *any* of the outputs diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index ea2b7a1170efeb..ffb07a767083cc 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -85,9 +85,7 @@ struct Gradient { // - Interpret df // - Wrap outputs of df into Variables (that don't require grad) }; -// XXX: When calling this function, graph should have complete type information. -// Use the shape analysis pass to fill in the gaps if it doesn't. -TORCH_API Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad); +TORCH_API Gradient differentiate(std::shared_ptr& graph); // can we take a derivative of this node symbolically? TORCH_API bool isDifferentiable(Node * n); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index d77f1b2be1aa28..1202713121362d 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -23,6 +23,7 @@ #include "torch/csrc/jit/passes/lower_grad_of.h" #include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/inline_autodiff_subgraphs.h" +#include "torch/csrc/jit/passes/requires_grad_analysis.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" #include "torch/csrc/jit/custom_operator.h" @@ -384,6 +385,7 @@ struct GraphExecutorImpl { // anyway, so it's better to run it first. ConstantPropagation(opt_graph); PropagateInputShapes(*opt_graph, spec); + PropagateRequiresGrad(opt_graph, spec); // Phase 3. Run differentiable optimizations (i.e. simple graph rewrites that // we can still execute using autograd). @@ -396,18 +398,8 @@ struct GraphExecutorImpl { if (needsGradient(opt_graph, spec)) { auto diff_nodes = CreateAutodiffSubgraphs(*opt_graph); for (Node * dnode : diff_nodes) { - // XXX: we don't have requires_grad information on the intermediate values, - // so we conservatively assume it's always true (on tensor inputs). auto diff_graph = std::move(dnode->g(attr::Subgraph)); - auto requires_grads = fmap(diff_graph->inputs(), [](Value* v) { - // NB: only floating-point inputs can have requires_grad=True. If we - // don't have type information, we have to assume that it's true. 
- if (auto tensor_type = v->type()->cast()) { - return at::isFloatingType(tensor_type->scalarType()); - } - return v->type()->isSubtypeOf(DynamicType::get()); - }); - Gradient gradient = differentiate(diff_graph, requires_grads); + Gradient gradient = differentiate(diff_graph); runNondiffOptimization(gradient.f); packGradient(gradient, dnode); } diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 0d607c3c364a80..5484c04716c406 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -116,12 +116,12 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_fixup_onnx_loops", FixupONNXLoops) .def("_jit_pass_canonicalize_ops", CanonicalizeOps) .def("_jit_pass_specialize_undef", specializeUndef) - .def("_jit_differentiate", [](Graph &g, const std::vector& requires_grad) { + .def("_jit_differentiate", [](Graph &g) { // the python binding slightly differs in semantics // it makes a copy of the input Graph, and works on that // jit::differentiate mutates the input Graph auto g_clone = g.copy(); - return differentiate(g_clone, requires_grad); + return differentiate(g_clone); }); py::class_(m, "CompleteArgumentSpec") diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 5e70adfe400916..58e702b771acaf 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -190,6 +190,9 @@ struct Value { JIT_ASSERT(type_ != nullptr); return type_; } + bool requires_grad() const { + return type()->requires_grad(); + } bool isTensor() const { return type()->kind() == TypeKind::CompleteTensorType; } diff --git a/torch/csrc/jit/passes/requires_grad_analysis.cpp b/torch/csrc/jit/passes/requires_grad_analysis.cpp new file mode 100644 index 00000000000000..8b17859af02c07 --- /dev/null +++ b/torch/csrc/jit/passes/requires_grad_analysis.cpp @@ -0,0 +1,128 @@ +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/type.h" +#include "torch/csrc/jit/argument_spec.h" +#include "torch/csrc/jit/operator.h" + +#include + +namespace torch { namespace jit { + +namespace { + +bool getRequiresGrad(Value * value) { + return value->requires_grad(); +} + +void setRequiresGrad(Value * value, bool req_value) { + if (auto type = value->type()->cast()) { + value->setType(type->withRequiresGrad(req_value)); + } +} + +void setRequiresGrad(at::ArrayRef outputs, const std::vector& values) { + JIT_ASSERT(outputs.size() == values.size()); + for (size_t i = 0; i < values.size(); ++i) { + setRequiresGrad(outputs[i], values[i]); + } +} + +void setRequiresGrad(Node * node, const std::vector& values) { + setRequiresGrad(node->outputs(), values); +} + +std::vector bitwiseOr(std::vector a, const std::vector& b) { + JIT_ASSERT(a.size() == b.size()); + for (size_t i = 0; i < a.size(); ++i) { + a[i] = a[i] || b[i]; + } + return a; +} + + +void PropagateRequiresGradSimpleNode(Node* node) { + static const OperatorSet comparison_ops = { + "aten::lt(Tensor self, Tensor other) -> Tensor", + "aten::le(Tensor self, Tensor other) -> Tensor", + "aten::gt(Tensor self, Tensor other) -> Tensor", + "aten::ge(Tensor self, Tensor other) -> Tensor", + "aten::eq(Tensor self, Tensor other) -> Tensor", + "aten::ne(Tensor self, Tensor other) -> Tensor", + "aten::lt(Tensor self, Scalar other) -> Tensor", + "aten::le(Tensor self, Scalar other) -> Tensor", + "aten::gt(Tensor self, Scalar other) -> Tensor", + "aten::ge(Tensor self, Scalar other) -> Tensor", + "aten::eq(Tensor self, Scalar other) -> Tensor", + "aten::ne(Tensor self, Scalar other) -> Tensor", + }; + + if (comparison_ops.find(node)) { + return 
setRequiresGrad(node->output(), false); + } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + return setRequiresGrad(node->output(), node->input(0)->requires_grad()); + } else if (node->matches("aten::detach(Tensor self) -> Tensor")) { + return setRequiresGrad(node->output(), false); + } + + auto inputs = node->inputs(); + auto outputs = node->outputs(); + bool should_require = std::any_of(inputs.begin(), inputs.end(), getRequiresGrad); + for (size_t i = 0; i < outputs.size(); ++i) { + if (auto type = outputs[i]->type()->cast()) { + setRequiresGrad(outputs[i], should_require && at::isFloatingType(type->scalarType())); + } + } +} + +void PropagateRequiresGrad(Block * block); + +void PropagateRequiresGrad(Node * node) { + if (node->kind() == prim::If) { + auto blocks = node->blocks(); + auto true_block = blocks.at(0); + auto false_block = blocks.at(1); + + PropagateRequiresGrad(true_block); + PropagateRequiresGrad(false_block); + + auto outputs_require = + bitwiseOr(fmap(true_block->outputs(), getRequiresGrad), + fmap(false_block->outputs(), getRequiresGrad)); + setRequiresGrad(node, outputs_require); + } else if (node->kind() == prim::Loop) { + auto body = node->blocks().at(0); + std::vector body_inputs_require = fmap(node->inputs().slice(2), getRequiresGrad); + std::vector body_outputs_require (node->outputs().size(), false); + + while (body_inputs_require != body_outputs_require) { + body_inputs_require = bitwiseOr(body_inputs_require, body_outputs_require); + setRequiresGrad(body->param_node()->outputs().slice(1), body_inputs_require); + PropagateRequiresGrad(body); + body_outputs_require = fmap(body->return_node()->inputs().slice(1), getRequiresGrad); + } + + setRequiresGrad(node, body_outputs_require); + } else { + PropagateRequiresGradSimpleNode(node); + } +} + +void PropagateRequiresGrad(Block * block) { + for (Node * node : block->nodes()) { + PropagateRequiresGrad(node); + } +} + +} // anonymous namespace + +void PropagateRequiresGrad(std::shared_ptr& graph, const ArgumentSpec & spec) { + auto inputs = graph->inputs(); + JIT_ASSERT(spec.size() == inputs.size()); + for (size_t i = 0; i < spec.size(); ++i) { + auto & arg = spec.at(i); + if (!arg.isTensor()) continue; + setRequiresGrad(inputs[i], arg.requires_grad()); + } + PropagateRequiresGrad(graph->block()); +} + +}} // namespace torch::jit diff --git a/torch/csrc/jit/passes/requires_grad_analysis.h b/torch/csrc/jit/passes/requires_grad_analysis.h new file mode 100644 index 00000000000000..37e53461ef4b47 --- /dev/null +++ b/torch/csrc/jit/passes/requires_grad_analysis.h @@ -0,0 +1,15 @@ +#pragma once + +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include + +namespace torch { namespace jit { + +struct Graph; +struct ArgumentSpec; + +TORCH_API void PropagateRequiresGrad(std::shared_ptr& graph, const ArgumentSpec & spec); + +}} + diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 4ffd16ff24c95a..7a652970243f34 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -26,6 +26,7 @@ using Catch::StartsWith; #include "torch/csrc/utils/hash.h" #include "torch/csrc/jit/argument_spec.h" #include "torch/csrc/jit/passes/shape_analysis.h" +#include "torch/csrc/jit/passes/requires_grad_analysis.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/passes/lower_grad_of.h" #include "torch/csrc/jit/operator.h" @@ -34,7 +35,6 @@ using Catch::StartsWith; #include "torch/csrc/autograd/variable.h" #include "torch/csrc/autograd/engine.h" 
-#include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/script/compiler.h" @@ -618,7 +618,7 @@ void testADFormulas() { // Trace and differentiate the op auto graph = trace(test, vars_in); EliminateDeadCode(graph); // Tracing of some ops depends on the DCE trick - auto grad_spec = differentiate(graph, std::vector(vars_in.size(), true)); + auto grad_spec = differentiate(graph); LowerGradOf(*grad_spec.df); // Get outputs from the interpreter auto tensors_in = fmap(vars_in, unwrap); @@ -651,7 +651,7 @@ void testDifferentiate(std::ostream & out) { auto c = a * b * a + b; graph->registerOutput(c.value()); - auto grad_spec = differentiate(graph, {true, true}); + auto grad_spec = differentiate(graph); std::vector expected_captured_inputs = {0, 1}; std::vector expected_captured_outputs = {1}; std::vector expected_input_vjps = {0, 1}; @@ -668,19 +668,22 @@ void testDifferentiate(std::ostream & out) { } void testDifferentiateWithRequiresGrad(std::ostream & out) { - auto graph = std::make_shared(); - at::ScalarType s = at::ScalarType::Float; - auto type = CompleteTensorType::create(s, -1, {2, 3, 4}, {12, 4, 1}); - // Build up a fake graph - auto a = SymbolicVariable::asNewInput(*graph, type); - auto b = SymbolicVariable::asNewInput(*graph, type); + auto graph = std::make_shared(); + auto a = SymbolicVariable::asNewInput(*graph); + auto b = SymbolicVariable::asNewInput(*graph); auto d = b * b + b; auto e = (d + a) * a + b; graph->registerOutput(d.value()); graph->registerOutput(e.value()); - auto grad_spec = differentiate(graph, {true, false}); + auto a_var = autograd::make_variable(at::CPU(at::kFloat).tensor(2, 2), true); + auto b_var = autograd::make_variable(at::CPU(at::kFloat).tensor(2, 2), false); + ArgumentSpec spec (true, {a_var, b_var}); + PropagateInputShapes(*graph, spec); + PropagateRequiresGrad(graph, spec); + + auto grad_spec = differentiate(graph); std::vector expected_input_vjps = {1, 2}; // for e and %4 = (d + a) std::vector expected_output_vjps = {0}; // only a requires grad REQUIRE(grad_spec.f_real_outputs == 2); // we need one temporary %4 = (d + a) diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index e812f2e02fdee5..166dfd6e06b7d3 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -10,6 +10,7 @@ #include #include +#include namespace torch { namespace jit { @@ -51,13 +52,20 @@ struct cloneType { template struct cloneType { std::shared_ptr operator()(std::shared_ptr ptr) const { - return std::make_shared(*ptr); + auto result = std::make_shared::type>(*ptr); + // XXX: the line above will correctly slice the struct, and make its runtype + // type exactly equal to T. However, kind_ is a field of Type, so it will simply + // be copied, and we need to fix it in here to match the dynamic type. + result->kind_ = T::Kind; + return result; } }; struct TORCH_API Type : std::enable_shared_from_this { private: TypeKind kind_; + template + friend struct cloneType; protected: Type(TypeKind kind) @@ -85,6 +93,8 @@ struct TORCH_API Type : std::enable_shared_from_this { return kind_; } + virtual bool requires_grad() const { return false; } + // Dynamically cast this object to the subclass indicated by the // template variable, returning nullptr if the cast is invalid. // NOTE: if the cast succeeds, but the casted kind is not the @@ -138,6 +148,8 @@ struct TORCH_API DynamicType : public Type { return DynamicTypePtr(new DynamicType( std::forward(all)... 
)); // NOLINT(modernize-make-shared) } + bool requires_grad() const override { return true; } + bool operator==(const Type& rhs) const override { return rhs.kind() == kind(); } @@ -168,6 +180,7 @@ struct TORCH_API TensorType : public Type { at::ScalarType scalarType() const { return scalar_type_; } int device() const { return device_; } int dim() const { return dim_; } + bool requires_grad() const override { return requires_grad_; } TensorTypePtr toScalarType(at::ScalarType type){ auto t = TensorType::create(*this); @@ -179,6 +192,11 @@ struct TORCH_API TensorType : public Type { t->dim_ = new_dim; return t; } + TensorTypePtr withRequiresGrad(bool req) { + auto t = TensorType::create(*this); + t->requires_grad_ = req; + return t; + } bool operator==(const Type& rhs) const override { if (rhs.kind() != TypeKind::TensorType) @@ -201,14 +219,20 @@ struct TORCH_API TensorType : public Type { protected: TensorType(const at::Tensor& tensor, TypeKind kind=TypeKind::TensorType) - : TensorType(tensor.type().scalarType(), tensor.type().is_cuda() ? tensor.get_device() : -1, tensor.dim(), kind) {} - TensorType(at::ScalarType scalar_type, int device, int dim, TypeKind kind=TypeKind::TensorType) + : TensorType(tensor.type().scalarType(), + tensor.type().is_cuda() ? tensor.get_device() : -1, + tensor.dim(), + tensor.is_variable() && tensor.requires_grad(), + kind) {} + TensorType(at::ScalarType scalar_type, int device, int dim, bool requires_grad=true, TypeKind kind=TypeKind::TensorType) : Type(kind) , scalar_type_(scalar_type) + , requires_grad_(at::isFloatingType(scalar_type) && requires_grad) , device_(device) , dim_(dim) {} at::ScalarType scalar_type_; + bool requires_grad_; int device_; int dim_; }; @@ -292,10 +316,10 @@ struct TORCH_API CompleteTensorType : public TensorType { : TensorType(tensor, TypeKind::CompleteTensorType) , sizes_(tensor.sizes().vec()) , strides_(tensor.strides().vec()) {} - CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes) - : CompleteTensorType(scalar_type, device, sizes, CompleteTensorType::contiguousStridesOf(sizes)) {} - CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides) - : TensorType(scalar_type, device, sizes.size(), TypeKind::CompleteTensorType) + CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes, bool requires_grad=true) + : CompleteTensorType(scalar_type, device, sizes, CompleteTensorType::contiguousStridesOf(sizes), requires_grad) {} + CompleteTensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides, bool requires_grad=true) + : TensorType(scalar_type, device, sizes.size(), requires_grad, TypeKind::CompleteTensorType) , sizes_(sizes.vec()) , strides_(strides.vec()) {} @@ -349,10 +373,11 @@ struct TORCH_API ListType : public Type { static ListTypePtr ofTensors(); static ListTypePtr ofInts(); static ListTypePtr ofFloats(); + + static const TypeKind Kind = TypeKind::ListType; private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(std::move(elem)) {} - static const TypeKind Kind = TypeKind::ListType; TypePtr elem; }; @@ -401,11 +426,12 @@ struct TORCH_API TupleType : public Type { ss << "]"; return ss.str(); } + + static const TypeKind Kind = TypeKind::TupleType; private: TupleType(std::vector elements_) : Type(TypeKind::TupleType) , elements_(std::move(elements_)) {} - static const TypeKind Kind = TypeKind::TupleType; bool compare(const Type& rhs, std::function fn) const { if(rhs.kind() != kind()) From 
99c0b96f6891ddb62799466a9ab5c8068d2fe1d8 Mon Sep 17 00:00:00 2001 From: xhzhao Date: Thu, 13 Sep 2018 19:25:43 -0700 Subject: [PATCH 143/237] optimize norm on ATen CPU backend (#11565) Summary: current torch.norm() runs sequentially on CPU. This PR did parallelization and vectorization of torch.norm() on ATen CPU path, roughly provide 2 order of magnitude performance boost. Performance is benchmarks on Xeon skylake 8180, 2*28 cores 2.5GHz, using the following script: ```python import torch from time import time count = 1000 size = 1000*1000 def test_norm(p=2): a = torch.randn(size) tstart = time() for i in range(count): torch.norm(a, p) tend = time() print("norm on size %d tensor p = %d: %f s" % (size, p, (tend-tstart))) for p in range(4): test_norm(p) ``` without this optimization, ``` (intel-pytorch) [mingfeim@mlt-skx065 unit_tests]$ python test_norm.py norm on size 1000000 tensor p = 0: 1.071235 s norm on size 1000000 tensor p = 1: 1.069149 s norm on size 1000000 tensor p = 2: 1.068212 s norm on size 1000000 tensor p = 3: 69.735312 s ``` and with this optimization, ``` (pytorch-tf) [mingfeim@mlt-skx053 unit_tests]$ python test_norm.py norm on size 1000000 tensor p = 0: 0.127507 s norm on size 1000000 tensor p = 1: 0.011867 s norm on size 1000000 tensor p = 2: 0.011907 s norm on size 1000000 tensor p = 3: 0.014470 s ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11565 Differential Revision: D9804484 Pulled By: ezyang fbshipit-source-id: 52899f30ac26139d00684d07edfb47cb9b25d871 --- aten/src/ATen/native/LegacyBridge.cpp | 8 - aten/src/ATen/native/ReduceOps.cpp | 56 ++++++- aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 154 +++++++++++++++++++ aten/src/ATen/native/cpu/ReduceOpsKernel.h | 3 + 4 files changed, 208 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 1364c0cceb1847..2803112b7692bb 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -22,14 +22,6 @@ namespace { // TODO: Maybe the foo_ variants should call th_foo_ -Tensor norm(const Tensor & self, Scalar p) { - if (_has_native(self)) { - return native_norm(self, p); - } else { - return th_norm(self, p); - } -} - Tensor clone(const Tensor& self) { if (_has_native(self)) { return native_clone(self); diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 2c7e641dcbe843..c976121e77ae3f 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -19,6 +19,7 @@ namespace native { DEFINE_DISPATCH(sum_kernel); DEFINE_DISPATCH(prod_kernel); +DEFINE_DISPATCH(norm_kernel); static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); @@ -584,12 +585,23 @@ Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim) return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim); } -Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); - return at::native::norm_out(result, self, p, dim, keepdim); +Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + norm_kernel(kCPU, result, self, p, dim); + if (!keepdim) { + 
result.squeeze_(dim); + } + return result; + } else { + return at::_th_norm_out(result, self, p, dim, keepdim); + } } -Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { +Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); @@ -597,10 +609,44 @@ Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; } else { - return at::_th_norm_out(result, self, p, dim, keepdim); + if (self.is_cuda()) { + return at::_th_norm_out(result, self, p, dim, keepdim); + } else { + return _norm_out_cpu(result, self, p, dim, keepdim); + } + } +} + +Tensor _norm(const Tensor &self, Scalar p) { + if (self.type().is_sparse()) { + return at::native_norm(self, p); + } else { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); + if (self.is_cuda()) { + return at::th_norm(self, p); + } else { + if (self.is_contiguous()) { + Tensor result = CPU(kFloat).scalarTensor(0).toType(self.type()); + norm_kernel(kCPU, result, self, p, nullopt); + return result; + } else { + return at::th_norm(self, p); + } + } } } +Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::norm_out(result, self, p, dim, keepdim); +} + +Tensor norm(const Tensor& self, Scalar p) { + return at::native::_norm(self, p); +} + Tensor all(const Tensor& self, int64_t dim, bool keepdim) { Tensor result = self.type().tensor(); return at::native::all_out(result, self, dim, keepdim); diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index feea350fd08306..3be4ed4e06c28d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -183,9 +183,163 @@ static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional +struct NormReduction { + // reduction width in number of scalar elements + static constexpr int WIDTH = 128 / sizeof(scalar_t); + using Vec = Vec256; + + static void apply(Tensor& res, const Tensor& self, Scalar p, at::optional dim) { + auto out_ = res.data(); + auto data_ = self.data(); + auto numel = self.numel(); + float pval = 0.0; + if (p.isIntegral()){ + pval = p.to(); + } else if (p.isFloatingPoint()) { + pval = p.to(); + } + if (!dim.has_value()) { + *out_ = reduce_all(data_, numel, pval); + return; + } + int64_t n = self.size(*dim); + int64_t stride = self.stride(*dim); + // A contiguous tensor does not need to hold a meaningful stride + // if the corresponding size is 1 + if (n == 1) { + stride = 1; + for (int64_t i = self.ndimension() - 1; i > *dim; i--) { + stride *= self.size(i); + } + } + int64_t batch = numel / n; + parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { + for (int64_t bi = begin; bi < end; bi++) { + int64_t b = bi / stride; + int64_t i = bi % stride; + const scalar_t* data = &data_[b * n * stride + i]; + out_[bi] = norm_reduce(data, n, stride, 
pval); + } + }); + } + + static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) { + scalar_t sum = parallel_reduce( + 0, + size, + internal::GRAIN_SIZE, + (scalar_t)0, + [=](int64_t begin, int64_t end, scalar_t init) { + const scalar_t* data = &data_[begin]; + int64_t n = end - begin; + scalar_t result = norm_reduce(data, n, 1, pval); + return result; + }, + std::plus()); + return sum; + } + + static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) { + int64_t n_rounded = round_down(n, WIDTH); + scalar_t result1 = norm_reduce128(data, n_rounded, pval); + scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval); + result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval); + } else { + result = norm_reduce_sequential(data, n, stride, pval); + } + return result; + } + + static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (pval == 0) { + for (int64_t k = 0; k < n; k++) { + result += (data[k * stride] != 0.0); + } + } else if (pval == 1) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride]); + } + } else if (pval == 2) { + for (int64_t k = 0; k < n; k++) { + result += data[k * stride] * data[k * stride]; + } + result = std::sqrt(result); + } else if (pval == 3) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]); + } + result = std::pow(result, 1.0/3); + } else if (std::isinf(pval)) { + for (int64_t k = 0; k < n; k++) { + result = std::abs(data[k * stride]) > result ? std::abs(data[k * stride]) : result; + } + result = result; + } else { + for (int64_t k = 0; k < n; k++) { + result += std::pow(std::abs(data[k * stride]), pval); + } + result = std::pow(result, 1.0/pval); + } + return result; + } + + // Reduce down a column of WIDTH elements (128 bytes) with the given number n + // n is already rounded by 128 + static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) { + scalar_t result = 0.0; + Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines) + static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); + int64_t rows = n / WIDTH; + if (pval == 1){ + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val.abs(); + } + } + } + else if (pval == 2) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val * val; + } + } + } + else if (pval == 3) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + (val * val * val).abs(); + } + } + } + scalar_t buf[WIDTH] = {0}; + for (int j = 0; j != 4; j++) { + acc[j].store(&buf[j * Vec::size]); + } + for (int i = 0; i < WIDTH; i++) { + result += buf[i]; + } + result = std::pow(result, 1.0/pval); + return result; + } +}; + +static void norm_kernel_impl(Tensor& result, const Tensor& self, Scalar p, at::optional dim) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] { + NormReduction::apply(result, self, p, dim); + }); +} + } // anonymous namespace REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); REGISTER_DISPATCH(prod_kernel, 
&prod_kernel_impl);
+REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl);
 
 }} // namespace at::native
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h
index 4c5c8c15149a1b..5fc7c60ff2803d 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h
@@ -11,4 +11,7 @@ using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional<int64_t>);
 DECLARE_DISPATCH(reduce_fn, sum_kernel);
 DECLARE_DISPATCH(reduce_fn, prod_kernel);
 
+using reduce_norm_fn = void(*)(Tensor &, const Tensor &, Scalar, at::optional<int64_t>);
+DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);
+
 }} // namespace at::native

From 2431eac7c011afe42d4c22b8b3f46dedae65e7c0 Mon Sep 17 00:00:00 2001
From: Fritz Obermeyer
Date: Thu, 13 Sep 2018 19:45:02 -0700
Subject: [PATCH 144/237] Ensure most Distribution methods are jittable (#11560)

Summary:
This adds tests in test/test_distributions.py to ensure that all methods of
`Distribution` objects are jittable.

I've replaced a few samplers with jittable versions (sketched below):
- `.uniform_()` -> `torch.rand()`
- `.exponential_()` -> `-(-torch.rand()).log1p()`
- `.normal_()` -> `torch.normal(torch.zeros(...), torch.ones(...), ...)`

Some jit failures remain and are marked in test_distributions.py:
- `Cauchy` and `HalfCauchy` do not support sampling due to missing `.cauchy_()`
- `Binomial` does not support `.enumerate_support()` due to `arange` ignoring its first arg.
- `MultivariateNormal`, `LowRankMultivariateNormal` do not support `.mean`, `.entropy`

- [x] Currently some tests fail (I've skipped those) due to unavailability of `aten::uniform` and `aten::cauchy` in the jit. Can someone suggest how to add these? I tried to add declarations to `torch/csrc/ir.cpp` and `torch/csrc/passes/shape_analysis.cpp`, but that resulted in "Couldn't find operator" errors.
- [x] There are still lots of `TracerWarning`s that something doesn't match something. I'm not sure whether these are real.
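A rough sketch of the three sampler rewrites listed above (illustrative only, not part of this patch; `shape`, `dtype` and `device` stand in for a distribution's own attributes):

```python
import torch

shape, dtype, device = (3,), torch.float32, torch.device("cpu")

# .uniform_() replacement: out-of-place U(0, 1) draw that the tracer can record
u = torch.rand(shape, dtype=dtype, device=device)

# .exponential_() replacement: inverse-CDF transform of a uniform draw;
# log1p(-u) = log(1 - u) keeps precision when u is close to 0
e = -(-u).log1p()

# .normal_() replacement: out-of-place standard normal draw
n = torch.normal(torch.zeros(shape, dtype=dtype, device=device),
                 torch.ones(shape, dtype=dtype, device=device))
```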
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11560 Differential Revision: D9816327 Pulled By: apaszke fbshipit-source-id: 72ec998ea13fc4c76d1ed003d9502e0fbaf728b8 --- test/test_distributions.py | 288 +++++++++++++++++- torch/distributions/binomial.py | 6 +- torch/distributions/constraint_registry.py | 2 + torch/distributions/distribution.py | 4 +- torch/distributions/exponential.py | 4 + torch/distributions/geometric.py | 7 +- torch/distributions/laplace.py | 4 + .../lowrank_multivariate_normal.py | 7 +- torch/distributions/multivariate_normal.py | 4 +- torch/distributions/normal.py | 4 +- torch/distributions/one_hot_categorical.py | 8 +- torch/distributions/relaxed_bernoulli.py | 2 +- torch/distributions/relaxed_categorical.py | 6 +- torch/distributions/studentT.py | 10 +- torch/distributions/uniform.py | 2 +- torch/distributions/utils.py | 8 + 16 files changed, 343 insertions(+), 23 deletions(-) diff --git a/test/test_distributions.py b/test/test_distributions.py index 60bcec1605b6e5..61d5e57d910b54 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -647,7 +647,7 @@ def _gradcheck_log_prob(self, dist_ctor, ctor_params): distribution = dist_ctor(*ctor_params) s = distribution.sample() if s.is_floating_point(): - s.detach_().requires_grad_() + s = s.detach().requires_grad_() expected_shape = distribution.batch_shape + distribution.event_shape self.assertEqual(s.size(), expected_shape) @@ -3971,6 +3971,55 @@ def test_transformed_distribution_shapes(self): except NotImplementedError: continue + def test_jit_fwd(self): + for transform in self.unique_transforms: + x = torch.tensor(self._generate_data(transform), requires_grad=True) + + def f(x): + return transform(x) + + try: + traced_f = torch.jit.trace(f, (x,)) + except NotImplementedError: + continue + + # check on different inputs + x = torch.tensor(self._generate_data(transform), requires_grad=True) + self.assertEqual(f(x), traced_f(x)) + + def test_jit_inv(self): + for transform in self.unique_transforms: + y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + + def f(y): + return transform.inv(y) + + try: + traced_f = torch.jit.trace(f, (y,)) + except NotImplementedError: + continue + + # check on different inputs + y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + self.assertEqual(f(y), traced_f(y)) + + def test_jit_jacobian(self): + for transform in self.unique_transforms: + x = torch.tensor(self._generate_data(transform), requires_grad=True) + + def f(x): + y = transform(x) + return transform.log_abs_det_jacobian(x, y) + + try: + traced_f = torch.jit.trace(f, (x,)) + except NotImplementedError: + continue + + # check on different inputs + x = torch.tensor(self._generate_data(transform), requires_grad=True) + self.assertEqual(f(x), traced_f(x)) + class TestConstraintRegistry(TestCase): def get_constraints(self, is_cuda=False): @@ -4092,5 +4141,242 @@ def tearDown(self): super(TestCase, self).tearDown() Distribution.set_default_validate_args(False) + +class TestJit(TestCase): + def _examples(self): + for Dist, params in EXAMPLES: + for param in params: + keys = param.keys() + values = tuple(param[key] for key in keys) + if not all(isinstance(x, torch.Tensor) for x in values): + continue + sample = Dist(**param).sample() + yield Dist, keys, values, sample + + def _perturb_tensor(self, value, constraint): + if isinstance(constraint, constraints._IntegerGreaterThan): + return value + 1 + if isinstance(constraint, constraints._PositiveDefinite): + 
return value + torch.eye(value.shape[-1]) + if value.dtype in [torch.float, torch.double]: + transform = transform_to(constraint) + delta = value.new(value.shape).normal_() + return transform(transform.inv(value) + delta) + if value.dtype == torch.long: + result = value.clone() + result[value == 0] = 1 + result[value == 1] = 0 + return result + raise NotImplementedError + + def _perturb(self, Dist, keys, values, sample): + with torch.no_grad(): + if Dist is Uniform: + param = dict(zip(keys, values)) + param['low'] = param['low'] - torch.rand(param['low'].shape) + param['high'] = param['high'] + torch.rand(param['high'].shape) + values = [param[key] for key in keys] + else: + values = [self._perturb_tensor(value, Dist.arg_constraints.get(key, constraints.real)) + for key, value in zip(keys, values)] + param = dict(zip(keys, values)) + sample = Dist(**param).sample() + return values, sample + + def test_sample(self): + for Dist, keys, values, sample in self._examples(): + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.sample() + + traced_f = torch.jit.trace(f, values, check_trace=False) + + # FIXME Schema not found for node + xfail = [ + Cauchy, # aten::cauchy(Double(2,1), float, float, Generator) + HalfCauchy, # aten::cauchy(Double(2, 1), float, float, Generator) + ] + if Dist in xfail: + continue + + with torch.random.fork_rng(): + sample = f(*values) + traced_sample = traced_f(*values) + self.assertEqual(sample, traced_sample) + + # FIXME no nondeterministic nodes found in trace + xfail = [Beta, Dirichlet] + if Dist not in xfail: + self.assertTrue(any(n.isNondeterministic() for n in traced_f.graph.nodes())) + + def test_rsample(self): + for Dist, keys, values, sample in self._examples(): + if not Dist.has_rsample: + continue + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.rsample() + + traced_f = torch.jit.trace(f, values, check_trace=False) + + # FIXME Schema not found for node + xfail = [ + Cauchy, # aten::cauchy(Double(2,1), float, float, Generator) + HalfCauchy, # aten::cauchy(Double(2, 1), float, float, Generator) + ] + if Dist in xfail: + continue + + with torch.random.fork_rng(): + sample = f(*values) + traced_sample = traced_f(*values) + self.assertEqual(sample, traced_sample) + + # FIXME no nondeterministic nodes found in trace + xfail = [Beta, Dirichlet] + if Dist not in xfail: + self.assertTrue(any(n.isNondeterministic() for n in traced_f.graph.nodes())) + + def test_log_prob(self): + for Dist, keys, values, sample in self._examples(): + # FIXME traced functions produce incorrect results + xfail = [LowRankMultivariateNormal, MultivariateNormal] + if Dist in xfail: + continue + + def f(sample, *values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.log_prob(sample) + + traced_f = torch.jit.trace(f, (sample,) + values) + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(sample, *values) + actual = traced_f(sample, *values) + self.assertEqual(expected, actual, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_enumerate_support(self): + for Dist, keys, values, sample in self._examples(): + # FIXME traced functions produce incorrect results + xfail = [Binomial] + if Dist in xfail: + continue + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.enumerate_support() + + try: + traced_f = torch.jit.trace(f, values) + except 
NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + self.assertEqual(expected, actual, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_mean(self): + for Dist, keys, values, sample in self._examples(): + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.mean + + try: + traced_f = torch.jit.trace(f, values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + expected[expected == float('inf')] = 0. + actual[actual == float('inf')] = 0. + self.assertEqual(expected, actual, allow_inf=True, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_variance(self): + for Dist, keys, values, sample in self._examples(): + if Dist in [Cauchy, HalfCauchy]: + continue # infinite variance + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.variance + + try: + traced_f = torch.jit.trace(f, values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + expected[expected == float('inf')] = 0. + actual[actual == float('inf')] = 0. + self.assertEqual(expected, actual, allow_inf=True, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_entropy(self): + for Dist, keys, values, sample in self._examples(): + # FIXME traced functions produce incorrect results + xfail = [LowRankMultivariateNormal, MultivariateNormal] + if Dist in xfail: + continue + + def f(*values): + param = dict(zip(keys, values)) + dist = Dist(**param) + return dist.entropy() + + try: + traced_f = torch.jit.trace(f, values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(*values) + actual = traced_f(*values) + self.assertEqual(expected, actual, allow_inf=True, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + def test_cdf(self): + for Dist, keys, values, sample in self._examples(): + + def f(sample, *values): + param = dict(zip(keys, values)) + dist = Dist(**param) + cdf = dist.cdf(sample) + return dist.icdf(cdf) + + try: + traced_f = torch.jit.trace(f, (sample,) + values) + except NotImplementedError: + continue + + # check on different data + values, sample = self._perturb(Dist, keys, values, sample) + expected = f(sample, *values) + actual = traced_f(sample, *values) + self.assertEqual(expected, actual, + message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual)) + + if __name__ == '__main__': run_tests() diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index 5e168ffe1d7a2f..e1763c1c6ecbfd 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -100,7 +100,11 @@ def sample(self, sample_shape=torch.Size()): if self.total_count.min() != max_count: arange = torch.arange(max_count, dtype=self._param.dtype, device=self._param.device) mask = arange >= self.total_count.unsqueeze(-1) - bernoullis.masked_fill_(mask, 0.) 
+ if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .masked_fill_() + bernoullis[mask.expand(shape)] = 0. + else: + bernoullis.masked_fill_(mask, 0.) return bernoullis.sum(dim=-1) def log_prob(self, value): diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index f8688af3f3a392..0981a3ce85b3b6 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -152,7 +152,9 @@ def __call__(self, constraint): ################################################################################ @biject_to.register(constraints.real) +@biject_to.register(constraints.real_vector) @transform_to.register(constraints.real) +@transform_to.register(constraints.real_vector) def _transform_to_real(constraint): return transforms.identity_transform diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index 8d357a4e9371a0..726158b8694432 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -215,7 +215,9 @@ def _extended_shape(self, sample_shape=torch.Size()): Args: sample_shape (torch.Size): the size of the sample to be drawn. """ - return torch.Size(sample_shape + self._batch_shape + self._event_shape) + if not isinstance(sample_shape, torch.Size): + sample_shape = torch.Size(sample_shape) + return sample_shape + self._batch_shape + self._event_shape def _validate_sample(self, value): """ diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index a2fde7b382593c..41d7cd9f9787a5 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -51,6 +51,10 @@ def expand(self, batch_shape, _instance=None): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for ._exponential() + u = torch.rand(shape, dtype=self.rate.dtype, device=self.rate.device) + return -(-u).log1p() / self.rate return self.rate.new(shape).exponential_() / self.rate def log_prob(self, value): diff --git a/torch/distributions/geometric.py b/torch/distributions/geometric.py index a84963664b4cf3..7b9e796b0a9ac2 100644 --- a/torch/distributions/geometric.py +++ b/torch/distributions/geometric.py @@ -75,7 +75,12 @@ def probs(self): def sample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) with torch.no_grad(): - u = self.probs.new(shape).uniform_(_finfo(self.probs).tiny, 1) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .uniform_() + u = torch.rand(shape, dtype=self.probs.dtype, device=self.probs.device) + u = u.clamp(min=_finfo(self.probs).tiny) + else: + u = self.probs.new(shape).uniform_(_finfo(self.probs).tiny, 1) return (u.log() / (-self.probs).log1p()).floor() def log_prob(self, value): diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index cc5b079fbdbdcb..c15a35ce7c7db1 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -54,6 +54,10 @@ def expand(self, batch_shape, _instance=None): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .uniform_() + u = torch.rand(shape, dtype=self.loc.dtype, device=self.loc.device) * 2 - 1 + return self.loc - self.scale * u.sign() * torch.log1p(-u.abs().clamp(min=_finfo(self.loc).tiny)) u = 
self.loc.new(shape).uniform_(_finfo(self.loc).eps - 1, 1) # TODO: If we ever implement tensor.nextafter, below is what we want ideally. # u = self.loc.new(shape).uniform_(self.loc.nextafter(-.5, 0), .5) diff --git a/torch/distributions/lowrank_multivariate_normal.py b/torch/distributions/lowrank_multivariate_normal.py index 5e6239a69d0d37..4cc3d986c7a6c1 100644 --- a/torch/distributions/lowrank_multivariate_normal.py +++ b/torch/distributions/lowrank_multivariate_normal.py @@ -5,7 +5,7 @@ from torch.distributions.distribution import Distribution from torch.distributions.multivariate_normal import (_batch_diag, _batch_mahalanobis, _batch_mv, _batch_potrf_lower, _batch_trtrs_lower) -from torch.distributions.utils import lazy_property +from torch.distributions.utils import _standard_normal, lazy_property def _batch_vector_diag(bvec): @@ -169,8 +169,9 @@ def precision_matrix(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - eps_W = self.loc.new_empty(shape[:-1] + (self.cov_factor.size(-1),)).normal_() - eps_D = self.loc.new_empty(shape).normal_() + W_shape = shape[:-1] + self.cov_factor.shape[-1:] + eps_W = _standard_normal(W_shape, dtype=self.loc.dtype, device=self.loc.device) + eps_D = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) return self.loc + _batch_mv(self.cov_factor, eps_W) + self.cov_diag.sqrt() * eps_D def log_prob(self, value): diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 855c727dc9b951..014a07e53c9532 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -3,7 +3,7 @@ import torch from torch.distributions import constraints from torch.distributions.distribution import Distribution -from torch.distributions.utils import lazy_property +from torch.distributions.utils import _standard_normal, lazy_property def _batch_mv(bmat, bvec): @@ -194,7 +194,7 @@ def variance(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - eps = self.loc.new_empty(shape).normal_() + eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) return self.loc + _batch_mv(self._unbroadcasted_scale_tril, eps) def log_prob(self, value): diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 98a5318d7b9829..a125806108e8cd 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -4,7 +4,7 @@ import torch from torch.distributions import constraints from torch.distributions.exp_family import ExponentialFamily -from torch.distributions.utils import broadcast_all +from torch.distributions.utils import _standard_normal, broadcast_all class Normal(ExponentialFamily): @@ -64,7 +64,7 @@ def sample(self, sample_shape=torch.Size()): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - eps = self.loc.new(shape).normal_() + eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) return self.loc + eps * self.scale def log_prob(self, value): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index dc37b4c7fabb8e..5165ed6768bd89 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -76,11 +76,15 @@ def param_shape(self): def sample(self, sample_shape=torch.Size()): sample_shape = torch.Size(sample_shape) probs = self._categorical.probs - one_hot = 
probs.new(self._extended_shape(sample_shape)).zero_() indices = self._categorical.sample(sample_shape) + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .scatter_() + eye = torch.eye(self.event_shape[-1], dtype=self._param.dtype, device=self._param.device) + return eye[indices] + one_hot = probs.new_zeros(self._extended_shape(sample_shape)) if indices.dim() < one_hot.dim(): indices = indices.unsqueeze(-1) - return one_hot.scatter_(-1, indices, 1) + return one_hot.scatter_(-1, indices, 1.) def log_prob(self, value): if self._validate_args: diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py index 643fddc2c96b68..f885bfea2bc422 100644 --- a/torch/distributions/relaxed_bernoulli.py +++ b/torch/distributions/relaxed_bernoulli.py @@ -78,7 +78,7 @@ def param_shape(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) probs = clamp_probs(self.probs.expand(shape)) - uniforms = clamp_probs(self.probs.new(shape).uniform_()) + uniforms = clamp_probs(torch.rand(shape, dtype=probs.dtype, device=probs.device)) return (uniforms.log() - (-uniforms).log1p() + probs.log() - (-probs).log1p()) / self.temperature def log_prob(self, value): diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py index a43d631bec9b2b..59f5b5998a12d3 100644 --- a/torch/distributions/relaxed_categorical.py +++ b/torch/distributions/relaxed_categorical.py @@ -66,8 +66,8 @@ def probs(self): return self._categorical.probs def rsample(self, sample_shape=torch.Size()): - sample_shape = torch.Size(sample_shape) - uniforms = clamp_probs(self.logits.new(self._extended_shape(sample_shape)).uniform_()) + shape = self._extended_shape(sample_shape) + uniforms = clamp_probs(torch.rand(shape, dtype=self.logits.dtype, device=self.logits.device)) gumbels = -((-(uniforms.log())).log()) scores = (self.logits + gumbels) / self.temperature return scores - scores.logsumexp(dim=-1, keepdim=True) @@ -77,7 +77,7 @@ def log_prob(self, value): if self._validate_args: self._validate_sample(value) logits, value = broadcast_all(self.logits, value) - log_scale = (self.temperature.new(self.temperature.shape).fill_(K).lgamma() - + log_scale = (self.temperature.new_tensor(float(K)).lgamma() - self.temperature.log().mul(-(K - 1))) score = logits - value.mul(self.temperature) score = (score - score.logsumexp(dim=-1, keepdim=True)).sum(-1) diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index cefa4684298261..6530940b328e7f 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -1,11 +1,11 @@ +import math from numbers import Number + import torch from torch._six import inf, nan -import math -from torch.distributions import constraints +from torch.distributions import Chi2, constraints from torch.distributions.distribution import Distribution -from torch.distributions import Chi2 -from torch.distributions.utils import broadcast_all +from torch.distributions.utils import _standard_normal, broadcast_all class StudentT(Distribution): @@ -65,7 +65,7 @@ def rsample(self, sample_shape=torch.Size()): # Z ~ Chi2(df) # Y = X / sqrt(Z / df) ~ StudentT(df) shape = self._extended_shape(sample_shape) - X = self.df.new(shape).normal_() + X = _standard_normal(shape, dtype=self.df.dtype, device=self.df.device) Z = self._chi2.rsample(sample_shape) Y = X * torch.rsqrt(Z / self.df) return self.loc + self.scale * Y diff --git a/torch/distributions/uniform.py 
b/torch/distributions/uniform.py index ce9cf496656346..071f086d782673 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -64,7 +64,7 @@ def support(self): def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) - rand = self.low.new(shape).uniform_() + rand = torch.rand(shape, dtype=self.low.dtype, device=self.low.device) return self.low + rand * (self.high - self.low) def log_prob(self, value): diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 0219942aac155a..1fcc7278e3f481 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -65,6 +65,14 @@ def broadcast_all(*values): return torch.broadcast_tensors(*values) +def _standard_normal(shape, dtype, device): + if torch._C._get_tracing_state(): + # [JIT WORKAROUND] lack of support for .normal_() + return torch.normal(torch.zeros(shape, dtype=dtype, device=device), + torch.ones(shape, dtype=dtype, device=device)) + return torch.empty(shape, dtype=dtype, device=device).normal_() + + def _sum_rightmost(value, dim): r""" Sum out ``dim`` many rightmost dimensions of a given tensor. From e6fe8d9cf55c37fdbf1b1aa50eb6a2b448b6d3ee Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 13 Sep 2018 20:10:21 -0700 Subject: [PATCH 145/237] Try to delete codeowners for ATen/core (#10693) Summary: Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/10693 Reviewed By: soumith Differential Revision: D9772210 Pulled By: ezyang fbshipit-source-id: 14560eaf77441980e9784536acd0ffe20b15c5b8 --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 113be035c9b99f..5723d6ebe5c058 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ # Each line is a file pattern followed by one or more owners. /aten/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang +/aten/src/ATen/core/ /torch/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang /docs/source @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ssnl @zou3519 /docs/cpp @goldsborough @ebetica @apaszke @soumith @colesbury @gchanan @zdevito @ezyang From 1637729620599706a8647f32fd375f623c4b7d9a Mon Sep 17 00:00:00 2001 From: zrphercule Date: Thu, 13 Sep 2018 20:15:55 -0700 Subject: [PATCH 146/237] Fix ci by skipping some tests (#11668) Summary: scalar_tensor_test skipped Pull Request resolved: https://github.com/pytorch/pytorch/pull/11668 Differential Revision: D9825819 Pulled By: zrphercule fbshipit-source-id: 6e62a001bcde49be8f7af1501b303bd93d09d005 --- .jenkins/caffe2/test.sh | 9 ++++++++- .jenkins/pytorch/test.sh | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 6ca0850f940968..ac29672c8e2ced 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -62,7 +62,14 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do */mkl_utils_test|*/aten/integer_divider_test) continue ;; - *) + */scalar_tensor_test|*/basic|*/native_test) + if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + continue + else + "$test" + fi + ;; + *) # Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). 
While # planning to migrate to gtest as the common PyTorch c++ test suite, we # currently do NOT use the xml test reporter, because Catch doesn't diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index bc22fa7a36070a..471fd8fac1fc6e 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -90,6 +90,8 @@ test_python_all_except_nn() { test_aten() { # Test ATen + # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: + # scalar_tensor_test, basic, native_test if ([[ "$BUILD_ENVIRONMENT" != *asan* ]] && [[ "$BUILD_ENVIRONMENT" != *rocm* ]]); then echo "Running ATen tests with pytorch lib" TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib From c5f7da3f4ae289bf110b5b0b6605d0c51e0bc86a Mon Sep 17 00:00:00 2001 From: Jiyan Yang Date: Fri, 14 Sep 2018 02:36:26 -0700 Subject: [PATCH 147/237] Support FP16 sparse lookup (#11674) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11674 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11658 Reviewed By: hyuen Differential Revision: D9676950 fbshipit-source-id: 89a115b9664b84e4e4436b7da033e5a428c2246d --- caffe2/python/layers/sparse_lookup.py | 33 ++++++++++++++++++++++++--- caffe2/python/optimizer.py | 31 +++++++++++++++++++------ 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index d96739b66c865b..4c3661b284dc3c 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -28,6 +28,12 @@ def get_sparse_lookup_predictor_version(version): return version +def get_sparse_lookup_trainer_version(version): + assert version in {'fp32', 'fp16'},\ + "Unexpected version of sparse_lookup layer {0}".format(version) + return version + + def _is_id_list(input_record): return schema.equal_schemas(input_record, IdList) @@ -72,10 +78,12 @@ def __init__(self, model, input_record, inner_shape, reducer, "{} should have categorical limit > 0, but got {}".format( get_key(input_record)(), input_dim)) - scale = math.sqrt(1.0 / input_dim) + self.input_dim = input_dim self.shape = [input_dim] + inner_shape - self.weight_init = weight_init if weight_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) + + default_init_op = self._get_default_init_op() + + self.weight_init = weight_init or default_init_op if _is_id_list(self.input_record): sparse_key = self.input_record.items() @@ -141,6 +149,25 @@ def get_8bits_compatible_parameters(self, fused=True): ) return [RowwiseQuantized8BitsWeight(self.w, self.scale_bias)] + def _get_default_init_op(self): + scale = math.sqrt(1.0 / self.input_dim) + + cur_scope = get_current_scope() + trainer_version = get_sparse_lookup_trainer_version( + **cur_scope.get(get_sparse_lookup_trainer_version.__name__, + {'version': 'fp32'})) + + if trainer_version == 'fp32': + default_weight_init = ('UniformFill', {'min': -scale, 'max': scale}) + elif trainer_version == 'fp16': + default_weight_init = ("Float16UniformFill", {'min': -scale, 'max': scale}) + else: + raise NotImplementedError( + "Train version {} is not currently supported".format(trainer_version) + ) + + return default_weight_init + def _gather_wrapper(self, net, version, in_indices, out): # Gather can work on all kinds of input data types, and output # data with the same type. 
Convert the output of Gather to float, diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 2ebcf1d92a1240..a48502c68a3cd6 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -8,6 +8,8 @@ from collections import namedtuple, defaultdict from past.builtins import basestring +import logging + import numpy as np from caffe2.python import core, scope, utils, workspace @@ -20,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +logger = logging.getLogger(__name__) + class Optimizer(object): def __init__(self): @@ -554,6 +558,8 @@ def _run(self, net, param_init_net, param_info): ) if self.rowWise: + assert self.engine == "SIMD", "Got {}".format(self.engine) + shapes, types = workspace.InferShapesAndTypes([param_init_net]) if str(param) not in shapes: # Type/shape inference is not available for this param, fallback @@ -577,13 +583,24 @@ def _run(self, net, param_init_net, param_info): shape=[shapes[str(param)][0]], value=0.0 ) - else: - param_squared_sum = param_init_net.ConstantFill( - [param], - str(param) + "_squared_sum", - value=0.0 - ) + if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + shapes, types = workspace.InferShapesAndTypes([param_init_net]) + assert str(param) in shapes, shapes + shape = shapes[str(param)] + + param_squared_sum = param_init_net.Float16ConstantFill( + [], + str(param) + "_squared_sum", + value=0.0, + shape=shape, + ) + else: + param_squared_sum = param_init_net.ConstantFill( + [param], + str(param) + "_squared_sum", + value=0.0 + ) self._aux_params.local.append(param_squared_sum) @@ -604,7 +621,7 @@ def _run(self, net, param_init_net, param_info): [param, param_squared_sum, grad.indices, grad.values, lr], [param, param_squared_sum], epsilon=self.epsilon, - engine=self.engine + engine=self.engine, ) else: output_args = [param, param_squared_sum] From 19065f91fc71116ab639b27fb1d25026d4d23191 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 07:34:06 -0700 Subject: [PATCH 148/237] Centralize TypeExtendedInterface casts. (#11576) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11576 Previously, they were spattered throughout the codebase. We now follow this convention: - LegacyTypeDispatch gives you Type - Context gives you TypeExtendedInterface - Tensor::type() gives you Type - at::getType() gives you TypeExtendedInterface I change some sites to use getType() over type(). 
Reviewed By: SsnL Differential Revision: D9790187 fbshipit-source-id: 5e2577cb590a5bbf5df530f3763d3b3c0b4625ca --- aten/src/ATen/Context.cpp | 9 ++++-- aten/src/ATen/Context.h | 36 +++++++++++++---------- aten/src/ATen/function_wrapper.py | 2 +- aten/src/ATen/native/LegacyBridge.cpp | 16 +++++----- aten/src/ATen/native/TensorFactories.cpp | 4 +-- aten/src/ATen/templates/Functions.h | 7 ++--- caffe2/contrib/aten/aten_op_template.h | 2 +- caffe2/contrib/aten/gen_op.py | 2 +- tools/autograd/templates/VariableType.cpp | 8 ++--- tools/autograd/templates/VariableType.h | 4 +-- 10 files changed, 48 insertions(+), 42 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 40cb364e91fd0c..cb96814927130c 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -12,6 +12,7 @@ #include "ATen/CPUGenerator.h" #include "ATen/RegisterCPU.h" +#include "ATen/Tensor.h" #include "TH/TH.h" // for USE_LAPACK @@ -107,17 +108,21 @@ bool Context::setFlushDenormal(bool on) { #endif } -Type& getType(TensorOptions options) { +TypeExtendedInterface& getType(TensorOptions options) { return globalContext().getType( options.backend(), options.dtype(), options.is_variable()); } -Type& getType(const TensorImpl* impl) { +TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( backend, impl->scalar_type(), impl->is_variable()); } +TypeExtendedInterface& getType(const Tensor& t) { + return getType(t.unsafeGetTensorImpl()); +} + Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 7b3634dd83086f..4e147cffabbe86 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -5,6 +5,7 @@ #include "ATen/CUDAStream.h" #include "ATen/core/Generator.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Utils.h" #include "ATen/core/Error.h" #include "ATen/detail/CUDAHooksInterface.h" @@ -21,23 +22,25 @@ namespace at { +struct Tensor; + class AT_API Context { public: Context(); - Type* getNonVariableTypeRaw(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s); + TypeExtendedInterface* getNonVariableTypeRaw(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s)); } - Type * getNonVariableTypeOpt(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s); + TypeExtendedInterface * getNonVariableTypeOpt(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s)); } - Type & getNonVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableType(p, s); + TypeExtendedInterface & getNonVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableType(p, s)); } - Type & getVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getVariableType(p, s); + TypeExtendedInterface & getVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getVariableType(p, s)); } - Type & getType(Backend p, ScalarType s, bool is_variable) { - return globalLegacyTypeDispatch().getType(p, s, is_variable); + TypeExtendedInterface & getType(Backend p, ScalarType s, bool is_variable) { + return static_cast(globalLegacyTypeDispatch().getType(p, s, is_variable)); } // The passed in Type must be delete'able // TODO: Just make it take a 
unique_ptr @@ -142,24 +145,25 @@ static inline void init() { } } -static inline Type& getNonVariableType(Backend p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(Backend p, ScalarType s) { return globalContext().getNonVariableType(p, s); } -static inline Type& getNonVariableType(DeviceType p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(DeviceType p, ScalarType s) { return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API Type& getType(TensorOptions options); -AT_API Type& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(TensorOptions options); +AT_API TypeExtendedInterface& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(const Tensor&); AT_API Allocator* getCPUAllocator(); -static inline Type& CPU(ScalarType s) { +static inline TypeExtendedInterface& CPU(ScalarType s) { return getNonVariableType(Backend::CPU, s); } -static inline Type& CUDA(ScalarType s) { +static inline TypeExtendedInterface& CUDA(ScalarType s) { return getNonVariableType(Backend::CUDA, s); } diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 266f9645a18527..e88dd8867a0e96 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -1151,7 +1151,7 @@ def find_formal(formal_name, formals): option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) else: # doesn't depend on a specific type, use undefined float - option['inferred_type'] = 'detail::non_specific_type()' + option['inferred_type'] = 'at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)' declaration = DEPRECATED_FUNCTION_DECLARATION if option['deprecated'] else FUNCTION_DECLARATION top_env['function_declarations'].append(declaration.substitute(env)) if is_factory_method: diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 2803112b7692bb..0aaf2149b42a05 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -136,34 +136,34 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta Tensor tensor(const Type& dtype) { if (_type_has_native(dtype)) { - return static_cast(dtype).native_tensor(); + return at::getType(dtype.options()).native_tensor(); } else { - return static_cast(dtype).th_tensor(); + return at::getType(dtype.options()).th_tensor(); } } Tensor tensor(const Type& dtype, ArrayRef size) { if (_type_has_native(dtype)) { - return static_cast(dtype).native_tensor(size); + return at::getType(dtype.options()).native_tensor(size); } else { - return static_cast(dtype).th_tensor(size); + return at::getType(dtype.options()).th_tensor(size); } } Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return static_cast(dtype.toSparse()).native_sparse_coo_tensor(size); + return at::getType(dtype.options().layout(at::kSparse)).native_sparse_coo_tensor(size); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size) { - return static_cast(values.type().toSparse()).native_sparse_coo_tensor(indices, values, size); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); } Tensor 
_sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return static_cast(values.type().toSparse())._native_sparse_coo_tensor_unsafe(indices, values, size); + return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); } int64_t get_device(const Tensor& self) { diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 20211aeedfa32b..178045d9fd0de4 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -60,7 +60,7 @@ void window_function_checks( } const TypeExtendedInterface& getFactoryType(const TensorOptions& options) { - return static_cast(at::getType(options)); + return at::getType(options); } } // namespace @@ -98,7 +98,7 @@ Tensor& arange_out(Tensor& result, Scalar end) { } Tensor _dim_arange(const Tensor& like, int64_t dim) { - return static_cast(like.type().toScalarType(at::kLong))._arange(like.size(dim)); + return at::getType(like.options().dtype(at::kLong))._arange(like.size(dim)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 7e2b6580343be4..8bbc17af5da291 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -25,14 +25,11 @@ namespace detail { static inline TypeExtendedInterface & infer_type(const Tensor & t) { AT_CHECK(t.defined(), "undefined Tensor"); - return static_cast(t.type()); + return getType(t); } static inline TypeExtendedInterface & infer_type(const TensorList & tl) { AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return static_cast(tl[0].type()); -} -static inline TypeExtendedInterface & non_specific_type() { - return static_cast(at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)); + return getType(tl[0]); } } // namespace detail diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 8b12cc9dad16d8..583a2fa492647b 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -223,7 +223,7 @@ class ATenOp : public Operator { CAFFE_THROW("unsupported type annotation: ", name); } at::TypeExtendedInterface & stringToType(const std::string & name) { - return static_cast(at::getNonVariableType(backend(), stringToScalarType(name))); + return at::getNonVariableType(backend(), stringToScalarType(name)); } at::TypeExtendedInterface * readTypeAttribute(const std::string & name) { CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index bc75ac9859dcc5..70843bb0d91108 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -278,7 +278,7 @@ def find_factory_methods(decls): # first tensor input is used to define the output type. 
defined_inferred_type = True env['statements'].append( - 'auto inferred_type = &(static_cast({}.type()));'.format( + 'auto inferred_type = &at::getType({});'.format( arg['name'])) else: init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index c6a84afe710a89..6bd409ad0a4040 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -42,9 +42,9 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { -VariableType::VariableType(Context* context, Type* baseType) +VariableType::VariableType(Context* context, TypeExtendedInterface* baseType) : TypeDefault(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) - , baseType(static_cast(baseType)) + , baseType(baseType) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; } @@ -105,7 +105,7 @@ TypeID VariableType::ID() const { std::vector> type_to_variable_type; // XXX - this is not threadsafe with uses of Variables -void register_variable_type_for(Type* baseType) { +void register_variable_type_for(TypeExtendedInterface* baseType) { AT_ASSERT(baseType); size_t base_id = static_cast(baseType->ID()); if(type_to_variable_type.size() <= base_id) { @@ -166,7 +166,7 @@ REGISTER_VARIABLE_HOOKS(VariableHooks) // Pre-condition: backend/scalar_type is a valid type in the type_registry void VariableHooks::registerVariableTypeFor(at::LegacyTypeDispatch* context, at::Backend backend, at::ScalarType scalar_type) const { auto* baseType = context->getNonVariableTypeRaw(backend, scalar_type); - register_variable_type_for(baseType); + register_variable_type_for(static_cast(baseType)); } at::Type& VariableHooks::getVariableTypeFromBaseType(const at::Type& baseType) const { diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 19de1b910a0b69..446fb5b889f47a 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -31,10 +31,10 @@ using at::ScalarType; using at::optional; using at::Device; -void register_variable_type_for(at::Type* baseType); +void register_variable_type_for(at::TypeExtendedInterface* baseType); struct TORCH_API VariableType final : public at::TypeDefault { - VariableType(Context* context, at::Type* baseType); + VariableType(Context* context, at::TypeExtendedInterface* baseType); at::ScalarType scalarType() const override; virtual caffe2::TypeMeta typeMeta() const override; at::Backend backend() const override; From 74197c7115af427d041fd6783c9a95207b6603c5 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 07:36:16 -0700 Subject: [PATCH 149/237] Restore support for dim=None on WeightNorm. (#11661) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11661 Reviewed By: veenix Differential Revision: D9826799 Pulled By: ezyang fbshipit-source-id: 9eec57bb27a365406669e412f6eb88741b22ed3d --- test/test_nn.py | 6 ++++++ torch/nn/utils/weight_norm.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index 03a35bf10d88f8..809b820c42202e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1683,6 +1683,12 @@ def test_weight_norm(self): self.assertEqual(m.weight_g.size(), (1, 5)) self.assertEqual(m(input), expected_output) + # test with dim=None + m = nn.Linear(5, 7) + expected_output = m(input) + m = torch.nn.utils.weight_norm(m, dim=None) + self.assertEqual(m(input), expected_output) + def test_weight_norm_pickle(self): m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) m = pickle.loads(pickle.dumps(m)) diff --git a/torch/nn/utils/weight_norm.py b/torch/nn/utils/weight_norm.py index ddefa3d6f5f544..a80ef02b411ea0 100644 --- a/torch/nn/utils/weight_norm.py +++ b/torch/nn/utils/weight_norm.py @@ -7,6 +7,8 @@ class WeightNorm(object): def __init__(self, name, dim): + if dim is None: + dim = -1 self.name = name self.dim = dim @@ -17,6 +19,9 @@ def compute_weight(self, module): @staticmethod def apply(module, name, dim): + if dim is None: + dim = -1 + fn = WeightNorm(name, dim) weight = getattr(module, name) From c391c2006304233eb5745f99bba850785c0297d6 Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Fri, 14 Sep 2018 07:49:47 -0700 Subject: [PATCH 150/237] Adding .expand method for TransformedDistribution (#11607) Summary: This PR: - adds a `.expand` method for `TransformedDistribution` along the lines of #11341. - uses this method to simplify `.expand` in distribution classes that subclass off of `TransformedDistribution`. - restores testing of `TransformedDistribution` fixtures. - fixes some bugs wherein we were not setting certain attributes in the expanded instances, and adds tests for `.mean` and `.variance` which use these attributes. There are many cases where users directly use `TransformedDistribution` rather than subclassing off it. In such cases, it seems rather inconvenient to have to write a separate class just to define a `.expand` method. The default implementation should suffice in these cases. cc. 
fritzo, vishwakftw, alicanb Pull Request resolved: https://github.com/pytorch/pytorch/pull/11607 Differential Revision: D9818225 Pulled By: soumith fbshipit-source-id: 2c4b3812b9a03e6985278cfce0f9a127ce536f23 --- test/common.py | 7 +++++++ test/test_distributions.py | 18 +++++++++++------- torch/distributions/chi2.py | 1 - torch/distributions/gumbel.py | 9 +++------ torch/distributions/half_cauchy.py | 6 +----- torch/distributions/half_normal.py | 6 +----- torch/distributions/log_normal.py | 6 +----- torch/distributions/logistic_normal.py | 9 +-------- torch/distributions/pareto.py | 7 +++---- torch/distributions/relaxed_bernoulli.py | 5 +---- torch/distributions/relaxed_categorical.py | 7 +------ .../distributions/transformed_distribution.py | 10 ++++++++++ torch/distributions/weibull.py | 6 ++++-- 13 files changed, 44 insertions(+), 53 deletions(-) diff --git a/test/common.py b/test/common.py index 04986fcf5af57b..c1cd7999d82901 100644 --- a/test/common.py +++ b/test/common.py @@ -350,6 +350,13 @@ def assertTensorsEqual(a, b): self.assertTrue(torch.equal(nan_mask, b != b), message) diff = a - b diff[nan_mask] = 0 + # inf check if allow_inf=True + if allow_inf: + inf_mask = (a == float("inf")) | (a == float("-inf")) + self.assertTrue(torch.equal(inf_mask, + (b == float("inf")) | (b == float("-inf"))), + message) + diff[inf_mask] = 0 # TODO: implement abs on CharTensor if diff.is_signed() and 'CharTensor' not in diff.type(): diff = diff.abs() diff --git a/test/test_distributions.py b/test/test_distributions.py index 61d5e57d910b54..8264337a681782 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -796,27 +796,33 @@ def test_has_examples(self): def test_distribution_expand(self): shapes = [torch.Size(), torch.Size((2,)), torch.Size((2, 1))] for Dist, params in EXAMPLES: - if Dist.__name__ == "TransformedDistribution": - continue for param in params: for shape in shapes: d = Dist(**param) expanded_shape = shape + d.batch_shape original_shape = d.batch_shape + d.event_shape expected_shape = shape + original_shape - expanded = d.expand(batch_shape=expanded_shape) + expanded = d.expand(batch_shape=list(expanded_shape)) sample = expanded.sample() actual_shape = expanded.sample().shape self.assertEqual(expanded.__class__, d.__class__) self.assertEqual(d.sample().shape, original_shape) self.assertEqual(expanded.log_prob(sample), d.log_prob(sample)) self.assertEqual(actual_shape, expected_shape) + self.assertEqual(expanded.batch_shape, expanded_shape) + try: + self.assertEqual(expanded.mean, + d.mean.expand(expanded_shape + d.event_shape), + allow_inf=True) + self.assertEqual(expanded.variance, + d.variance.expand(expanded_shape + d.event_shape), + allow_inf=True) + except NotImplementedError: + pass def test_distribution_subclass_expand(self): expand_by = torch.Size((2,)) for Dist, params in EXAMPLES: - if Dist.__name__ == "TransformedDistribution": - continue class SubClass(Dist): pass @@ -2221,8 +2227,6 @@ def test_independent_shape(self): def test_independent_expand(self): for Dist, params in EXAMPLES: - if Dist.__name__ == "TransformedDistribution": - continue for param in params: base_dist = Dist(**param) for reinterpreted_batch_ndims in range(len(base_dist.batch_shape) + 1): diff --git a/torch/distributions/chi2.py b/torch/distributions/chi2.py index fe8c5956fb5b68..7fdc5e8d0fd114 100644 --- a/torch/distributions/chi2.py +++ b/torch/distributions/chi2.py @@ -24,7 +24,6 @@ def __init__(self, df, validate_args=None): def expand(self, batch_shape, _instance=None): new = 
self._get_checked_instance(Chi2, _instance) - batch_shape = torch.Size(batch_shape) return super(Chi2, self).expand(batch_shape, new) @property diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index f6d10aa4b0efb9..b489c8754aa441 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -40,12 +40,9 @@ def __init__(self, loc, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(Gumbel, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - transforms = self.transforms - super(Gumbel, new).__init__(base_dist, transforms, validate_args=False) - new._validate_args = self._validate_args - return new + new.loc = self.loc.expand(batch_shape) + new.scale = self.scale.expand(batch_shape) + return super(Gumbel, self).expand(batch_shape, _instance=new) @property def mean(self): diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py index 8b81ea8b8c862d..916cb47019a28f 100644 --- a/torch/distributions/half_cauchy.py +++ b/torch/distributions/half_cauchy.py @@ -35,11 +35,7 @@ def __init__(self, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(HalfCauchy, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - super(HalfCauchy, new).__init__(base_dist, AbsTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(HalfCauchy, self).expand(batch_shape, _instance=new) @property def scale(self): diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py index d18cb3b971cd2c..00d0015231c2e6 100644 --- a/torch/distributions/half_normal.py +++ b/torch/distributions/half_normal.py @@ -35,11 +35,7 @@ def __init__(self, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(HalfNormal, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - super(HalfNormal, new).__init__(base_dist, AbsTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(HalfNormal, self).expand(batch_shape, _instance=new) @property def scale(self): diff --git a/torch/distributions/log_normal.py b/torch/distributions/log_normal.py index 274bcd83c47a83..38fe532bde31f7 100644 --- a/torch/distributions/log_normal.py +++ b/torch/distributions/log_normal.py @@ -33,11 +33,7 @@ def __init__(self, loc, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(LogNormal, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape) - super(LogNormal, new).__init__(base_dist, ExpTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(LogNormal, self).expand(batch_shape, _instance=new) @property def loc(self): diff --git a/torch/distributions/logistic_normal.py b/torch/distributions/logistic_normal.py index 909a30d3db8a7c..c1be0d2b086b46 100644 --- a/torch/distributions/logistic_normal.py +++ b/torch/distributions/logistic_normal.py @@ -41,14 +41,7 @@ def __init__(self, loc, scale, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(LogisticNormal, _instance) - batch_shape = torch.Size(batch_shape) - base_dist = self.base_dist.expand(batch_shape + 
self.base_dist.batch_shape[-1:]) - super(LogisticNormal, new).__init__(base_dist, - StickBreakingTransform(), - validate_args=False) - new._event_shape = self._event_shape - new._validate_args = self._validate_args - return new + return super(LogisticNormal, self).expand(batch_shape, _instance=new) @property def loc(self): diff --git a/torch/distributions/pareto.py b/torch/distributions/pareto.py index 6fa499642065de..c860f07b13835d 100644 --- a/torch/distributions/pareto.py +++ b/torch/distributions/pareto.py @@ -30,10 +30,9 @@ def __init__(self, scale, alpha, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(Pareto, _instance) - base_dist = self.base_dist.expand(batch_shape) - super(Pareto, new).__init__(base_dist, self.transforms, validate_args=False) - new._validate_args = self._validate_args - return new + new.scale = self.scale.expand(batch_shape) + new.alpha = self.alpha.expand(batch_shape) + return super(Pareto, self).expand(batch_shape, _instance=new) @property def mean(self): diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py index f885bfea2bc422..09897e1d34190f 100644 --- a/torch/distributions/relaxed_bernoulli.py +++ b/torch/distributions/relaxed_bernoulli.py @@ -120,10 +120,7 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(RelaxedBernoulli, _instance) - base_dist = self.base_dist.expand(batch_shape) - super(RelaxedBernoulli, new).__init__(base_dist, SigmoidTransform(), validate_args=False) - new._validate_args = self._validate_args - return new + return super(RelaxedBernoulli, self).expand(batch_shape, _instance=new) @property def temperature(self): diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py index 59f5b5998a12d3..86406cd207ab52 100644 --- a/torch/distributions/relaxed_categorical.py +++ b/torch/distributions/relaxed_categorical.py @@ -116,12 +116,7 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(RelaxedOneHotCategorical, _instance) - base_dist = self.base_dist.expand(batch_shape) - super(RelaxedOneHotCategorical, new).__init__(base_dist, - ExpTransform(), - validate_args=False) - new._validate_args = self._validate_args - return new + return super(RelaxedOneHotCategorical, self).expand(batch_shape, _instance=new) @property def temperature(self): diff --git a/torch/distributions/transformed_distribution.py b/torch/distributions/transformed_distribution.py index 79e2c691638076..a7c49b4f7b1810 100644 --- a/torch/distributions/transformed_distribution.py +++ b/torch/distributions/transformed_distribution.py @@ -56,6 +56,16 @@ def __init__(self, base_distribution, transforms, validate_args=None): event_shape = shape[len(shape) - event_dim:] super(TransformedDistribution, self).__init__(batch_shape, event_shape, validate_args=validate_args) + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(TransformedDistribution, _instance) + batch_shape = torch.Size(batch_shape) + base_dist_batch_shape = batch_shape + self.base_dist.batch_shape[len(self.batch_shape):] + new.base_dist = self.base_dist.expand(base_dist_batch_shape) + new.transforms = self.transforms + super(TransformedDistribution, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = 
self._validate_args + return new + @constraints.dependent_property def support(self): return self.transforms[-1].codomain if self.transforms else self.base_dist.support diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py index 0636a74d113f82..8b5afee400b780 100644 --- a/torch/distributions/weibull.py +++ b/torch/distributions/weibull.py @@ -36,12 +36,14 @@ def __init__(self, scale, concentration, validate_args=None): def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(Weibull, _instance) - batch_shape = torch.Size(batch_shape) new.scale = self.scale.expand(batch_shape) new.concentration = self.concentration.expand(batch_shape) + new.concentration_reciprocal = new.concentration.reciprocal() base_dist = self.base_dist.expand(batch_shape) + transforms = [PowerTransform(exponent=new.concentration_reciprocal), + AffineTransform(loc=0, scale=new.scale)] super(Weibull, new).__init__(base_dist, - self.transforms, + transforms, validate_args=False) new._validate_args = self._validate_args return new From cda71e260026b84990650809b5b5eae3a201742f Mon Sep 17 00:00:00 2001 From: Neeraj Pradhan Date: Fri, 14 Sep 2018 07:52:21 -0700 Subject: [PATCH 151/237] Disallow scalar parameters in Dirichlet and Categorical (#11589) Summary: This adds a small check in `Dirichlet` and `Categorical` `__init__` methods to ensure that scalar parameters are not admissible. **Motivation** Currently, `Dirichlet` throws no error when provided with a scalar parameter, but if we `expand` a scalar instance, it inherits the empty event shape from the original instance and gives unexpected results. The alternative to this check is to promote `event_shape` to be `torch.Size((1,))` if the original instance was a scalar, but that seems to add a bit more complexity (and changes the behavior of `expand` in that it would affect the `event_shape` as well as the `batch_shape` now). Does this seem reasonable? cc. alicanb, fritzo. ```python In [4]: d = dist.Dirichlet(torch.tensor(1.)) In [5]: d.sample() Out[5]: tensor(1.0000) In [6]: d.log_prob(d.sample()) Out[6]: tensor(0.) In [7]: e = d.expand([3]) In [8]: e.sample() Out[8]: tensor([0.3953, 0.1797, 0.4250]) # interpreted as events In [9]: e.log_prob(e.sample()) Out[9]: tensor(0.6931) # wrongly summed out In [10]: e.batch_shape Out[10]: torch.Size([3]) In [11]: e.event_shape Out[11]: torch.Size([]) # cannot be empty ``` Additionally, based on review comments, this removes `real_vector` constraint. This was only being used in `MultivariateNormal`, but I am happy to revert this if we want to keep it around for backwards compatibility. 
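For concreteness, a minimal sketch of the behavior with the new checks in place (the error string below is the one introduced in the diff; shapes follow from the existing `batch_shape`/`event_shape` split, and `dist` is `torch.distributions` as above):

```python
import torch
import torch.distributions as dist

# Scalar parameters are now rejected up front instead of producing
# the ambiguous shapes shown above.
try:
    dist.Dirichlet(torch.tensor(1.))
except ValueError as e:
    print(e)  # `concentration` parameter must be at least one-dimensional.

# The supported spelling keeps the event dimension explicit.
d = dist.Dirichlet(torch.ones(3))
print(d.batch_shape, d.event_shape)  # torch.Size([]) torch.Size([3])
print(d.expand([2]).batch_shape)     # torch.Size([2]) -- expand is now unambiguous
```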
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11589 Differential Revision: D9818271 Pulled By: soumith fbshipit-source-id: f9bbba90ed6f04e0b5bdfa169e70ca20b280fc74 --- torch/distributions/categorical.py | 4 ++++ torch/distributions/dirichlet.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 2ad69cb1bacec1..fcfea47c2eadd5 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -45,8 +45,12 @@ def __init__(self, probs=None, logits=None, validate_args=None): if (probs is None) == (logits is None): raise ValueError("Either `probs` or `logits` must be specified, but not both.") if probs is not None: + if probs.dim() < 1: + raise ValueError("`probs` parameter must be at least one-dimensional.") self.probs = probs / probs.sum(-1, keepdim=True) else: + if logits.dim() < 1: + raise ValueError("`logits` parameter must be at least one-dimensional.") self.logits = logits - logits.logsumexp(dim=-1, keepdim=True) self._param = self.probs if probs is not None else self.logits self._num_events = self._param.size()[-1] diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py index 94d66d60c0848c..f618628c9f68e7 100644 --- a/torch/distributions/dirichlet.py +++ b/torch/distributions/dirichlet.py @@ -54,7 +54,9 @@ class Dirichlet(ExponentialFamily): has_rsample = True def __init__(self, concentration, validate_args=None): - self.concentration, = broadcast_all(concentration) + if concentration.dim() < 1: + raise ValueError("`concentration` parameter must be at least one-dimensional.") + self.concentration = concentration batch_shape, event_shape = concentration.shape[:-1], concentration.shape[-1:] super(Dirichlet, self).__init__(batch_shape, event_shape, validate_args=validate_args) From 6c3792b9ecdcd94c799c2a18dbfd056093e109ac Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 14 Sep 2018 08:28:49 -0700 Subject: [PATCH 152/237] Implement UndefinedType::typeMeta. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11666 Differential Revision: D9816212 Pulled By: gchanan fbshipit-source-id: 079899590150009bc2e2a3bbdc78a98de9380e37 --- aten/src/ATen/UndefinedType.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 9c3457bdd4803f..8e9722eae3be09 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -9,7 +9,7 @@ ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } caffe2::TypeMeta UndefinedType::typeMeta() const { - AT_ERROR("typeMeta not defined for UndefinedType"); + return scalarTypeToTypeMeta(scalarType()); } Backend UndefinedType::backend() const { return Backend::Undefined; From 2631da0822149c19214dba94510e29abb5b25f75 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 14 Sep 2018 08:55:24 -0700 Subject: [PATCH 153/237] Move some Tensor method definitions from Type.h to TensorMethods.h. (#11650) Summary: There's no reason they need to be in Type.h and this moves us along the path of not having circular dependencies (so we can get rid of TensorMethods.h). 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11650 Reviewed By: ezyang Differential Revision: D9812271 Pulled By: gchanan fbshipit-source-id: 8b70db9a5eb0a332398ab2e8998eeaf7d2eea6d7 --- aten/src/ATen/core/TensorMethods.h | 16 ++++++++++++++++ aten/src/ATen/core/Type.h | 16 ---------------- aten/src/ATen/templates/TensorMethods.h | 16 ++++++++++++++++ aten/src/ATen/templates/Type.h | 16 ---------------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index aba169279e6d2b..ff85267e78fb81 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1215,6 +1215,22 @@ inline Scalar Tensor::_local_scalar() const { return type()._local_scalar(*this); } +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); +} + #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index e8b2e2d40980c2..dabd19c7837223 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -629,20 +629,4 @@ struct AT_API Type { }; -inline bool Tensor::is_variable() const noexcept { - return type().is_variable(); -} - -inline ScalarType Tensor::dtype() const noexcept { - return type().scalarType(); -} - -inline Layout Tensor::layout() const noexcept { - return type().layout(); -} - -inline Device Tensor::device() const { - return Device(type().device_type(), type().is_cuda() ? get_device() : -1); -} - } // namespace at diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 54fd80c5f13463..8283bea01f6bed 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -55,6 +55,22 @@ inline void Tensor::set_data(Tensor new_data) { // all static inline to allow for inlining of the non-dynamic part of dispatch ${tensor_method_definitions} +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); +} + #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index ef49b2880dcec7..8dda48749b0c97 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -143,20 +143,4 @@ struct AT_API Type { }; -inline bool Tensor::is_variable() const noexcept { - return type().is_variable(); -} - -inline ScalarType Tensor::dtype() const noexcept { - return type().scalarType(); -} - -inline Layout Tensor::layout() const noexcept { - return type().layout(); -} - -inline Device Tensor::device() const { - return Device(type().device_type(), type().is_cuda() ? 
get_device() : -1); -} - } // namespace at From 72822ee6b251118aefbe9af75ac0fcb4ee56a0ee Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 08:55:39 -0700 Subject: [PATCH 154/237] =?UTF-8?q?Fix=20#11430=20(CPU=20only=20builds=20r?= =?UTF-8?q?aise=20opaque=20error=20message=20when=20calling=20.=E2=80=A6?= =?UTF-8?q?=20(#11533)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: …cuda()) While I was at it, I audited all other ways I know how we might get a CUDA type from PyTorch and fixed more constructors which don't work. Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11533 Differential Revision: D9775786 Pulled By: ezyang fbshipit-source-id: cd07cdd375fdf74945539ec475a48bf08cbc0c17 --- test/test_torch.py | 9 +++++++++ tools/autograd/gen_python_functions.py | 4 ++-- .../templates/python_torch_functions.cpp | 14 +++++++------- .../templates/python_torch_functions_dispatch.h | 4 ++-- torch/csrc/DynamicTypes.cpp | 4 ++++ torch/csrc/tensor/python_tensor.cpp | 6 +++--- torch/csrc/utils/cuda_lazy_init.cpp | 12 +++++++++--- torch/csrc/utils/cuda_lazy_init.h | 16 ++++++++++++++-- 8 files changed, 50 insertions(+), 19 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index 837d7715f3d9e1..260bfe3d42e8e7 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8852,6 +8852,15 @@ def test_meshgrid(self): self.assertTrue(grid_b.equal(expected_grid_b)) self.assertTrue(grid_c.equal(expected_grid_c)) + @unittest.skipIf(torch.cuda.is_available(), "CUDA is available, can't test CUDA not built error") + def test_cuda_not_built(self): + msg = "Torch not compiled with CUDA enabled" + self.assertRaisesRegex(AssertionError, msg, lambda: torch.cuda.current_device()) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1], device="cuda")) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1]).cuda()) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.cuda.FloatTensor()) + self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1]).to(device="cuda")) + # Functions to test negative dimension wrapping METHOD = 1 diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 8712c7c6ca2d43..249ba042bb7056 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -427,9 +427,9 @@ def append_actuals_formals(actual, formal): env['actuals'] = actuals if has_tensor_options: - env['initialize_cuda'] = 'maybe_initialize_cuda(at::getType(options));' + env['initialize_cuda'] = 'maybe_initialize_cuda(options);' else: - env['initialize_cuda'] = 'maybe_initialize_cuda({});'.format(type_args[0]['name']) if type_args else '' + env['initialize_cuda'] = '' if 'call_args' in declaration: env['dispatch_args'] = declaration['call_args'] diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 51ded6eb792f34..b2b6739710c8da 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -65,7 +65,7 @@ inline Tensor dispatch_arange(Scalar end, Tensor result) { } inline Tensor dispatch_arange(Scalar end, const TensorOptions& options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::arange(end, options); } @@ -76,7 +76,7 @@ inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, 
Tensor resu } inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, const TensorOptions& options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::arange(start, end, step, options); } @@ -147,7 +147,7 @@ inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, Tensor resul } inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, const TensorOptions& options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; DeviceGuard device_guard(options.device()); return torch::range(start, end, step, options); @@ -189,7 +189,7 @@ inline Tensor dispatch_randint(int64_t high, IntList size, Generator * generator return at::randint_out(result, high, size, generator); } inline Tensor dispatch_randint(int64_t high, IntList size, Generator * generator, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(high, size, generator, options); } @@ -198,7 +198,7 @@ inline Tensor dispatch_randint(int64_t high, IntList size, Tensor result) { return at::randint_out(result, high, size); } inline Tensor dispatch_randint(int64_t high, IntList size, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(high, size, options); } @@ -207,7 +207,7 @@ inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Generato return at::randint_out(result, low, high, size, generator); } inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Generator * generator, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(low, high, size, generator, options); } @@ -216,7 +216,7 @@ inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Tensor r return at::randint_out(result, low, high, size); } inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, const TensorOptions & options) { - maybe_initialize_cuda(at::getType(options)); + maybe_initialize_cuda(options); AutoNoGIL no_gil; return torch::randint(low, high, size, options); } diff --git a/tools/autograd/templates/python_torch_functions_dispatch.h b/tools/autograd/templates/python_torch_functions_dispatch.h index 78dbe4c225f639..fdc33538bcada2 100644 --- a/tools/autograd/templates/python_torch_functions_dispatch.h +++ b/tools/autograd/templates/python_torch_functions_dispatch.h @@ -28,8 +28,8 @@ static at::Type& default_type() { return torch::tensors::get_default_tensor_type(); } -static void maybe_initialize_cuda(const at::Type &type) { - if (type.is_cuda()) { +static void maybe_initialize_cuda(const at::TensorOptions& options) { + if (options.device().is_cuda()) { torch::utils::cuda_lazy_init(); } } diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index bed40cb67a2d0e..e5a95a94e1ae85 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -7,6 +7,7 @@ #include "torch/csrc/Exceptions.h" #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/utils/cuda_enabled.h" +#include "torch/csrc/utils/cuda_lazy_init.h" #include @@ -99,6 +100,9 @@ void registerLayoutObject(THPLayout *layout, at::Backend backend) { at::Type& getVariableType(at::ScalarType scalarType, const THPLayout& layout, const at::Device& device) { const 
at::Backend backend = get_backend(device.type() == at::Device::Type::CUDA, layout.layout == at::Layout::Sparse); + if (device.is_cuda()) { + torch::utils::cuda_lazy_init(); + } auto baseType = at::globalContext().getNonVariableTypeOpt(backend, scalarType); if (!baseType) { std::ostringstream oss; diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index beda85e938be1b..4a40cf243f3a68 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -43,6 +43,9 @@ struct PyTensorType { // Precondition: Access to this struct is protected by the GIL at::Type* aten_type() { if (!aten_type_) { + if (is_cuda) { + torch::utils::cuda_lazy_init(); + } auto* baseType = globalContext().getNonVariableTypeOpt(static_cast(backend), static_cast(scalar_type)); aten_type_ = baseType ? torch::autograd::VariableType::getVariableTypeFromBaseType(*baseType) : nullptr; } @@ -69,9 +72,6 @@ static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs if (!aten_type) { throw unavailable_type(tensor_type); } - if (aten_type->is_cuda()) { - torch::utils::cuda_lazy_init(); - } return THPVariable_Wrap(torch::utils::legacy_tensor_ctor(*aten_type, args, kwargs)); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/utils/cuda_lazy_init.cpp b/torch/csrc/utils/cuda_lazy_init.cpp index ac35e04e1decbf..6fd73526bba520 100644 --- a/torch/csrc/utils/cuda_lazy_init.cpp +++ b/torch/csrc/utils/cuda_lazy_init.cpp @@ -10,13 +10,19 @@ namespace torch { namespace utils { void cuda_lazy_init() { - static std::once_flag once; - std::call_once(once, []() { + AutoGIL g; + // Protected by the GIL. We don't use call_once because under ASAN it + // has a buggy implementation that deadlocks if an instance throws an + // exception. In any case, call_once isn't necessary, because we + // have taken a lock. + static bool run_yet = false; + if (!run_yet) { auto module = THPObjectPtr(PyImport_ImportModule("torch.cuda")); if (!module) throw python_error(); auto res = THPObjectPtr(PyObject_CallMethod(module.get(), "_lazy_init", "")); if (!res) throw python_error(); - }); + run_yet = true; + } } } diff --git a/torch/csrc/utils/cuda_lazy_init.h b/torch/csrc/utils/cuda_lazy_init.h index 8b1d4be125beaa..f8522c1463443a 100644 --- a/torch/csrc/utils/cuda_lazy_init.h +++ b/torch/csrc/utils/cuda_lazy_init.h @@ -1,11 +1,23 @@ #pragma once -// It initially lies in torch/csrc/cuda, but to unconditionlly compile it -// we have to put it here. +// cuda_lazy_init() is always compiled, even for CPU-only builds. +// Thus, it does not live in the cuda/ folder. namespace torch { namespace utils { +// The INVARIANT is that this function MUST be called before you attempt +// to get a CUDA Type object from ATen, in any way. Here are some common +// ways that a Type object may be retrieved: +// +// - You call getNonVariableType or getNonVariableTypeOpt +// - You call toBackend() on a Type +// +// It's important to do this correctly, because if you forget to add it +// you'll get an oblique error message about "Cannot initialize CUDA without +// ATen_cuda library" if you try to use CUDA functionality from a CPU-only +// build, which is not good UX. +// void cuda_lazy_init(); } From 0d9b9100f977f677a0734af9096e9121ed85b337 Mon Sep 17 00:00:00 2001 From: Vishwak Srinivasan Date: Fri, 14 Sep 2018 09:12:45 -0700 Subject: [PATCH 155/237] Fix gesv and gels docs (#11699) Summary: Closes #9935 and closes #5431 . 
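For reference, a small sketch of the shapes the corrected `gels` documentation describes (values are illustrative only, assuming the `torch.gels(B, A)` calling convention of this era):

```python
import torch

A = torch.randn(5, 3)   # (m x n) full-rank matrix, m >= n
B = torch.randn(5, 2)   # (m x k): B shares A's row count m, not n
X, _ = torch.gels(B, A)
print(X.shape)          # torch.Size([5, 2]); the first n = 3 rows hold the solution
```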
Differential Revision: D9830448 Pulled By: soumith fbshipit-source-id: 4e5320a1d0c1d4c8253a5b26f4842cea76530514 --- torch/_torch_docs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 19a1fcbc625003..64d93a5916ab16 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1694,7 +1694,7 @@ def parse_kwargs(desc): Computes the solution to the least squares and least norm problems for a full rank matrix :math:`A` of size :math:`(m \times n)` and a matrix :math:`B` of -size :math:`(n \times k)`. +size :math:`(m \times k)`. If :math:`m \geq n`, :func:`gels` solves the least-squares problem: @@ -1713,7 +1713,7 @@ def parse_kwargs(desc): \end{array} Returned tensor :math:`X` has shape :math:`(\max(m, n) \times k)`. The first :math:`n` -rows of :math:`X` contains the solution. If :math`m \geq n`, the residual sum of squares +rows of :math:`X` contains the solution. If :math:`m \geq n`, the residual sum of squares for the solution in each column is given by the sum of squares of elements in the remaining :math:`m - n` rows of that column. @@ -1834,9 +1834,9 @@ def parse_kwargs(desc): Args: B (Tensor): input matrix of size :math:`(*, m, k)` , where :math:`*` - is zero or more batch dimensions. + is zero or more batch dimensions. A (Tensor): input square matrix of size :math:`(*, m, m)`, where - :math:`*` is zero or more batch dimensions. + :math:`*` is zero or more batch dimensions. out ((Tensor, Tensor), optional): optional output tuple. Example:: From eb039dc92c2801f40415d4e366251d506accf726 Mon Sep 17 00:00:00 2001 From: Alexander Sidorov Date: Fri, 14 Sep 2018 09:33:55 -0700 Subject: [PATCH 156/237] Add CHECKs into GetTensorInfo and ExtractDeviceOption (#11597) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11597 We should always CHECK pointers which we plan to dereference if they are inputs to the function. Nobody knows how the function will be called in the future. Reviewed By: yinghai Differential Revision: D9800002 fbshipit-source-id: 7fd05f4717f2256d1b09a9e75475b12de6685b03 --- caffe2/core/context.h | 1 + caffe2/core/tensor.cc | 1 + caffe2/core/tensor_impl.h | 7 +++++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/caffe2/core/context.h b/caffe2/core/context.h index a4e87e8110dc83..aff66534d22198 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -207,6 +207,7 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) override { + CHECK(device); device->set_device_type(TypeToProto(GetDeviceType())); } diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index e88ed360e8b92b..58b4c4b75e91cb 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -81,6 +81,7 @@ vector GetTensorInfo( const void* c, size_t* capacity, DeviceOption* device) { + CHECK(capacity); const Tensor* tc = static_cast(c); CHECK(tc); CHECK(tc->unsafeGetTensorImpl()); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 4c146e0d257530..2e01c55318d88f 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -103,7 +103,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * context pointer in tensor, which indicates the type of the tensor. 
*/ at::BaseStaticContext* GetStaticContext() const { - return get_static_context(GetDeviceType()); + auto device_type = GetDeviceType(); + return get_static_context(device_type); } /* @brief @@ -732,7 +733,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } void ExtractDeviceOption(DeviceOption* device) const { - GetStaticContext()->ExtractDeviceOption(device, raw_data()); + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, raw_data()); } const at::Storage& storage() { From 115b13ffabec5d7ac3441aa1e2c7a792619bbd39 Mon Sep 17 00:00:00 2001 From: Roy Li Date: Fri, 14 Sep 2018 09:44:40 -0700 Subject: [PATCH 157/237] clean up some old Half stuff Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11687 Differential Revision: D9829027 Pulled By: li-roy fbshipit-source-id: f35dcdf93ea57ba4fa775e36e9d6378bed46a710 --- aten/src/ATen/core/Half.h | 7 ---- aten/src/ATen/cuda/ATenCUDAGeneral.h | 4 +++ aten/src/ATen/cuda/CUDAHalf.cu | 51 ---------------------------- aten/src/ATen/cuda/CUDAHalf.cuh | 31 ----------------- aten/src/ATen/gen.py | 2 +- 5 files changed, 5 insertions(+), 90 deletions(-) delete mode 100644 aten/src/ATen/cuda/CUDAHalf.cu delete mode 100644 aten/src/ATen/cuda/CUDAHalf.cuh diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index c306fcd6b92b72..e2cbd1174fc91c 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -186,13 +186,6 @@ To checked_convert(From f, const char* name) { return convert(f); } -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h index 7b41f1fe3f7235..761551f808610b 100644 --- a/aten/src/ATen/cuda/ATenCUDAGeneral.h +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #ifdef _WIN32 # if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_CUDA_BUILD_MAIN_LIB) # define AT_CUDA_API __declspec(dllexport) diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu deleted file mode 100644 index bd121250ee4847..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cu +++ /dev/null @@ -1,51 +0,0 @@ -#include "ATen/core/Half.h" -#include "ATen/cuda/CUDAHalf.cuh" - -#include -#include -#include - -namespace at { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - -half Converter::operator()(Half aten_half) { - return half{aten_half.x}; -} - -half Converter::operator()(double value) { - return half{Half(value).x}; -} - -Half Converter::operator()(half cuda_half) { - return Half(cuda_half.x, Half::from_bits); -} -#else -half Converter::operator()(Half aten_half) { - __half_raw x_raw; - x_raw.x = aten_half.x; - return half(x_raw); -} - -Half Converter::operator()(half cuda_half) { - __half_raw raw(cuda_half); - return Half(raw.x, Half::from_bits); -} - -half Converter::operator()(double value) { - __half_raw raw; - raw.x = Half(value).x; - return half {raw}; -} - -template <> __half HalfFix(Half h) { - __half_raw raw; - raw.x = h.x; - return __half{raw}; -} - -template <> Half HalfFix(__half h) { - __half_raw raw(h); - return Half(raw.x, Half::from_bits); -} -#endif -} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh deleted file mode 100644 index 6558ed518ac1fd..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cuh 
+++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "ATen/cuda/ATenCUDAGeneral.h" -#include "ATen/core/Half.h" - -#include -#include -#include - -namespace at { - -template <> -struct AT_CUDA_API Converter { - half operator()(Half); -}; - -template <> -struct AT_CUDA_API Converter { - Half operator()(half); -}; - -template <> -struct AT_CUDA_API Converter { - half operator()(double); -}; - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -template <> __half HalfFix(Half h); -template <> Half HalfFix(__half h); -#endif -} // namespace at diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 76ed7c74a88e07..66444e1837c540 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -259,7 +259,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations '#undef THNN_', '#undef THCIndexTensor_', ] - env['extra_cuda_headers'] = ['#include '] + env['extra_cuda_headers'] = ['#include '] env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') From 278e304c18b0da89cbc58ac43b2d5eabdd3fd6f3 Mon Sep 17 00:00:00 2001 From: James Reed Date: Fri, 14 Sep 2018 10:04:05 -0700 Subject: [PATCH 158/237] Implement elif in string frontend (#11667) Summary: Closes #11625 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11667 Differential Revision: D9828145 Pulled By: jamesr66a fbshipit-source-id: c72dc41cb310a4211b4e4c6b33f7e2c1fb3581a0 --- test/test_jit.py | 18 ++++++++++++++++++ torch/csrc/jit/script/parser.h | 23 +++++++++++++++++------ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 985a4f760466b7..b5e26b304c57db 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6889,6 +6889,24 @@ def forward(self, x): DynamicSliceExportMod(), (input,), f, example_outputs=example_outs) self.assertExpected(exported) + def test_string_frontend_elif(self): + code = ''' + def elif_test(niter : int): + rv = 0 + for i in range(niter): + if i % 3 == 0 and i % 5 == 0: + rv += 35 + elif i % 3 == 0: + rv += 3 + elif i % 5 == 0: + rv += 5 + else: + rv += i + return rv + ''' + + self.checkScript(code, (101,), name='elif_test', outputs=3028) + class MnistNet(nn.Module): def __init__(self): diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 53f4dc368714f4..14e5e4f5ae1354 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -263,8 +263,10 @@ struct Parser { StringLiteral parseStringLiteral() { auto range = L.cur().range; std::stringstream ss; - while(L.cur().kind == TK_STRINGLITERAL) - ss << parseString(L.cur().range, L.next().text()); + while(L.cur().kind == TK_STRINGLITERAL) { + auto literal_range = L.cur().range; + ss << parseString(literal_range, L.next().text()); + } return StringLiteral::create(range, ss.str()); } @@ -339,7 +341,8 @@ struct Parser { auto param_types = parseList('(', ',', ')', &Parser::parseBareTypeAnnotation); TreeRef return_type; if (L.nextIf(TK_ARROW)) { - return_type = Maybe::create(L.cur().range, parseExp()); + auto return_type_range = L.cur().range; + return_type = Maybe::create(return_type_range, parseExp()); } else { return_type = Maybe::create(L.cur().range); } @@ -397,9 +400,10 @@ struct Parser { } return list; } - TreeRef parseIf() { + TreeRef parseIf(bool expect_if=true) { auto r = L.cur().range; - L.expect(TK_IF); + if (expect_if) + L.expect(TK_IF); auto cond = parseExp(); L.expect(':'); auto true_branch = parseStatements(); @@ -407,6 
+411,12 @@ struct Parser { if (L.nextIf(TK_ELSE)) { L.expect(':'); false_branch = parseStatements(); + } else if (L.nextIf(TK_ELIF)) { + // NB: this needs to be a separate statement, since the call to parseIf + // mutates the lexer state, and thus causes a heap-use-after-free in + // compilers which evaluate argument expressions LTR + auto range = L.cur().range; + false_branch = makeList(range, {parseIf(false)}); } return If::create(r, Expr(cond), List(true_branch), List(false_branch)); } @@ -448,7 +458,8 @@ struct Parser { TreeRef return_type; if (L.nextIf(TK_ARROW)) { // Exactly one expression for return type annotation - return_type = Maybe::create(L.cur().range, parseExp()); + auto return_type_range = L.cur().range; + return_type = Maybe::create(return_type_range, parseExp()); } else { // Default to returning single tensor. TODO: better sentinel value? return_type = Maybe::create(L.cur().range); From 3258fc11a73a5b6a90c2ad416c4fbaa0d667bc24 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Fri, 14 Sep 2018 10:44:28 -0700 Subject: [PATCH 159/237] Delete torch/csrc/api/README.md (#11703) Summary: We'll have separate docs for the C++ frontend, right now this file is just misleading Pull Request resolved: https://github.com/pytorch/pytorch/pull/11703 Differential Revision: D9832847 Pulled By: goldsborough fbshipit-source-id: 2e8b30ccf6b5cba9d0526e6261160f7c6211a35c --- torch/csrc/api/README.md | 50 ---------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 torch/csrc/api/README.md diff --git a/torch/csrc/api/README.md b/torch/csrc/api/README.md deleted file mode 100644 index 3f11fdae25226f..00000000000000 --- a/torch/csrc/api/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# AUTOGRADPP - -This is an experimental C++ frontend to pytorch's C++ backend. Use at your own -risk. - -How to build: -``` -git submodule update --init --recursive - -cd pytorch -# On Linux: -python setup.py build -# On macOS (may need to prefix with `MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++` when using anaconda) -LDSHARED="cc -dynamiclib -undefined dynamic_lookup" python setup.py build - -cd ..; mkdir -p build; cd build -cmake .. -DPYTHON_EXECUTABLE:FILEPATH=$(which python) # helpful if you use anaconda -make -j -``` - -# Stuff - -- Check out the [MNIST example](https://github.com/ebetica/autogradpp/blob/eee977ddd377c484af5fce09ae8676410bb6fcce/tests/integration_t.cpp#L320-L355), -which tries to replicate PyTorch's MNIST model + training loop -- The principled way to write a model is probably something like -``` -TORCH_AUTOGRAD_CONTAINER_CLASS(MyModel) { - // This does a 2D convolution, followed by global sum pooling, followed by a linear. - public: - void initialize_containers() override { - myConv_ = add(Conv2d(1, 50, 3, 3).stride(2).make(), "conv"); - myLinear_ = add(Linear(50, 1).make(), "linear"); - } - std::vector forward(std::vector x) override { - auto v = myConv_->forward(x); - v = v.mean(-1).mean(-1); - return myLinear_.forward({v}); - } - private: - Container myLinear_; - Container myConv_; -} -``` - -Some things are not implemented: -- SGD, Adagrad, RMSprop, and Adam are the only optimizers implemented -- Bidirectional, batch first, and PackedSequence are not implemented for LSTMs -- Sparse Tensors might work but are very untested - -Otherwise, lots of other things work. There may be breaking API changes. 
From 7535d98ec436cab718d12d83bd412223cbfc55c8 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Fri, 14 Sep 2018 10:48:50 -0700 Subject: [PATCH 160/237] Add message tag parameter to send/recv Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11490 Reviewed By: teng-li Differential Revision: D9828116 Pulled By: pietern fbshipit-source-id: 98be1ae84b6763ffb329e63c030c5e3ec0e748b7 --- test/test_c10d.py | 4 +-- test/test_distributed.py | 20 ++++++++++++ torch/csrc/distributed/c10d/init.cpp | 6 ++-- torch/distributed/distributed_c10d.py | 34 +++++++++++++-------- torch/lib/c10d/ProcessGroup.hpp | 9 ++++-- torch/lib/c10d/ProcessGroupGloo.cpp | 25 +++++++++++---- torch/lib/c10d/ProcessGroupGloo.hpp | 9 ++++-- torch/lib/c10d/ProcessGroupMPI.cpp | 15 +++++---- torch/lib/c10d/ProcessGroupMPI.hpp | 9 ++++-- torch/lib/c10d/ProcessGroupNCCL.cpp | 5 ++- torch/lib/c10d/ProcessGroupNCCL.hpp | 9 ++++-- torch/lib/c10d/test/ProcessGroupMPITest.cpp | 6 ++-- 12 files changed, 106 insertions(+), 45 deletions(-) diff --git a/test/test_c10d.py b/test/test_c10d.py index ff9d87be76c84f..0df1e3c749c6e6 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -379,14 +379,14 @@ def test_send_recv_all_to_all(self): for i in range(self.world_size): if i == self.rank: continue - send_work.append(pg.send([inputs[i]], i)) + send_work.append(pg.send([inputs[i]], i, 0)) # Issue recvs recv_work = [] for i in range(self.world_size): if i == self.rank: continue - recv_work.append(pg.recv([outputs[i]], i)) + recv_work.append(pg.recv([outputs[i]], i, 0)) # Wait for sends to complete for work in send_work: diff --git a/test/test_distributed.py b/test/test_distributed.py index 6dd0d984e842d3..f3250644bd621a 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -322,6 +322,26 @@ def test_send_recv_any_source(self): self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) self._barrier() + # SEND RECV WITH TAG + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") + def test_send_recv_with_tag(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + tensor = _build_tensor(10, value=rank) + + for dst in range(0, world_size): + if dst == rank: + # Recv mode + for src in range(0, world_size): + if src == rank: + continue + output_tensor = _build_tensor(10, value=-1) + dist.recv(output_tensor, src, tag=src) + self.assertTrue(output_tensor.eq(src).all()) + else: + # Send mode + dist.send(tensor, dst, tag=rank) + # ISEND @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") def test_isend(self): diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index d4f9e0f07b1d23..72bcac27cf8754 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -258,7 +258,8 @@ PyObject* c10d_init(PyObject* _unused) { "recv_anysource", [](::c10d::ProcessGroup& pg, std::vector& input, - at::Tensor& srcRankTensor) { + at::Tensor& srcRankTensor, + int tag) { if (srcRankTensor.type().scalarType() != at::kInt) { throw std::runtime_error( "source rank tensor needs to be " @@ -270,10 +271,11 @@ PyObject* c10d_init(PyObject* _unused) { "contain only one element"); } return pg.recvAnysource( - input, static_cast(srcRankTensor.data_ptr())); + input, static_cast(srcRankTensor.data_ptr()), tag); }, py::arg("tensors"), py::arg("src_rank"), + py::arg("tag"), py::call_guard()) .def( diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index d1a697e58288bb..651f2ea6f2f870 
100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -373,7 +373,8 @@ def get_world_size(group=group.WORLD): def isend(tensor, dst, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Sends a tensor asynchronously. @@ -381,6 +382,7 @@ def isend(tensor, tensor (Tensor): Tensor to send. dst (int): Destination rank. group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match send with remote recv Returns: A distributed request object. @@ -392,15 +394,16 @@ def isend(tensor, if group == GroupMember.WORLD: _check_default_pg() - return _default_pg.send([tensor], dst) + return _default_pg.send([tensor], dst, tag) else: group_dst_rank = _get_group_rank(group, dst) - return group.send([tensor], group_dst_rank) + return group.send([tensor], group_dst_rank, tag) def irecv(tensor, src, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Receives a tensor asynchronously. @@ -408,6 +411,7 @@ def irecv(tensor, tensor (Tensor): Tensor to fill with received data. src (int): Source rank. group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match recv with remote send Returns: A distributed request object. @@ -419,15 +423,16 @@ def irecv(tensor, if group == GroupMember.WORLD: _check_default_pg() - return _default_pg.recv([tensor], src) + return _default_pg.recv([tensor], src, tag) else: group_src_rank = _get_group_rank(group, src) - return group.recv([tensor], group_src_rank) + return group.recv([tensor], group_src_rank, tag) def send(tensor, dst, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Sends a tensor synchronously. @@ -435,6 +440,7 @@ def send(tensor, tensor (Tensor): Tensor to send. dst (int): Destination rank. group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match send with remote recv """ if _rank_not_in_group(group): @@ -442,15 +448,16 @@ def send(tensor, if group == GroupMember.WORLD: _check_default_pg() - _default_pg.send([tensor], dst).wait() + _default_pg.send([tensor], dst, tag).wait() else: group_dst_rank = _get_group_rank(group, dst) - group.send([tensor], group_dst_rank).wait() + group.send([tensor], group_dst_rank, tag).wait() def recv(tensor, src=None, - group=group.WORLD): + group=group.WORLD, + tag=0): """ Receives a tensor synchronously. @@ -459,6 +466,7 @@ def recv(tensor, src (int, optional): Source rank. Will receive from any process if unspecified. 
group (ProcessGroup, optional): The process group to work on + tag (int, optional): Tag to match recv with remote send Returns: Sender rank @@ -476,7 +484,7 @@ def recv(tensor, if src is None: rank_tensor = torch.IntTensor([-1]) - pg.recv_anysource([tensor], rank_tensor).wait() + pg.recv_anysource([tensor], rank_tensor, tag).wait() src_rank = rank_tensor[0].item() if group == GroupMember.WORLD: return src_rank @@ -484,10 +492,10 @@ def recv(tensor, return _get_global_rank(pg, src_rank) else: if group == GroupMember.WORLD: - pg.recv([tensor], src).wait() + pg.recv([tensor], src, tag).wait() else: group_src_rank = _get_group_rank(pg, src) - pg.recv([tensor], group_src_rank).wait() + pg.recv([tensor], group_src_rank, tag).wait() return src diff --git a/torch/lib/c10d/ProcessGroup.hpp b/torch/lib/c10d/ProcessGroup.hpp index 2ac2948457a7f6..3f80f72cf36382 100644 --- a/torch/lib/c10d/ProcessGroup.hpp +++ b/torch/lib/c10d/ProcessGroup.hpp @@ -115,15 +115,18 @@ class ProcessGroup { virtual std::shared_ptr send( std::vector& tensors, - int dstRank) = 0; + int dstRank, + int tag) = 0; virtual std::shared_ptr recv( std::vector& tensors, - int srcRank) = 0; + int srcRank, + int tag) = 0; virtual std::shared_ptr recvAnysource( std::vector& tensors, - int* srcRank) = 0; + int* srcRank, + int tag) = 0; virtual std::shared_ptr barrier() = 0; diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index e110a78001c134..4417364641efd2 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -707,17 +707,26 @@ at::Tensor& checkSingleTensor(std::vector& tensors) { return tensor; } +uint32_t checkTag(int32_t tag) { + if (tag < 0) { + throw std::runtime_error("Tag must be >= 0"); + } + return (uint32_t) tag; +} + std::shared_ptr ProcessGroupGloo::send( std::vector& tensors, - int dstRank) { + int dstRank, + int tag) { auto& tensor = checkSingleTensor(tensors); + auto utag = checkTag(tag); auto ptr = tensor.data_ptr(); auto size = tensor.numel() * tensor.type().elementSizeInBytes(); // Construct unbound buffer. auto& context = contexts_[0]; auto buf = context->createUnboundBuffer(ptr, size); - buf->send(dstRank, 0); + buf->send(dstRank, utag); // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the send. @@ -726,15 +735,17 @@ std::shared_ptr ProcessGroupGloo::send( std::shared_ptr ProcessGroupGloo::recv( std::vector& tensors, - int srcRank) { + int srcRank, + int tag) { auto& tensor = checkSingleTensor(tensors); + auto utag = checkTag(tag); auto ptr = tensor.data_ptr(); auto size = tensor.numel() * tensor.type().elementSizeInBytes(); // Construct unbound buffer. auto& context = contexts_[0]; auto buf = context->createUnboundBuffer(ptr, size); - buf->recv(srcRank, 0); + buf->recv(srcRank, utag); // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. 
@@ -743,8 +754,10 @@ std::shared_ptr ProcessGroupGloo::recv( std::shared_ptr ProcessGroupGloo::recvAnysource( std::vector& tensors, - int* srcRank) { + int* srcRank, + int tag) { auto& tensor = checkSingleTensor(tensors); + auto utag = checkTag(tag); auto ptr = tensor.data_ptr(); auto size = tensor.numel() * tensor.type().elementSizeInBytes(); @@ -761,7 +774,7 @@ std::shared_ptr ProcessGroupGloo::recvAnysource( srcRanks.push_back(i); } - buf->recv(srcRanks, 0); + buf->recv(srcRanks, utag); // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index e78b1e36f8b94e..773ad600d4402f 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -327,15 +327,18 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr send( std::vector& tensors, - int dstRank) override; + int dstRank, + int tag) override; std::shared_ptr recv( std::vector& tensors, - int srcRank) override; + int srcRank, + int tag) override; std::shared_ptr recvAnysource( std::vector& tensors, - int* srcRank) override; + int* srcRank, + int tag) override; std::shared_ptr barrier() override; diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 39e5e5b609c1e8..03699962c15b59 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -649,7 +649,8 @@ std::shared_ptr ProcessGroupMPI::scatter( std::shared_ptr ProcessGroupMPI::send( std::vector& tensors, - int dstRank) { + int dstRank, + int tag) { if (pgComm_ == MPI_COMM_NULL) { return nullptr; } @@ -666,7 +667,7 @@ std::shared_ptr ProcessGroupMPI::send( tensor.numel(), mpiDatatype.at(tensor.type().scalarType()), dstRank, - 0, + tag, pgComm_, &request)); } @@ -676,7 +677,8 @@ std::shared_ptr ProcessGroupMPI::send( std::shared_ptr ProcessGroupMPI::recv( std::vector& tensors, - int srcRank) { + int srcRank, + int tag) { if (pgComm_ == MPI_COMM_NULL) { return nullptr; } @@ -693,7 +695,7 @@ std::shared_ptr ProcessGroupMPI::recv( tensor.numel(), mpiDatatype.at(tensor.type().scalarType()), srcRank, - 0, + tag, pgComm_, &request)); } @@ -703,7 +705,8 @@ std::shared_ptr ProcessGroupMPI::recv( std::shared_ptr ProcessGroupMPI::recvAnysource( std::vector& tensors, - int* srcRank) { + int* srcRank, + int tag) { if (pgComm_ == MPI_COMM_NULL) { return nullptr; } @@ -720,7 +723,7 @@ std::shared_ptr ProcessGroupMPI::recvAnysource( tensor.numel(), mpiDatatype.at(tensor.type().scalarType()), MPI_ANY_SOURCE, - 0, + tag, pgComm_, &request)); } diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index 4a72570759ce91..5bd2b303c1a4e1 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -165,15 +165,18 @@ class ProcessGroupMPI : public ProcessGroup { std::shared_ptr send( std::vector& tensors, - int dstRank); + int dstRank, + int tag); std::shared_ptr recv( std::vector& tensors, - int srcRank); + int srcRank, + int tag); std::shared_ptr recvAnysource( std::vector& tensor, - int* srcRank); + int* srcRank, + int tag); std::shared_ptr barrier(); diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 157a9917f6803e..db3fcf5e211789 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -599,19 +599,22 @@ std::shared_ptr ProcessGroupNCCL::scatter( std::shared_ptr ProcessGroupNCCL::send( 
std::vector& /* unused */, + int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support send"); } std::shared_ptr ProcessGroupNCCL::recv( std::vector& /* unused */, + int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support recv"); } std::shared_ptr ProcessGroupNCCL::recvAnysource( std::vector& /* unused */, - int* /* unused */) { + int* /* unused */, + int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support recv"); } diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index d3862fc044321d..3eca7c4d95c535 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -130,15 +130,18 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr send( std::vector& tensors, - int dstRank) override; + int dstRank, + int tag) override; std::shared_ptr recv( std::vector& tensors, - int srcRank) override; + int srcRank, + int tag) override; std::shared_ptr recvAnysource( std::vector& tensors, - int* srcRank) override; + int* srcRank, + int tag) override; std::shared_ptr barrier() override; diff --git a/torch/lib/c10d/test/ProcessGroupMPITest.cpp b/torch/lib/c10d/test/ProcessGroupMPITest.cpp index 3a470fd1b88904..398bf31b9e06ae 100644 --- a/torch/lib/c10d/test/ProcessGroupMPITest.cpp +++ b/torch/lib/c10d/test/ProcessGroupMPITest.cpp @@ -318,7 +318,7 @@ void testSendRecv(bool recvAnysource, int iter = 10000) { std::vector> works; for (auto& tensors : allTensors) { // Kick off work - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->send(tensors, 1); + std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->send(tensors, 1, 0); works.push_back(std::move(work)); } for (auto& work : works) { @@ -337,11 +337,11 @@ void testSendRecv(bool recvAnysource, int iter = 10000) { for (auto& tensors : allTensors) { // Kick off work if (!recvAnysource) { - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->recv(tensors, 0); + std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->recv(tensors, 0, 0); works.push_back(std::move(work)); } else { std::shared_ptr<::c10d::ProcessGroup::Work> work = - pg->recvAnysource(tensors, &srcRanks[i]); + pg->recvAnysource(tensors, &srcRanks[i], 0); works.push_back(std::move(work)); } ++i; From b90872c00ee159bc7e0fc936de8d58dcf1dc803b Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 14 Sep 2018 10:52:38 -0700 Subject: [PATCH 161/237] Get rid of default arguments for TH/THC factory functions. (#11673) Summary: This is causing codegen problems in caffe2, when we try to remove the circular Tensor/Type declarations. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11673 Differential Revision: D9819341 Pulled By: gchanan fbshipit-source-id: f2c2cd96e8a16f6de6aa4889e71b8a78e12e9256 --- aten/src/ATen/Declarations.cwrap | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 5a6687aaa53762..d45815c5b600c9 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -302,8 +302,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step ]] [[ name: _arange @@ -320,8 +319,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step - cname: arange arguments: - arg: THTensor* result @@ -1956,8 +1954,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - long steps ]] [[ name: _logspace @@ -1976,8 +1973,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - long steps ]] [[ name: histc From 4050770909166230baf33aa327ef6d4b6395b332 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Fri, 14 Sep 2018 12:59:52 -0500 Subject: [PATCH 162/237] Skip tests that depend on double datatype for MIOpen and in absence of the fallback fail. --- test/test_jit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_jit.py b/test/test_jit.py index 985a4f760466b7..76d4ee79638d0f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -6981,6 +6981,7 @@ def test_dcgan_models(self): self._test_dcgan_models(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) @@ -7103,11 +7104,13 @@ def test_mnist(self): self._test_mnist(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_mnist(self, device='cuda', check_export_import=False) @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() # MnistNet uses dropout, don't check its trace From 0c2648830fbb48f1d600b18e1e4d1a6be071336d Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Fri, 14 Sep 2018 11:53:48 -0700 Subject: [PATCH 163/237] Augment emit_nvtx to help connect backward-pass Function apply calls with their corresponding forward pass ops (#10881) Summary: Often, we find ourselves looking at some long-running kernel or emit_nvtx range on an nvvp profile and trying to connect it to the offending line in a training script. If the op is in the forward pass that's easy: ops are enqueued explicitly from the Python side, so tracking it down with manual nvtx ranges supplemented by the built-in emit_nvtx ranges is straightforward. If the op is in the backward pass, it's much more difficult. From the Python side, all you can do is wrap loss.backward() in an nvtx range, and if you also use emit_nvtx, the automatic ranges provide only local information. Right now, the only consistent way to connect backward-pass kernels to their associated forward-pass lines of Python is to understand your script line by line, and know exactly where in the backward pass you are. 
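To make that workflow concrete, a minimal sketch of the pattern described above (run under nvprof/nvvp; `model`, `x`, `target`, and `loss_fn` are placeholders for whatever the training script defines):

```python
import torch
from torch.cuda import nvtx

with torch.autograd.profiler.emit_nvtx():
    out = model(x)                # forward-pass ops get automatic nvtx ranges
    loss = loss_fn(out, target)
    nvtx.range_push("backward")   # manual range bracketing the backward pass
    loss.backward()
    nvtx.range_pop()
```

The sequence-number tags this PR adds (described next) then make it possible to match each backward Function's `apply()` range in that profile to the forward op that created it.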
This PR augments the existing nvtx machinery to bridge the gap between forward and backward, allowing connection of backward-pass Function apply calls to the forward-pass operations that required/created those Functions. The method is simple and surgical. During the forward pass, when running with emit_nvtx, the nvtx range for each function in VariableType is tagged with the current sequence number. During the backward pass, the nvtx range associated with each Function's operator() is tagged with that Function's stashed sequence number, which can be compared to "current sequence numbers" from the forward pass to locate the associated op. Double-backward is not a problem. If a backward pass with create_graph = True is underway, the relationship between backward and double-backward is conceptually the same as the relationship between forward and backward: The functions in VariableType still spit out current-sequence-number-tagged ranges, the Function objects they create still stash those sequence numbers, and in the eventual double-backward execution, their operator() ranges are still tagged with the stashed numbers, which can be compared to "current sequence numbers" from the backward pass. Minor caveats: - The sequence number is thread-local, and many VariableType functions (specifically, those without a derivative explicitly defined in derivatives.yaml) don't create an associated function object (instead delegating that to sub-functions further down the call chain, perhaps called from within at::native functions that route back through VariableType by calling at::function_name). So the correspondence of stashed sequence numbers in Function operator() ranges with numbers in forward-pass ranges is not guaranteed to be 1 to 1. However, it's still a vast improvement over the current situation, and I don't think this issue should be a blocker. - Feel free to litigate my use of stringstream in profiler.cpp. I did it because it was easy and clean. If that's too big a hammer, let's figure out something more lightweight. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10881 Differential Revision: D9833371 Pulled By: apaszke fbshipit-source-id: 1844f2e697117880ef5e31394e36e801d1de6088 --- tools/autograd/gen_variable_type.py | 2 +- torch/autograd/profiler.py | 45 +++++++++++++++++++++++++++++ torch/csrc/autograd/function.cpp | 4 +++ torch/csrc/autograd/function.h | 2 ++ torch/csrc/autograd/profiler.cpp | 21 ++++++++++++-- torch/csrc/autograd/profiler.h | 4 ++- 6 files changed, 73 insertions(+), 5 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 86dd5fcb98bda0..d09a07a7b550c4 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -124,7 +124,7 @@ """) RECORD_FUNCTION = CodeTemplate("""\ -profiler::RecordFunction profiler("${name}");""") +profiler::RecordFunction profiler("${name}", Function::peek_at_next_sequence_nr());""") PRE_RECORD_TRACE = CodeTemplate("""\ torch::jit::Node* node = nullptr; diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index c1be47ad494397..97a0fa8e97d85f 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -249,6 +249,51 @@ class emit_nvtx(object): ... model(x) # Warmup CUDA memory allocator and profiler ... with torch.autograd.profiler.emit_nvtx(): ... 
model(x)
+
+    **Forward-backward correlation**
+
+    When viewing a profile created using :class:`emit_nvtx` in the Nvidia Visual Profiler,
+    correlating each backward-pass op with the corresponding forward-pass op can be difficult.
+    To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
+    generates.
+
+    During the forward pass, each function range is decorated with ``seq=<N>``. ``seq`` is a running
+    counter, incremented each time a new backward Function object is created and stashed for backward.
+    Thus, the `seq=<N>` annotation associated with each forward function range tells you that
+    if a backward Function object is created by this forward function,
+    the backward object will receive sequence number N.
+    During the backward pass, the top-level range wrapping each C++ backward Function's
+    ``apply()`` call is decorated with ``stashed seq=<M>``. ``M`` is the sequence number that
+    the backward object was created with. By comparing ``stashed seq`` numbers in backward with ``seq``
+    numbers in forward, you can track down which forward op created each backward Function.
+
+    Any functions executed during the backward pass are also decorated with ``seq=<N>``. During
+    default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
+    ``N`` may simply be 0 for all such functions. Only the top-level ranges associated with
+    backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
+    objects with the earlier forward pass.
+
+    **Double-backward**
+
+    If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
+    if you are setting up for a double-backward), each function's execution during backward
+    is given a nonzero, useful ``seq=<N>``. Those functions may themselves create Function objects
+    to be executed later during double-backward, just as the original functions in the forward pass did.
+    The relationship between backward and double-backward is conceptually the same as the relationship
+    between forward and backward: The functions still emit current-sequence-number-tagged ranges,
+    the Function objects they create still stash those sequence numbers, and during the eventual
+    double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
+    numbers, which can be compared to `seq` numbers from the backward pass.
+
+    .. warning:
+        The sequence number is thread-local, and some forward functions don't create an associated
+        backward Function object (instead delegating that to sub-functions further down the call chain).
+        For these reasons, the correspondence of stashed sequence numbers in
+        backward Function ``apply()`` ranges with `seq` numbers in forward-pass ranges is
+        not guaranteed to be 1 to 1. The sequence numbers alone may not be enough to fully
+        disambiguate which forward function created which
+        backward Function object. You may need to make a judgment based on analytic knowledge of what
+        the expected correspondence should be.
     """
     def __init__(self, enabled=True):
         self.enabled = enabled
diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp
index ba33a6f116bac2..57fe29810c4c8f 100644
--- a/torch/csrc/autograd/function.cpp
+++ b/torch/csrc/autograd/function.cpp
@@ -21,6 +21,10 @@ namespace torch { namespace autograd {
 /// numbers.
 thread_local uint64_t Function_next_sequence_nr_ = 0;
 
+uint64_t Function::peek_at_next_sequence_nr() {
+  return Function_next_sequence_nr_;
+}
+
 uint64_t& Function::get_next_sequence_nr() {
   return Function_next_sequence_nr_;
 }
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index dfe8683197f8ee..7b01b4e7e112f9 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -315,6 +315,8 @@ struct TORCH_API Function : std::enable_shared_from_this<Function> {
     return nullptr;
   }
 
+  static uint64_t peek_at_next_sequence_nr();
+
  protected:
   static uint64_t& get_next_sequence_nr();
 
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 66ff6d1a09c7c0..77c884e9b71b64 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -1,6 +1,8 @@
 #include "torch/csrc/autograd/profiler.h"
 #include "torch/csrc/autograd/function.h"
 
+#include <sstream>
+
 namespace torch { namespace autograd { namespace profiler {
 
 ProfilerState state = ProfilerState::Disabled;
@@ -37,13 +39,19 @@ void mark(std::string name, bool include_cuda /* = true */) {
   }
 }
 
-void pushRange(std::string name) {
+void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= -1*/) {
   if (state == ProfilerState::Disabled) {
     return;
   }
   if (state == ProfilerState::NVTX) {
 #ifdef USE_CUDA
-    nvtxRangePushA(name.c_str());
+    if(sequence_nr >= 0) {
+      std::stringstream s;
+      s << name << msg << sequence_nr;
+      nvtxRangePushA(s.str().c_str());
+    }
+    else
+      nvtxRangePushA(name.c_str());
 #else
     throw std::logic_error(
         "pushRange called with NVTX tracing, but compiled without CUDA");
@@ -95,6 +103,13 @@ RecordFunction::RecordFunction(const char* name) {
   pushRange(name);
 }
 
+RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
+{
+  if (state == ProfilerState::Disabled)
+    return;
+  pushRange(name, ", seq=", current_sequence_nr);
+}
+
 RecordFunction::~RecordFunction() {
   if (state == ProfilerState::Disabled)
     return;
@@ -102,7 +117,7 @@ RecordFunction::~RecordFunction() {
 }
 
 void RecordFunction::pushFunctionRange(Function* fn) {
-  pushRange(fn->name());
+  pushRange(fn->name(), ", stashed seq=", fn->sequence_nr());
 }
 
 #ifdef USE_CUDA
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index ba0fee1510baa2..29dc1044fb2f10 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -166,7 +166,7 @@ enum class ProfilerState {
 TORCH_API RangeEventList& getEventList();
 
 TORCH_API void mark(std::string name, bool include_cuda = true);
-TORCH_API void pushRange(std::string name);
+TORCH_API void pushRange(std::string name, const char* msg = "", int64_t sequence_nr = -1);
 TORCH_API void popRange();
 
 struct TORCH_API RecordFunction {
@@ -176,6 +176,8 @@ struct TORCH_API RecordFunction {
 
   explicit RecordFunction(const char* name);
 
+  explicit RecordFunction(const char* name, int64_t current_sequence_nr);
+
   ~RecordFunction();
 
   // Needed only because we don't have Function defined yet.
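An illustrative usage sketch for the workflow described in the patch above, assuming a CUDA build; the model, input, and range names are hypothetical placeholders, and the pattern of wrapping warmup plus emit_nvtx inside torch.cuda.profiler.profile() follows the docstring example:

    import torch
    from torch.autograd import profiler
    from torch.cuda import nvtx

    model = torch.nn.Linear(128, 64).cuda()   # hypothetical model
    x = torch.randn(32, 128, device='cuda')   # hypothetical input

    with torch.cuda.profiler.profile():
        model(x)  # warm up the CUDA memory allocator and profiler
        with profiler.emit_nvtx():
            nvtx.range_push('forward')
            loss = model(x).sum()
            nvtx.range_pop()

            nvtx.range_push('backward')
            loss.backward()
            nvtx.range_pop()

In the resulting nvvp timeline, ops recorded inside the manual 'forward' range carry seq=<N> annotations, and the corresponding backward Function apply() ranges inside the 'backward' range carry stashed seq=<N>, which is the forward/backward pairing this patch enables.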
From 224e62bbec2b00719f38a2a2fb2191e40120bc4a Mon Sep 17 00:00:00 2001 From: Anders Papitto Date: Fri, 14 Sep 2018 12:16:29 -0700 Subject: [PATCH 164/237] respect USE_CUDA_STATIC_LINK in build_libtorch.py Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11713 Differential Revision: D9835972 Pulled By: anderspapitto fbshipit-source-id: 046363b132e5487c05ef7e6e6d88b508196386a1 --- tools/build_libtorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index df29875666063e..db698a24128ea9 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -23,6 +23,8 @@ command = [build_pytorch_libs, '--use-nnpack'] if USE_CUDA: command.append('--use-cuda') + if os.environ.get('USE_CUDA_STATIC_LINK', False): + command.append('--cuda-static-link') if options.use_cereal: command.append('--use-cereal') command.append('caffe2') From 70e68e755a8fdd94218e9b8e0abfb6b805354a08 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Fri, 14 Sep 2018 13:29:06 -0700 Subject: [PATCH 165/237] Casting for binary ops (#11708) Summary: Fixes #11663 `TensorIterator` was replacing the op tensors with type casted tensors which ended up producing side effects in binary ops like `a.float() * b` where `a` and `b` are `LongTensor`s. colesbury ezyang apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11708 Differential Revision: D9834016 Pulled By: driazati fbshipit-source-id: 4082eb9710b31dfc741161a0fbdb9a8eba8fe39d --- aten/src/ATen/native/TensorIterator.cpp | 3 ++- aten/src/ATen/native/TensorIterator.h | 1 + test/test_torch.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index bae2a94b86273b..15d86fb5162a3e 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -98,7 +98,8 @@ void TensorIterator::compute_common_type() { if (op.tensor->defined() && type != op.tensor->type()) { if (op.tensor->dim() == 0) { if (type.backend() != at::Backend::CUDA) { - *op.tensor = op.tensor->toType(type); + cast_tensors_.emplace_back(op.tensor->toType(type)); + op.tensor = &(cast_tensors_.back()); } } else { op.needs_cast = true; diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 245866373d4763..3faedbec6bb320 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -184,6 +184,7 @@ struct AT_API TensorIterator { DimVector shape_; DimVector perm_; SmallVector operands_; + SmallVector cast_tensors_; int num_outputs_ = 0; bool has_coalesced_dimensions_ = false; }; diff --git a/test/test_torch.py b/test/test_torch.py index 260bfe3d42e8e7..b90e0631d4f470 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8861,6 +8861,20 @@ def test_cuda_not_built(self): self.assertRaisesRegex(AssertionError, msg, lambda: torch.cuda.FloatTensor()) self.assertRaisesRegex(AssertionError, msg, lambda: torch.tensor([1]).to(device="cuda")) + def test_cast_binary_op(self): + # Scalar + a = torch.tensor(2) + b = torch.tensor(3) + a_copy = a.clone() + b_copy = b.clone() + + self.assertEqual(torch.tensor(6), a.float() * b) + + self.assertEqual(a.type(), a_copy.type()) + self.assertEqual(a.data.type(), a_copy.data.type()) + self.assertEqual(b.type(), b_copy.type()) + self.assertEqual(b.data.type(), b_copy.type()) + # Functions to test negative dimension wrapping METHOD = 1 From 
96d3f968ebcbbcff5a7d3adf1ceb357e78ca672a Mon Sep 17 00:00:00 2001 From: Mike Ruberry <38511765+mruberry@users.noreply.github.com> Date: Fri, 14 Sep 2018 13:53:09 -0700 Subject: [PATCH 166/237] Splits CPU and CUDA fusion compilers (#10981) Summary: This PR splits the CPU and CUDA fusion compilers, putting them into a new jit/fusers/ directory with jit/fusers/common for common components. In particular: - A fusion interface is created that allows "fusion handles" to be requested - The CPU and CUDA fusers implement this interface, with dispatch determined by device - The fusion compilers, fusion function specializations and resource strings are split - CPU-specific classes like TempFile and DynamicLibrary are in the CPU fuser - Common classes likes TensorDesc and the base fusion function class are in jit/fusers/common - There is still some specialization in jit/fusers/common, but these specializations are small(-ish) - Updates the build system to remove the dummy interface on Windows and minimize the use of macros This structure should allow in-flight PRs to easily rebase while providing a clear interface to the fusers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10981 Reviewed By: soumith Differential Revision: D9701999 Pulled By: apaszke fbshipit-source-id: 3b6bec7b97e0444b2a93caa38d9b897f2e68c1b3 --- .gitignore | 1 + test/test_jit.py | 15 +- tools/amd_build/build_pytorch_amd.py | 2 +- torch/CMakeLists.txt | 30 +- torch/__init__.py | 2 +- torch/csrc/jit/fusers/Config.h.in | 4 + .../csrc/jit/fusers/common/annotated_graph.h | 26 + torch/csrc/jit/fusers/common/fused_kernel.cpp | 556 ++++++ torch/csrc/jit/fusers/common/fused_kernel.h | 88 + .../csrc/jit/fusers/common/fusion_arg_spec.h | 46 + .../jit/fusers/common/fusion_handle_impl.cpp | 400 ++++ .../jit/fusers/common/fusion_handle_impl.h | 72 + torch/csrc/jit/fusers/common/partition_desc.h | 48 + torch/csrc/jit/fusers/common/tensor_desc.cpp | 20 + torch/csrc/jit/fusers/common/tensor_desc.h | 83 + torch/csrc/jit/fusers/common/tensor_info.h | 25 + torch/csrc/jit/fusers/cpu/dynamic_library.h | 42 + torch/csrc/jit/fusers/cpu/fused_kernel.cpp | 95 + torch/csrc/jit/fusers/cpu/fused_kernel.h | 45 + torch/csrc/jit/fusers/cpu/fusion_compiler.cpp | 85 + torch/csrc/jit/fusers/cpu/fusion_compiler.h | 55 + torch/csrc/jit/fusers/cpu/interface.h | 32 + torch/csrc/jit/fusers/cpu/resource_strings.h | 56 + torch/csrc/jit/fusers/cpu/temp_file.h | 70 + torch/csrc/jit/fusers/cuda/fused_kernel.cpp | 105 ++ torch/csrc/jit/fusers/cuda/fused_kernel.h | 59 + .../csrc/jit/fusers/cuda/fusion_compiler.cpp | 71 + torch/csrc/jit/fusers/cuda/fusion_compiler.h | 44 + torch/csrc/jit/fusers/cuda/interface.h | 32 + torch/csrc/jit/fusers/cuda/resource_strings.h | 197 ++ torch/csrc/jit/fusers/interface.cpp | 81 + torch/csrc/jit/fusers/interface.h | 37 + torch/csrc/jit/fusion_compiler.cpp | 1615 ----------------- torch/csrc/jit/fusion_compiler.h | 188 -- torch/csrc/jit/init.cpp | 4 +- torch/csrc/jit/interpreter.cpp | 1 - torch/csrc/jit/passes/graph_fuser.cpp | 4 +- torch/csrc/jit/python_interpreter.cpp | 1 - torch/csrc/jit/register_prim_ops.cpp | 8 +- torch/csrc/jit/test_jit.cpp | 10 +- torch/jit/__init__.py | 1 + 41 files changed, 2534 insertions(+), 1822 deletions(-) create mode 100644 torch/csrc/jit/fusers/Config.h.in create mode 100644 torch/csrc/jit/fusers/common/annotated_graph.h create mode 100644 torch/csrc/jit/fusers/common/fused_kernel.cpp create mode 100644 torch/csrc/jit/fusers/common/fused_kernel.h create mode 100644 
torch/csrc/jit/fusers/common/fusion_arg_spec.h create mode 100644 torch/csrc/jit/fusers/common/fusion_handle_impl.cpp create mode 100644 torch/csrc/jit/fusers/common/fusion_handle_impl.h create mode 100644 torch/csrc/jit/fusers/common/partition_desc.h create mode 100644 torch/csrc/jit/fusers/common/tensor_desc.cpp create mode 100644 torch/csrc/jit/fusers/common/tensor_desc.h create mode 100644 torch/csrc/jit/fusers/common/tensor_info.h create mode 100644 torch/csrc/jit/fusers/cpu/dynamic_library.h create mode 100644 torch/csrc/jit/fusers/cpu/fused_kernel.cpp create mode 100644 torch/csrc/jit/fusers/cpu/fused_kernel.h create mode 100644 torch/csrc/jit/fusers/cpu/fusion_compiler.cpp create mode 100644 torch/csrc/jit/fusers/cpu/fusion_compiler.h create mode 100644 torch/csrc/jit/fusers/cpu/interface.h create mode 100644 torch/csrc/jit/fusers/cpu/resource_strings.h create mode 100644 torch/csrc/jit/fusers/cpu/temp_file.h create mode 100644 torch/csrc/jit/fusers/cuda/fused_kernel.cpp create mode 100644 torch/csrc/jit/fusers/cuda/fused_kernel.h create mode 100644 torch/csrc/jit/fusers/cuda/fusion_compiler.cpp create mode 100644 torch/csrc/jit/fusers/cuda/fusion_compiler.h create mode 100644 torch/csrc/jit/fusers/cuda/interface.h create mode 100644 torch/csrc/jit/fusers/cuda/resource_strings.h create mode 100644 torch/csrc/jit/fusers/interface.cpp create mode 100644 torch/csrc/jit/fusers/interface.h delete mode 100644 torch/csrc/jit/fusion_compiler.cpp delete mode 100644 torch/csrc/jit/fusion_compiler.h diff --git a/.gitignore b/.gitignore index e3e7acb8e310cb..b600e7515b9134 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* +torch/csrc/jit/fusers/Config.h torch/csrc/nn/THCUNN.cpp torch/csrc/nn/THCUNN.cwrap torch/csrc/nn/THNN_generic.cpp diff --git a/test/test_jit.py b/test/test_jit.py index b5e26b304c57db..b715045b30a27c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -205,6 +205,17 @@ def _construct_empty_tensor_list(): return [] +def enable_cpu_fuser(fn): + def wrapper(*args, **kwargs): + torch._C._jit_override_can_fuse_on_cpu(True) + try: + fn(*args, **kwargs) + except Exception: + torch._C._jit_override_can_fuse_on_cpu(False) + raise + return wrapper + + class JitTestCase(TestCase): _do_cuda_memory_leak_check = True _restored_warnings = False @@ -630,6 +641,7 @@ def test_lstm_fusion_cuda(self): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/8746") + @enable_cpu_fuser def test_lstm_fusion_cpu(self): inputs = get_lstm_inputs('cpu') try: @@ -1370,6 +1382,7 @@ def test_ge_unoptimized(self): self.run_ge_tests(False, False) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @enable_cpu_fuser def test_ge_optimized(self): self.run_ge_tests(True, False) @@ -3053,6 +3066,7 @@ def chunk_4_last(x): @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @skipIfRocm + @enable_cpu_fuser def test_chunk_fusion_correctness(self): return self._test_chunk_fusion_correctness(self, 'cpu') @@ -6928,7 +6942,6 @@ def forward(self, x): class TestEndToEndHybridFrontendModels(JitTestCase): - @staticmethod def _test_dcgan_models(self, device, check_export_import=True): class DCGANGenerator(nn.Module): diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index e9daea644e9cb2..d433ad50dbfae7 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ 
b/tools/amd_build/build_pytorch_amd.py @@ -41,7 +41,7 @@ # Make various replacements inside AMD_BUILD/torch directory ignore_files = ["csrc/autograd/profiler.h", "csrc/autograd/profiler.cpp", - "csrc/cuda/cuda_check.h", "csrc/jit/fusion_compiler.cpp"] + "csrc/cuda/cuda_check.h"] for root, _directories, files in os.walk(os.path.join(proj_dir, "torch")): for filename in files: if filename.endswith(".cpp") or filename.endswith(".h"): diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 04a72819717bad..7679f3d6467ba0 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -146,7 +146,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp ${TORCH_SRC_DIR}/csrc/jit/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/fusion_compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp ${TORCH_SRC_DIR}/csrc/jit/import.cpp @@ -176,6 +175,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/interface.cpp ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp @@ -190,6 +190,34 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp ) +SET(USE_CPU_FUSER 0) +if (NOT WIN32) + SET(USE_CPU_FUSER 1) + + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fusers/common/tensor_desc.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/common/fusion_handle_impl.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/common/fused_kernel.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/cpu/fusion_compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/cpu/fused_kernel.cpp + ) +endif() + +SET(USE_CUDA_FUSER 0) +if (USE_CUDA AND NOT USE_ROCM AND NOT WIN32) + SET(USE_CUDA_FUSER 1) + + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fusers/cuda/fusion_compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusers/cuda/fused_kernel.cpp + ) + +endif() + +CONFIGURE_FILE( + ${TORCH_SRC_DIR}/csrc/jit/fusers/Config.h.in + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/jit/fusers/Config.h) + if (NOT NO_API AND NOT USE_ROCM) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/utils.cpp diff --git a/torch/__init__.py b/torch/__init__.py index e494cdec6cbec1..c6298353cd8ab8 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -281,6 +281,7 @@ def manager_path(): import torch.cuda import torch.autograd +from torch.autograd import no_grad, enable_grad, set_grad_enabled import torch.nn import torch.optim import torch.multiprocessing @@ -293,7 +294,6 @@ def manager_path(): import torch.testing import torch.backends.cuda import torch.backends.mkl -from torch.autograd import no_grad, enable_grad, set_grad_enabled _C._init_names(list(torch._storage_classes)) diff --git a/torch/csrc/jit/fusers/Config.h.in b/torch/csrc/jit/fusers/Config.h.in new file mode 100644 index 00000000000000..0809591ca702e9 --- /dev/null +++ b/torch/csrc/jit/fusers/Config.h.in @@ -0,0 +1,4 @@ +#pragma once + +#define USE_CPU_FUSER @USE_CPU_FUSER@ +#define USE_CUDA_FUSER @USE_CUDA_FUSER@ diff --git a/torch/csrc/jit/fusers/common/annotated_graph.h b/torch/csrc/jit/fusers/common/annotated_graph.h new file mode 100644 index 00000000000000..bf8a6e862c4dac --- /dev/null +++ b/torch/csrc/jit/fusers/common/annotated_graph.h @@ -0,0 +1,26 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" 
+#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +struct AnnotatedGraph { + // short-term storage only, so it borrows Graph. + AnnotatedGraph(Graph& graph, int device) + : graph(&graph), device(device) {} + + Graph* graph = nullptr; // TODO: this should really be const + int device = kCPUDevice; + std::vector input_desc; + std::vector output_desc; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp new file mode 100644 index 00000000000000..2021b9fa1b832d --- /dev/null +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -0,0 +1,556 @@ +#include "torch/csrc/jit/fusers/common/fused_kernel.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/cpu/resource_strings.h" +#include "torch/csrc/jit/fusers/cuda/resource_strings.h" +#include "torch/csrc/jit/fusers/common/partition_desc.h" +#include "torch/csrc/jit/fusers/common/tensor_desc.h" +#include "torch/csrc/jit/fusers/common/tensor_info.h" + +#include "torch/csrc/jit/code_template.h" +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/assertions.h" + +#include "ATen/ATen.h" + +#if USE_CUDA_FUSER + #include "THC/THCTensorRandom.h" + #include "THC/THCGenerator.hpp" + THCGenerator* THCRandom_getGenerator(THCState* state); +#endif // USE_CUDA_FUSER + +#include +#include +#include +#include +#include + +namespace torch { namespace jit { + +// curDimIndex = linearId % sizes[i]; // % sizes[i] is not needed for d == 0, because we already guard for numel outside the index calculation +// offset += curDimIndex*strides[i]; // *strides[i] is optional if list_is_cont becaause strides.back() == 1 +// linearId /= sizes[i]; +auto dim_calc = CodeTemplate(R"( +//printf("tensor ${tensor} sizes[${d}] = %d, strides[${d}] = %d\n", ${tensor}.sizes[${d}],${tensor}.strides[${d}]); +size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes}; +${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride}; +)"); + +// XXX: this code assumes that inputs are 32-bit addressable +static uint32_t computeNumel(at::ArrayRef sizes) { + uint32_t result = 1; + if (sizes.size() == 0) { + return 1; // scalar tensor + } + for (int64_t size : sizes) { + result *= size; + } + return result; +} + +// XXX: Assumes that after at::chunk, all inputs are the same size +static std::vector computeMapSize( + const at::Tensor& tensor, + const PartitionDesc& chunkDesc) { + std::vector sizes(tensor.sizes().begin(), tensor.sizes().end()); + // Should have been checked in graph fuser + JIT_ASSERT(sizes[chunkDesc.dim] % chunkDesc.nSubtensors == 0); + sizes[chunkDesc.dim] /= chunkDesc.nSubtensors; + return sizes; +} + +// Tries to compress sizes and strides according to cont. Emits the result t +// c_sizes, c_strides and throws an error on failure (if can't compress) +static void compressContiguous( + at::IntList sizes +, at::IntList strides +, const std::vector & cont +, uint32_t* c_sizes +, uint32_t* c_strides) { + size_t compressed_dims = 0; + size_t cur = 0; + size_t ndim = sizes.size(); + while (cur < ndim) { + size_t total_size = sizes[cur]; + cur++; + while (cont[cur-1] && cur < ndim) { + JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]); + total_size *= sizes[cur]; + cur++; + } + // cur starts pointing at the beginning of run to compress + // cur ends one _after_ the terminating false or end of list. 
+ // total_size is the size of all dimensions [begin,end) + // examples: + // f = not cont. + // t = cont. + // x = don't care, including past end of list + // s = start of cur + // e = end of cur + + + // f x x x + // s e + + // t f x x + // s e + + // t t f x + // s e + + c_sizes[compressed_dims] = total_size; + c_strides[compressed_dims] = strides[cur-1]; + compressed_dims++; + } + if (ndim > 0) { + JIT_ASSERT(!cont.back() || strides.back() == 1); + } +} + +void FusedKernel::launch_with_tensors( + at::ArrayRef inputs +, at::ArrayRef outputs) { + at::DeviceGuard device_guard(inputs); + JIT_ASSERT(inputs.size() == input_desc.size()); + JIT_ASSERT(outputs.size() == output_desc.size()); + size_t flat_inputs_size = 0; + size_t flat_outputs_size = 0; + for (auto& c : chunk_desc) + flat_inputs_size += c.nSubtensors; + for (auto& c : concat_desc) + flat_outputs_size += c.nSubtensors; + // XXX: this code assumes that inputs are 32-bit addressable + // XXX: this code assumes that all inputs are of the same size + JIT_ASSERT(inputs[0].numel() <= std::numeric_limits::max()); + + // Compute map_size, numel from the first input + at::IntList map_size; + uint32_t numel; + std::vector keep_alive_size; + if (chunk_desc[0].isNoop()) { + map_size = inputs[0].sizes(); + numel = inputs[0].numel(); + } else { + keep_alive_size = computeMapSize(inputs[0], chunk_desc[0]); + map_size = keep_alive_size; + numel = computeNumel(map_size); + } + + // Compute the storage needed to store TensorInfo structs for inputs and outputs. + size_t uncompressedDim = input_desc.at(0).contiguity.size(); + size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim; + size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (flat_inputs_size + flat_outputs_size); + std::vector buffer(maxPossibleBufferSize); + char* buffer_next = buffer.data(); + // A vector of arguments to the kernel. It's (numel, *input_descs, *output_descs) + std::vector arguments; + arguments.reserve(3 + flat_inputs_size + flat_outputs_size); + auto addTensorInfoRaw = [&](TensorDesc & desc, void* data_ptr, at::IntList sizes, at::IntList strides) { + size_t nDim = desc.nDim(); // NOTE: this is the compressed dim + JIT_ASSERT(nDim <= uncompressedDim); // We'd overflow the space otherwise + auto ti = reinterpret_cast(buffer_next); + ti->data = data_ptr; + compressContiguous(sizes, strides, desc.contiguity, ti->sizes(nDim), ti->strides(nDim)); + buffer_next += maxPossibleTensorInfoSize; + arguments.push_back(ti); + }; + // Asserts that t's dims can be compressed in the same way as in desc + // (that's what the kernel assumes), and appends it to the arguments vector. 
+ auto addTensorInfo = [&](TensorDesc & desc, const at::Tensor & t) { + addTensorInfoRaw(desc, t.data_ptr(), t.sizes(), t.strides()); + }; + arguments.push_back(&numel); + for (size_t i = 0; i < input_desc.size(); ++i) { + auto & chunk = chunk_desc[i]; + const at::Tensor& tensor = inputs[i]; + if (chunk.isNoop()) { + addTensorInfo(input_desc[i], tensor); + } else { + size_t chunk_offset = map_size[chunk.dim] * tensor.stride(chunk.dim) * elementSize(tensor.type().scalarType()); + char * data_ptr = reinterpret_cast(tensor.data_ptr()); + for (size_t chunks = 0; chunks < chunk.nSubtensors; ++chunks) { + addTensorInfoRaw(*chunk.subtensorDesc, data_ptr, map_size, tensor.strides()); + data_ptr += chunk_offset; + } + } + } + for (size_t i = 0; i < output_desc.size(); ++i) { + auto & c = concat_desc[i]; + at::Tensor o = outputs[i]; + if (c.isNoop()) { + o.resize_(map_size); + addTensorInfo(output_desc[i], outputs[i]); + } else { + size_t small_size = map_size[c.dim]; + std::vector concat_size(map_size.begin(), map_size.end()); + concat_size[c.dim] = small_size * c.nSubtensors; + o.resize_(concat_size); + size_t offset = 0; + for(size_t j = 0; j < c.nSubtensors; ++j) { + // because the concatenated_output stays live, the underlying data + // in this view remains live through the end of this function + // so there is not need to hold onto this tensor + auto view = o.narrow(c.dim, offset, small_size); + addTensorInfo(*c.subtensorDesc, view); + offset += small_size; + } + } + } + + // If the kernel call contains a random op, we need to pass in random seeds as + // well. + #if USE_CUDA_FUSER + if (has_random && this->backend() == at::Backend::CUDA) { + auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); + uint64_t offset = + gen_->state.philox_seed_offset.fetch_add(this->get_rand_offset(numel)); + arguments.push_back(&gen_->state.initial_seed); + arguments.push_back(&offset); + } + #endif // USE_CUDA_FUSER + + launch_raw(numel, arguments.data()); +} + +void FusedKernel::launch( + at::ArrayRef inputs +, std::vector & outputs) { + at::DeviceGuard guard(inputs.back()); + JIT_ASSERT(inputs.size() > 0); + auto & ref_type = inputs[0].type(); + outputs.clear(); + outputs.reserve(outputDescriptors().size()); + for(auto & od : outputDescriptors()) { + outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor()); + } + + launch_with_tensors(inputs, outputs); +} + +static std::string valueName(Value * n) { + return "n" + std::to_string(n->unique()); +} + +static std::string scalarValue(int64_t v) { + return std::to_string(v); +} + +static std::string scalarValue(double v) { + std::ostringstream out; + out << std::scientific << v << "f"; + return out.str(); +} + +static const char * scalarTypeName(at::ScalarType type) { + if (type == at::ScalarType::Half) { + return "half"; + } + + switch(type) { + #define DEFINE_CASE(ctype,name,_) \ + case at::ScalarType::name: return #ctype; + AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(DEFINE_CASE) + #undef DEFINE_CASE + default: + throw std::runtime_error("unknown scalar type"); + } +} + +static std::string encodeRHS(Node* n) { + static std::unordered_map simple_map_ops = { + // unary + {aten::abs, "absf(${0})"}, + {aten::sigmoid, "1.f / (1.f + expf(-${0}))"}, + {aten::relu, "${0} < 0 ? 
0.f : ${0} "}, + {aten::log, "logf(${0})"}, + {aten::log10, "log10f(${0})"}, + {aten::log1p, "log1pf(${0})"}, + {aten::log2, "log2f(${0})"}, + {aten::lgamma, "lgammaf(${0})"}, + {aten::exp, "expf(${0})"}, + {aten::expm1, "expm1f(${0})"}, + {aten::cos, "cosf(${0})"}, + {aten::acos, "acosf(${0})"}, + {aten::cosh, "coshf(${0})"}, + {aten::sin, "sinf(${0})"}, + {aten::asin, "asinf(${0})"}, + {aten::sinh, "sinhf(${0})"}, + {aten::tan, "tanf(${0})"}, + {aten::atan, "atanf(${0})"}, + {aten::tanh, "tanhf(${0})"}, + {aten::sqrt, "sqrtf(${0})"}, + {aten::rsqrt, "rsqrtf(${0})"}, + {aten::ceil, "ceilf(${0})"}, + {aten::floor, "floorf(${0})"}, + {aten::round, "roundf(${0})"}, + {aten::trunc, "truncf(${0})"}, + {aten::frac, "fracf(${0})"}, + {aten::reciprocal, "reciprocalf(${0})"}, + {aten::neg, "-${0}"}, + //simple binary + {aten::atan2, "atan2(${0}, ${1})"}, + {aten::min, "fminf(${0}, ${1})"}, + {aten::max, "fmaxf(${0}, ${1})"}, + + //binary with other + // TODO: some of these ops will not get generated because + // we only work on float inputs/outputs, but they are here to record + // that they are valid mappable ops once we handle more type + {aten::__and__, "${0} && ${1}"}, + {aten::__lshift__, "${0} << ${1}"}, + {aten::__or__, "${0} || ${1}"}, + {aten::__rshift__, "${0} >> ${1}"}, + {aten::__xor__, "${0} ^ ${1}"}, + {aten::div, "${0} / ${1}"}, + {aten::eq, "${0} == ${1}"}, + {aten::fmod, "fmodf(${0}, ${1})"}, + {aten::ge, "(${0} >= ${1})"}, + {aten::gt, "${0} > ${1}"}, + {aten::le, "(${0} <= ${1})"}, + {aten::lt, "${0} < ${1}"}, + {aten::type_as, "(${0})"}, //everything is implicitly convertible to float + {aten::mul, "${0} * ${1}"}, + {aten::ne, "${0} != ${1}"}, + {aten::remainder, "remainderf(${0}, ${1})"}, + {aten::pow, "powf(${0}, ${1})"}, + + //alpha + {aten::add, "${0} + ${2}*${1}"}, + {aten::sub, "(${0} - ${2}*${1})"}, + {aten::rand_like, "uniform(rnd())"}, + + // simple derivatives + {aten::_sigmoid_backward, "${0} * ${1} * (1.f - ${1})"}, + {aten::_tanh_backward, "${0} * (1.f - ${1} * ${1})"}, + }; + + if (n->kind() == prim::Constant) { + auto val = toIValue(n->output()).value(); + if (val.isDouble()) { + return scalarValue(val.toDouble()); + } else { + JIT_ASSERT(val.isInt()); + return scalarValue(val.toInt()); + } + } + + TemplateEnv env; + size_t i = 0; + for(auto in : n->inputs()) { + env.s(std::to_string(i++), valueName(in)); + } + + const auto & str = simple_map_ops.at(n->kind()); + return format(str, env); +} + +static Node* usedInFusedChunk(Value* input) { + auto uses = input->uses(); + if (uses.size() == 1) { + Node *user = uses[0].user; + if (user->kind() == prim::ConstantChunk) { + return user; + } + } + return nullptr; +} + +static void emitIndexingFor( + std::ostream& out +, const std::string& tensor +, int ndim +, bool last_is_cont) { + TemplateEnv env; + env.s("tensor",tensor); + out << format("IndexType ${tensor}_offset = 0;\n",env); + out << format("IndexType ${tensor}_linearIndex = linearIndex;\n",env); + for (int d = ndim - 1; d >= 0; --d) { + env.d("d",d); + env.s("mod_sizes", d > 0 ? format("% ${tensor}.sizes[${d}]",env) : ""); + env.s("times_stride",(d < ndim - 1 || !last_is_cont) ? 
+ format("* ${tensor}.strides[${d}]",env) : ""); + out << dim_calc.format(env); + if (d > 0) { + out << format("${tensor}_linearIndex /= ${tensor}.sizes[${d}];\n",env); + } + } +} + +// Returns: (input chunk metadata, output concat metadata, is_random) +std::tuple< + std::vector + , std::vector + , bool> + emitCompilationUnit( + std::ostream& out + , const std::string& name + , AnnotatedGraph& agraph + , bool use_cuda) { + bool has_random = false; + Graph& subgraph = *agraph.graph; + TemplateEnv env; + env.s("kernelName", name); + // TODO: handle cases where we need to generate > 2^32 element tensors + env.s("IndexType","unsigned int"); //avoiding slow header includes to get uint32_t + + std::stringstream body; + std::stringstream tensorOffsets; + std::vector formals; + std::vector argument_loads; + auto emitFormal = [&](Value * n, const TensorDesc & desc) { + std::string tensor = "t" + std::to_string(formals.size()); //can't be unique() because Param may be an output + size_t nDim = desc.nDim(); + emitIndexingFor(tensorOffsets, tensor, nDim, desc.lastIsContiguous()); + env.s("tensor",tensor); + env.d("formal_index", formals.size() + 1); // + 1 because the first argument is the linearIndex + env.d("nDim",nDim); + env.s("scalar_type",scalarTypeName(desc.scalar_type)); + formals.push_back(format("TensorInfo<${scalar_type},${nDim}> ${tensor}", env)); + argument_loads.push_back(format("*static_cast*>(args[${formal_index}])", env)); + }; + + std::vector chunk_desc; + std::vector> flat_inputs; + { + size_t input_index = 0; + for(auto p : subgraph.inputs()) { + if (Node * chunk = usedInFusedChunk(p)) { + int64_t dim = chunk->i(attr::dim); + int64_t chunks = chunk->i(attr::chunks); + chunk_desc.emplace_back(agraph.input_desc[input_index++], chunks, dim); + for (auto * o : chunk->outputs()) { + flat_inputs.emplace_back(o, *chunk_desc.back().subtensorDesc); + } + } else { + chunk_desc.emplace_back(); + flat_inputs.emplace_back(p, agraph.input_desc[input_index++]); + } + } + for (auto & input : flat_inputs) { + emitFormal(input.first, input.second); + } + } + + std::vector concat_desc; + std::vector> flat_output_nodes; + { + size_t i = 0; + for(auto o : subgraph.outputs()) { + auto & desc = agraph.output_desc[i++]; + if(o->node()->kind() != prim::FusedConcat) { + emitFormal(o, desc); + concat_desc.emplace_back(); + flat_output_nodes.emplace_back(o, desc); + } else { + auto cat = o->node(); + concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); + for(auto c : cat->inputs()) { + emitFormal(c, *concat_desc.back().subtensorDesc); + flat_output_nodes.emplace_back(c, desc); + } + } + } + } + + #if USE_CUDA_FUSER + bool has_half_tensor = false; + #endif // USE_CUDA_FUSER + size_t formal_count = 0; + for(auto input : flat_inputs) { + auto p = input.first; + env.s("node", valueName(p)); + env.d("formal", formal_count++); + + // Acquires and converts (if needed) inputs + bool is_half = input.second.scalar_type == at::ScalarType::Half; + if (is_half) { + AT_ASSERT(use_cuda); + #if USE_CUDA_FUSER + env.s( + "access" + , format("__half2float(t${formal}.data[t${formal}_offset])", env)); + has_half_tensor = true; + #endif // USE_CUDA_FUSER + } else { + env.s("access", format("t${formal}.data[t${formal}_offset]", env)); + } + + //TODO: actual type propagation rather than relying on auto.. 
+ body << format("auto ${node} = ${access};\n", env); + } + + for (auto n : subgraph.nodes()) { + // FusedConcat nodes work by narrowing the output Tensors before the kernel runs + if (n->kind() == prim::FusedConcat) + continue; + if (n->kind() == prim::ConstantChunk) + continue; + if (n->kind() == aten::rand_like) { + has_random = true; + if (!use_cuda) + throw std::runtime_error("Fusion doesn't support rand on CPU"); + } + env.s("node",valueName(n->output())); + env.s("rhs", encodeRHS(n)); + body << format("auto ${node} = ${rhs};\n",env); + } + + for (auto output : flat_output_nodes) { + auto o = output.first; + env.d("formal",formal_count++); + env.s("access",format("t${formal}.data[t${formal}_offset]",env)); + env.s("node",valueName(o)); + + // Acquires and converts (if needed) outputs + bool is_half = output.second.scalar_type == at::ScalarType::Half; + if (is_half) { + AT_ASSERT(use_cuda); + #if USE_CUDA_FUSER + body << format("${access} = __float2half(${node});\n",env); + has_half_tensor = true; + #endif // USE_CUDA_FUSER + } else { + body << format("${access} = ${node};\n",env); + } + } + + // Includes half support if any half tensors are involved + #if USE_CUDA_FUSER + if (has_half_tensor) { + env.s("HalfHeader", cudafuser::half_support_literal); + } else { + env.s("HalfHeader", ""); + } + + if (has_random) { + env.s("RandHeader", cudafuser::rand_support_literal); + env.s("RandParam", cudafuser::rand_param); + env.s("RandInit", cudafuser::rand_init); + } else { + env.s("RandHeader", ""); + env.s("RandParam", ""); + env.s("RandInit", ""); + } + #endif // USE_CUDA_FUSER + + env.s("tensorOffsets", tensorOffsets.str()); + env.s("kernelBody", body.str()); + env.v("formals", formals); + env.v("argument_loads", argument_loads); + if (use_cuda) { + #if USE_CUDA_FUSER + env.s("type_declarations", cudafuser::type_declarations_template.format(env)); + out << cudafuser::cuda_compilation_unit_template.format(env); + #else + throw std::runtime_error("CUDA Fusion requested but not supported."); + #endif // USE_CUDA_FUSER + } else { + env.s("type_declarations", cpufuser::type_declarations_template.format(env)); + out << cpufuser::cpu_compilation_unit_template.format(env); + } + + return std::make_tuple(std::move(chunk_desc), std::move(concat_desc), has_random); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/common/fused_kernel.h b/torch/csrc/jit/fusers/common/fused_kernel.h new file mode 100644 index 00000000000000..fbdf5d60d0a8cd --- /dev/null +++ b/torch/csrc/jit/fusers/common/fused_kernel.h @@ -0,0 +1,88 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/annotated_graph.h" +#include "torch/csrc/jit/fusers/common/tensor_desc.h" +#include "torch/csrc/jit/fusers/common/partition_desc.h" + +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + +#include +#include +#include + +namespace torch { namespace jit { + +std::tuple, std::vector, bool> emitCompilationUnit( + std::ostream& out +, const std::string& name +, AnnotatedGraph& agraph +, bool use_cuda); + +struct FusedKernel { + TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel); + + FusedKernel( + const std::string& name + , AnnotatedGraph& agraph) + : name{name} + , input_desc{agraph.input_desc} + , output_desc{agraph.output_desc} { } + + virtual ~FusedKernel() = default; + + // expects outputs to be pre-allocated + void launch_with_tensors( + at::ArrayRef inputs + , at::ArrayRef outputs); + + // creates new tensors 
for outputs + void launch( + at::ArrayRef inputs + , std::vector& outputs); + + const std::vector& outputDescriptors() const { + return output_desc; + } + +protected: + + virtual at::Backend backend() const = 0; + + // arguments is a list of pointers to the arguments for the compiled CUDA/CPU + // code. + // The format of arguments is suitable for directly passing to a call to + // cuLaunchKernel as the kernel arguments. + // Currently the first argument is a pointer to numel (for passing to + // CUDA code), and the remainder are pointers to the TensorInfo structs + // that compiled code uses to load Tensor data. + // launch_with_tensors handles packing at::Tensors into this arguments array. + // CPU code uses the same convension so that launch_with_tensors can be shared. + virtual void launch_raw(uint32_t numel, void** arguments) = 0; + + virtual uint64_t get_rand_offset(uint32_t numel) = 0; + bool has_random; + std::string name; + // We keep these around for debugging + std::string compilation_unit; + std::vector input_desc; + std::vector output_desc; + + // same size as output_desc, describes whether + // an output is actually a concatenation of + // many subtensors that the fusion group produces + std::vector concat_desc; + + // same size as input_desc, describes whether an + // input should be broken into subtensors (chunks) + // to be consumed by the fusion group + std::vector chunk_desc; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/fusion_arg_spec.h b/torch/csrc/jit/fusers/common/fusion_arg_spec.h new file mode 100644 index 00000000000000..b8780f8ada9e77 --- /dev/null +++ b/torch/csrc/jit/fusers/common/fusion_arg_spec.h @@ -0,0 +1,46 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/utils/functional.h" // fmap +#include "torch/csrc/utils/hash.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace torch { namespace jit { + +struct FusionArgSpec { + FusionArgSpec(at::TensorList inputs) + : descs_(fmap(inputs)) + , hash_code_(torch::get_hash(inputs.size(), descs_)) {} + + bool operator==(const FusionArgSpec& spec) const { + return hash_code_ == spec.hash_code_ && descs_ == spec.descs_; + } + + bool operator!=(const FusionArgSpec& spec) const { + return !(*this == spec); + } + + static size_t hash(const FusionArgSpec& spec) { + return spec.hash_code_; + } + + const std::vector& descs() const { + return descs_; + } + +private: + std::vector descs_; + size_t hash_code_; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/fusion_handle_impl.cpp b/torch/csrc/jit/fusers/common/fusion_handle_impl.cpp new file mode 100644 index 00000000000000..052a08cac93ede --- /dev/null +++ b/torch/csrc/jit/fusers/common/fusion_handle_impl.cpp @@ -0,0 +1,400 @@ +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_arg_spec.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" +#include "torch/csrc/jit/fusers/common/tensor_desc.h" +#include "torch/csrc/jit/fusers/cpu/fused_kernel.h" +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" +#include "torch/csrc/jit/fusers/cuda/fused_kernel.h" + +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" +#include 
"torch/csrc/jit/custom_operator.h" + +#include "torch/csrc/utils/functional.h" //fmap + +#include "ATen/ATen.h" +#include "ATen/ExpandUtils.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { + +//////////////////////////////////////////////////////////////////////////////// +// FusedKernelCache + +// Note [Run-time shape checking code] +// There are multiple assumptions that our codegen makes, which we can't check +// in the fusion pass, because we don't have the shape information. Most notably, +// that all values (post-input-chunk, and pre-output-concat) have the same shape +// (hereinafter referred to as map size). One way to check this would be to run +// shape propagation for every size configuration we get as an input, but that +// requires a full graph traversal, and might incur unnecessary overhead. The code +// below uses a few nice properties of broadcasting rules and their interactions with +// pointwise operations, and takes a smarter approach, to quickly verify validity of +// the kernel. +// +// Notation: +// - a.s when a is a tensor is a shorthand for a.shape. +// - B is a shorthand for the broadcasting/expanding function. It is used as a +// vararg function. +// - E is a shorthand for expand function. +// - Every pointwise operation can be equivalently rewritten as +// f(a, b) = f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), +// where f^ is a non-broadcasting verison of f. +// - A set of inputs that are used to produce a certain graph output is referred to +// as the output's broadcasting group (see Lemma 2. for explanation why). +// +// Lemma 1. Set of lists of integers (shapes) + { _|_ (bottom/error marker) }, with the +// operation of broadcasting (returning bottom upon shape mismatch) forms a monoid. +// In simpler terms: broadcasting is associative, i.e. B(a, B(b, c)) == B(B(a, b), c). +// +// Proof. Satisfies all monoid laws: +// - Closed under broadcasting (trivial) +// - Empty shape is the identity element: B(a, []) == B([], a) == a +// - Associativity: A simple visual proof is that you can expand 3 tensors +// at the same time by stacking their sizes (with alignment to the right), +// just as you'd do in the case of 2 tensors, but with an intermediate +// (the algorithm ends up being pretty much the same). +// +// Lemma 2. Shape of an output of an arbitrary DAG of pointwise ops depends only on the set +// of inputs used in this DAG and is equal to B([i.shape for i in used_inputs]). +// +// Proof. Let G be any DAG of pointwise ops and < be any valid topological +// ordering on nodes of G. Proof by induction over <. +// Base case (graph input): +// Trivial (input is also an output). +// Step (n = f(q, r)): +// Let QS (RS) be the set of shapes of inputs that q (r) depends on. +// Note that the set of inputs that n depends on is exactly QS + RS. +// shape(n) == shape(f(q, r)) +// (def of f) +// == shape(f^(E(q, B(q.s, r.s)), E(r, B(q.s, r.s)))) +// (output shape of f^ is equal to either of argument shapes) +// == shape(E(q, B(q.s, r.s))) +// (property of expand) +// == B(q.s, r.s) +// (induction assumption) +// == B(B(QS...), B(RS...)) +// (Lemma 1.) +// == B(QS..., RS...) +// (repeated shapes don't matter for broadcasting) +// == B((QS + RS)...) +// +// Lemma 3. Expands are distributive over pointwise ops, i.e. E(f(a, b), s) = f(E(a, s), E(b, s)) +// Lemma 4. Expands can be collapsed, i.e. E(E(x, s1), s2) = E(x, B(s1, s2)). +// Proof. A simple exercise for the reader :) +// +// Theorem. 
If all (pre-concat-)outputs have equal shapes, then we can push the expands to +// (post-chunk-)inputs, and have all intermediates of the same shape +// (no broadcasting happening in the body). +// +// Proof. Using the above lemmas we can easily show that a graph with a single output +// can be easily rewritten by taking the shape given by B applied to all input +// shapes, expanding inputs to it, and using only non-broadcasting operations. +// Example: +// +// let d = f(a, b) in +// let e = h(b, c) in +// g(d, e) +// +// (By def. of broadcasting pointwise ops applied to g, f and h) +// (Lemma 2. for a closed formula for the size of g = gs) +// +// let gs = B(a.s, b.s, c.s) in +// let d' = E(f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), gs) in +// let e' = E(h^(E(b, B(b.s, c.s)), E(c, B(b.s, c.s))), gs) in +// g^(d', e') +// +// (Lemma 3.) +// +// let gs = B(a.s, b.s, c.s) in +// let d' = f^(E(E(a, B(a.s, b.s)), gs), E(E(b, B(a.s, b.s)), gs)) in +// let e' = h^(E(E(b, B(b.s, c.s)), gs), E(E(c, B(b.s, c.s)), gs)) in +// g^(d', e') +// +// (Lemma 4. + Lemma 1. to simplify broadcasting function) +// +// let gs = B(a.s, b.s, c.s) in +// let d' = f^(E(a, gs), E(b, gs)) in +// let e' = h^(E(b, gs), E(c, gs)) in +// g^(d', e') +// +// (Simple rewrite) +// +// let gs = B(a.s, b.s, c.s) in +// let a' = E(a, gs) in +// let b' = E(b, gs) in +// let c' = E(c, gs) in +// let d' = f^(a', b') in +// let e' = h^(b', c') in +// g^(d', e') +// +// This example can be easily formalized to arbitrary DAGs using induction +// over topological ordering, similar to Lemma 2. Now, if broadcasting groups +// for all outputs have the same shape, then performing an expand to this size +// on all inputs will ensure that all intermediates on all paths to outputs +// will have the same shape, proving that the body of the kernel is valid. +// +// This shows the part until post-chunk-inputs. Extending it to pre-chunk-inputs +// is straightforward (needs a simple lemma for moving expands through chunks). + +// Register implementations of fused operators, so that we can reuse the fused graph +// to generate fallback code. 
+RegisterOperators reg_fused_operators({ + Operator( + prim::FusedConcat, + [](Node* node) { + int64_t dim = node->i(attr::dim); + int64_t num_inputs = node->inputs().size(); + return [dim, num_inputs](Stack& stack) { + auto result = at::cat( + fmap(last(stack, num_inputs), [](const IValue& i) { return i.toTensor(); }), + dim + ); + drop(stack, num_inputs); + pack(stack, std::move(result)); + return 0; + }; + }) +}); + +FusionHandleImpl::FusionHandleImpl( + std::shared_ptr _graph +, int device) +: device(device) +, fallback_code(_graph) +, graph(std::move(_graph)) +, input_broadcast_groups(getInputBroadcastGroups()) +, input_chunks(getInputChunkDescriptors()) +, kernels() { } + +std::atomic FusionHandleImpl::next_kernel_id {0}; + +static Node* usedInFusedChunk(Value* input) { + auto uses = input->uses(); + if (uses.size() == 1) { + Node *user = uses[0].user; + if (user->kind() == prim::ConstantChunk) { + return user; + } + } + return nullptr; +} + +auto FusionHandleImpl::getInputChunkDescriptors() -> std::vector { + std::vector descs; + descs.reserve(graph->inputs().size()); + for (Value* input : graph->inputs()) { + if (Node* chunk = usedInFusedChunk(input)) { + descs.emplace_back(chunk->i(attr::chunks), chunk->i(attr::dim)); + } else { + descs.emplace_back(1, 0); + } + } + return descs; +} + +// NB: this vector is really a set, but we want to keep it contiguous in memory for faster access +static std::vector getInputDependencies(Value* output) { + // Run a DFS traversal to find all inputs that affect a given output value + std::vector queue { output }; + std::unordered_set inputs; + std::unordered_set seen; + while (!queue.empty()) { + Value* val = queue.back(); queue.pop_back(); + Node* producer = val->node(); + if (producer->kind() == prim::Param) { + inputs.insert(val); + continue; + } + for (Value* input : producer->inputs()) { + if (/*bool inserted = */seen.insert(input).second) { + queue.push_back(input); + } + } + } + + // Convert Value* into offsets into the graph's input list + std::vector offsets; + offsets.reserve(inputs.size()); + for (Value* input : inputs) { + offsets.push_back(input->offset()); + } + + std::sort(offsets.begin(), offsets.end()); + return offsets; +} + +// See Note [Run-time shape checking code] for more explanation on the algorithm. 
+at::optional> FusionHandleImpl::canRunKernel(at::TensorList args) { + AT_CHECK(args.size() == input_chunks.size(), + "Expected ", input_chunks.size(), " arguments, but got ", args.size()); + + at::optional> map_size; + for (const auto & broadcast_group : input_broadcast_groups) { + if (!map_size) { + map_size = getMapSize(args, broadcast_group); + if (!map_size) { + return at::nullopt; + } + } else { + auto group_map_size = getMapSize(args, broadcast_group); + // NB: this checks that group_map_size is defined AND equal to map_size + if (map_size != group_map_size) { + return at::nullopt; + } + } + } + return map_size; +} + +std::unique_ptr FusionHandleImpl::compileSpec( + const FusionArgSpec& spec +, const std::vector& map_size) { + AnnotatedGraph agraph{*graph, device}; + + agraph.input_desc = spec.descs(); + // XXX: this assumes that fused kernels only operate on floating-point values inside + at::optional scalar_type; + for (TensorDesc& desc : agraph.input_desc) { + if (isFloatingType(desc.scalar_type)) { + scalar_type = desc.scalar_type; + break; + } + } + JIT_ASSERT(scalar_type); + + for (Value * output : graph->outputs()) { + std::vector sizes = map_size; + if (output->node()->kind() == prim::FusedConcat) { + sizes.at(output->node()->i(attr::dim)) *= output->node()->inputs().size(); + } + auto type = CompleteTensorType::create(*scalar_type, device, sizes); + agraph.output_desc.emplace_back(std::move(type)); + } + + std::string name = "kernel_" + std::to_string(next_kernel_id++); + FusedKernel* raw_func; + if (device != kCPUDevice) { + #if USE_CUDA_FUSER + raw_func = new cudafuser::CUDAFusedKernel(name, agraph); + #else + throw std::runtime_error("CUDA Fusion is not supported on this build."); + #endif // USE_CUDA_FUSER + } else { + raw_func = new cpufuser::CPUFusedKernel( + name + , agraph + , cpufuser::getFusionCompiler().getConfig()); + } + return std::unique_ptr(raw_func); +} + +// NB: args are mutated in this call. map_size is mutated too, but is restored to its original +// value before this function returns (it's an optimization). 
+void FusionHandleImpl::expandArgs(std::vector& args, std::vector& map_size) { + for (size_t i = 0; i < args.size(); ++i) { + auto& arg = args[i]; + auto& pdesc = input_chunks[i]; + if (pdesc.nSubtensors == 1) { + if (arg.sizes().equals(map_size)) continue; + arg = arg.expand(map_size); + } else { + map_size.at(pdesc.dim) *= pdesc.nSubtensors; + if (!arg.sizes().equals(map_size)) { + arg = arg.expand(map_size); + } + map_size.at(pdesc.dim) /= pdesc.nSubtensors; + } + } +} + +std::vector> FusionHandleImpl::getInputBroadcastGroups() { + std::unordered_set, torch::hash>> broadcast_groups; + for (Value* output : graph->outputs()) { + broadcast_groups.insert(getInputDependencies(output)); + } + return std::vector>{broadcast_groups.begin(), broadcast_groups.end()}; +} + +void FusionHandleImpl::run(Stack& stack) { + int64_t num_inputs = graph->inputs().size(); + auto args = fmap(last(stack, num_inputs), [](const IValue& i) { + return i.toTensor(); + }); + + auto maybe_map_size = canRunKernel(args); + if (!maybe_map_size) { + return runFallback(stack); + } + expandArgs(args, *maybe_map_size); + + FusionArgSpec spec{args}; + auto it = kernels.find(spec); + if (it == kernels.end()) { + std::tie(it, std::ignore) = kernels.emplace(spec, compileSpec(spec, *maybe_map_size)); + } + auto& fn = it->second; + + std::vector outputs; + fn->launch(args, outputs); + drop(stack, num_inputs); + stack.insert( + stack.end() + , std::make_move_iterator(outputs.begin()) + , std::make_move_iterator(outputs.end())); +} + +at::optional> FusionHandleImpl::getMapSize( + at::TensorList args +, at::IntList arg_subset) { + int64_t dim_after_broadcast = 0; + for (int64_t arg_idx : arg_subset) { + dim_after_broadcast = std::max(dim_after_broadcast, args[arg_idx].dim()); + } + // TODO: this keeps reallocating map_size at every iteration, but we know + // exactly how much storage do we need, so this could be fixed in-place at + // every step. We're just missing a few functions for ATen, but the fix + // should be straightforward. + // NB: we leave this uninitialized, because an empty size is trivially + // broadcastable to any other size. 
+ std::vector map_size; + for (size_t i = 0; i < arg_subset.size(); ++i) { + auto& arg = args.at(arg_subset[i]); + auto& chunk_desc = input_chunks.at(arg_subset[i]); + if (chunk_desc.nSubtensors == 1) { + try { + map_size = at::infer_size(map_size, arg.sizes()); + } catch (std::exception& e) { + return at::nullopt; + } + } else { + auto tensor_sizes = arg.sizes().vec(); + int64_t num_chunks = chunk_desc.nSubtensors; + int64_t dim = at::maybe_wrap_dim(chunk_desc.dim, tensor_sizes.size()); + if (tensor_sizes[dim] % num_chunks != 0) { + return at::nullopt; + } + tensor_sizes[dim] /= num_chunks; + try { + map_size = at::infer_size(map_size, tensor_sizes); + } catch (std::exception& e) { + return at::nullopt; + } + } + } + + return {map_size}; +} + +void FusionHandleImpl::runFallback(Stack& stack) { + InterpreterState(fallback_code).runOneStage(stack); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/common/fusion_handle_impl.h b/torch/csrc/jit/fusers/common/fusion_handle_impl.h new file mode 100644 index 00000000000000..cfc601ca385d21 --- /dev/null +++ b/torch/csrc/jit/fusers/common/fusion_handle_impl.h @@ -0,0 +1,72 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_arg_spec.h" +#include "torch/csrc/jit/fusers/common/fused_kernel.h" + +#include "torch/csrc/jit/stack.h" +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" + +#include "ATen/ATen.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { + +// FusionCompiler has very limited shape information available at the time getOrCompile +// is called, and this is why it can't really prepare the kernels at that time. Instead, +// it returns this object, which will take care of matching the run-time shapes to whatever +// kernels we have compiled already. +// +// Two configurations are considered eligible for the same fused kernel if: +// - the shapes satisfy graph invariants for our fused code (e.g. that all intermediate shapes +// are the same - see fusion_compiler.cpp for more details). 
+// - their FusionArgSpecs compare equal +struct FusionHandleImpl : public FusionHandle { + FusionHandleImpl( + std::shared_ptr _graph + , int device); + + void run(Stack& inputs); + +private: + struct PartitionInfo { + PartitionInfo(int64_t nsub, int64_t dim) + : nSubtensors(nsub), dim(dim) { }; + + int64_t nSubtensors; + int64_t dim; + }; + + void runFallback(Stack& stack); + void expandArgs(std::vector& args, std::vector& map_size); + at::optional> canRunKernel(at::TensorList args); + at::optional> getMapSize(at::TensorList args, at::IntList arg_subset); + std::vector> getInputBroadcastGroups(); + std::vector getInputChunkDescriptors(); + std::unique_ptr compileSpec( + const FusionArgSpec& spec, const std::vector& map_size); + + static std::atomic next_kernel_id; + + int device; + Code fallback_code; + std::shared_ptr graph; + std::vector> input_broadcast_groups; + std::vector input_chunks; + std::unordered_map< + FusionArgSpec + , std::unique_ptr + , torch::hash> kernels; +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/partition_desc.h b/torch/csrc/jit/fusers/common/partition_desc.h new file mode 100644 index 00000000000000..b9825155f55ca1 --- /dev/null +++ b/torch/csrc/jit/fusers/common/partition_desc.h @@ -0,0 +1,48 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/jit/assertions.h" + +#include +#include +#include + +namespace torch { namespace jit { + +// Descriptor for chunk-ing an input tensor into subtensors +// OR concat-ing an output tensor from subtensors +struct PartitionDesc { + + PartitionDesc() + : nSubtensors(1), dim(0) {} + + PartitionDesc(const TensorDesc& desc, size_t nSubtensors, size_t dim) + : nSubtensors(nSubtensors), dim(dim) { + JIT_ASSERT(nSubtensors > 1); + std::vector cont = desc.contiguity; + if(dim > 0) { + // when we narrow the concatenated output/chunked input + // we make the size[dim] smaller while keeping the stride[dim] the same, + // meaning: stride[dim - 1] != stride[dim]*size[dim] + // so dim - 1 is no longer contiguous + cont[dim - 1] = false; + } + subtensorDesc.reset(new TensorDesc(desc.scalar_type, cont)); + } + + bool isNoop() const { + return nSubtensors == 1; + } + + size_t nSubtensors; // == 1 for tensors that should not be operated on via chunk/cat + size_t dim; // dimension along which the chunk/concat occurs + std::unique_ptr subtensorDesc; // descriptor for the subtensor, if it exists +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/tensor_desc.cpp b/torch/csrc/jit/fusers/common/tensor_desc.cpp new file mode 100644 index 00000000000000..5b1de58e3129dc --- /dev/null +++ b/torch/csrc/jit/fusers/common/tensor_desc.cpp @@ -0,0 +1,20 @@ +#include "torch/csrc/jit/fusers/common/tensor_desc.h" + +#include "torch/csrc/jit/assertions.h" + +namespace torch { namespace jit { + +std::vector TensorDesc::findContiguous( + const at::IntList& sizes +, const at::IntList& strides) { + JIT_ASSERT(sizes.size() == strides.size()); + std::vector cont(sizes.size()); + for(size_t i = 0; i < sizes.size(); ++i) { + int64_t expected_stride = (i + 1 < sizes.size()) ? 
sizes[i+1]*strides[i+1] : 1; + cont[i] = strides[i] == expected_stride; + } + return cont; +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/common/tensor_desc.h b/torch/csrc/jit/fusers/common/tensor_desc.h new file mode 100644 index 00000000000000..ff5d640bfd6cbb --- /dev/null +++ b/torch/csrc/jit/fusers/common/tensor_desc.h @@ -0,0 +1,83 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/type.h" + +#include "torch/csrc/utils/hash.h" + +#include "ATen/ATen.h" + +#include +#include +#include + +namespace torch { namespace jit { + +// type information needed by the compiler for input/outputs +// contiguity[i] is true if the dim i is contiguous with dim i + 1. +// contiguity.back() == true means strides.back() == 1. +struct TensorDesc { + at::ScalarType scalar_type; + std::vector contiguity; + + TensorDesc(const at::ScalarType& type, const std::vector& contiguity) + : scalar_type{type}, contiguity{contiguity} { + if (contiguity.size() == 0) { + nDim_ = 0; + } else { + nDim_ = std::count(contiguity.begin(), contiguity.end(), false) + (lastIsContiguous() ? 1 : 0); + } + } + + TensorDesc(const at::ScalarType& type, const at::IntList& sizes, const at::IntList& strides) + : TensorDesc(type, TensorDesc::findContiguous(sizes, strides)) {} + + TensorDesc(const at::Tensor& t) + : TensorDesc(t.type().scalarType(), t.sizes(), t.strides()) {} + + TensorDesc(CompleteTensorTypePtr type) + : TensorDesc(type->scalarType(), type->sizes(), type->strides()) {} + + // number of dimensions after contiguity compression + size_t nDim() const { + return nDim_; + } + + // do we have inner stride == 1? + bool lastIsContiguous() const { + return contiguity.size() == 0 || contiguity.back(); + } + + static std::vector findContiguous( + const at::IntList& sizes, + const at::IntList& strides); + + bool operator==(const TensorDesc & desc) const { + return scalar_type == desc.scalar_type && contiguity == desc.contiguity; + } + + bool operator!=(const TensorDesc & desc) const { + return !(*this == desc); + } + + static size_t hash(const TensorDesc& spec) { + return torch::get_hash(spec.scalar_type, spec.nDim_, std::hash>{}(spec.contiguity)); + } + +private: + size_t nDim_; +}; + +inline std::ostream& operator<<(std::ostream& out, const TensorDesc& d) { + out << d.scalar_type << "["; + for (auto b : d.contiguity) + out << b << ";"; + out << "]"; + return out; +} + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/common/tensor_info.h b/torch/csrc/jit/fusers/common/tensor_info.h new file mode 100644 index 00000000000000..9798a44bd877e8 --- /dev/null +++ b/torch/csrc/jit/fusers/common/tensor_info.h @@ -0,0 +1,25 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER || USE_CUDA_FUSER +#pragma once + +#include + +namespace torch { namespace jit { + +// Host-side view of TensorInfo (that visivle for the kernel is defined above). +// Note dims[0] - we need to dynamically allocate the dims. 
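TensorDesc::findContiguous above flags dim i as contiguous when stride[i] == size[i+1] * stride[i+1] (and the last dim when its stride is 1); nDim() then counts how many dimensions survive after merging contiguous runs. A standalone check of that rule on two concrete layouts (plain vectors, no ATen; not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>

// Same rule as TensorDesc::findContiguous: dim i is "contiguous" when
// stride[i] == size[i+1] * stride[i+1]; the last dim when its stride is 1.
std::vector<bool> findContiguous(const std::vector<int64_t>& sizes,
                                 const std::vector<int64_t>& strides) {
  std::vector<bool> cont(sizes.size());
  for (size_t i = 0; i < sizes.size(); ++i) {
    const int64_t expected =
        (i + 1 < sizes.size()) ? sizes[i + 1] * strides[i + 1] : 1;
    cont[i] = (strides[i] == expected);
  }
  return cont;
}

int main() {
  // Dense {2, 3, 4} tensor: strides {12, 4, 1} -> all flags true,
  // so nDim() compresses it to a single dimension (0 breaks + contiguous tail).
  for (bool b : findContiguous({2, 3, 4}, {12, 4, 1})) std::cout << b << ' ';
  std::cout << '\n';  // prints: 1 1 1

  // Every-other-element view in the last dim: strides {12, 4, 2} ->
  // {true, true, false}; one break and a non-unit tail, so nDim() is still 1,
  // but the compressed stride is 2 rather than 1.
  for (bool b : findContiguous({2, 3, 2}, {12, 4, 2})) std::cout << b << ' ';
  std::cout << '\n';  // prints: 1 1 0
}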
+struct TensorInfo { + + uint32_t* sizes(size_t nDim) { return &sizes_strides[0]; } + uint32_t* strides(size_t nDim) { return &sizes_strides[nDim]; } + + void* data; + #pragma GCC diagnostic ignored "-Wpedantic" + uint32_t sizes_strides[0]; + #pragma GCC diagnostic pop +}; + +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER || USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/cpu/dynamic_library.h b/torch/csrc/jit/fusers/cpu/dynamic_library.h new file mode 100644 index 00000000000000..ec16698a75b960 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/dynamic_library.h @@ -0,0 +1,42 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "dlfcn.h" + +namespace torch { namespace jit { namespace cpufuser { + +static void* checkDL(void* x) { + if (!x) { + AT_ERROR("error in dlopen or dlsym: ", dlerror()); + } + + return x; +} + +struct DynamicLibrary { + TH_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); + + DynamicLibrary(const char* name) { + handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW)); + } + + void* sym(const char* name) { + JIT_ASSERT(handle); + return checkDL(dlsym(handle, name)); + } + + ~DynamicLibrary() { + if (!handle) return; + dlclose(handle); + } + +private: + void* handle = nullptr; +}; + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/fused_kernel.cpp b/torch/csrc/jit/fusers/cpu/fused_kernel.cpp new file mode 100644 index 00000000000000..277ec7b0c20975 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fused_kernel.cpp @@ -0,0 +1,95 @@ +#include "torch/csrc/jit/fusers/cpu/fused_kernel.h" + +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" +#include "torch/csrc/jit/fusers/cpu/temp_file.h" +#include "torch/csrc/jit/fusers/cpu/dynamic_library.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" + +#include "torch/csrc/jit/assertions.h" +#include "torch/csrc/jit/code_template.h" + +#include +#include +#include +#include +#include + + +namespace torch { namespace jit { namespace cpufuser { + +static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so"; +static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp"; + +// NB: -march=native not supported on PPC64 g++. It's a bit annoying +// to do a configure-style test to decide whether or not the g++ +// actually supports it or not, so we heuristically use the host +// compiler to predict if the runtime compiler supports the option we +// want. This probably won't work if you're cross-compiling. +// NB: -march=native is disabled because it has caused problems where +// compiler and assembler do not agree on what native instruction they +// understand for AVX512. When we need better CPU performance this +// optimization can be re-enabled by tracking down the platforms where +// this error occurs and only selectively disabling it. +static const std::string compile_string = + "\"${cxx}\" -O3 -g " +#ifndef __PPC64__ +// "-march=native " +#endif + "-std=c++11 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm"; + +static void runCompiler( + CPUFusionCompilerConfig& config +, const std::string& cpp_file +, const std::string& so_file) { + TemplateEnv env; + env.s("cxx", config.cxx); + env.s("fopenmp", config.openmp ? 
"-fopenmp" : ""); + env.s("cpp_file",cpp_file); + env.s("so_file",so_file); + std::string result = format(compile_string, env); + int r = system(result.c_str()); + if (config.openmp && r != 0) { + std::cerr << "warning: pytorch jit fuser failed to compile with openmp, trying without it...\n"; + config.openmp = false; // disable for future compiles + return runCompiler(config, cpp_file, so_file); + } + JIT_ASSERTM(r == 0, "Failed to compile a fused CPU kernel"); +} + +static const std::string disas_string = + "objdump -M intel -d \"${so_file}\""; +static void disas(const std::string& so_file) { + TemplateEnv env; + env.s("so_file", so_file); + std::string cmd = format(disas_string, env); + int r = system(cmd.c_str()); + JIT_ASSERT(r == 0); +} + +CPUFusedKernel::CPUFusedKernel( + const std::string& name +, AnnotatedGraph& agraph +, CPUFusionCompilerConfig& config) +: FusedKernel(name, agraph) { + TempFile so_file(so_template, 3); + TempFile cpp_file(cpp_template, 4); + + std::stringstream cu; + std::tie(chunk_desc, concat_desc, has_random) = emitCompilationUnit(cu, name, agraph, false); + JIT_ASSERT(!has_random); + compilation_unit = cu.str(); + cpp_file.write(compilation_unit); + cpp_file.sync(); + runCompiler(config, cpp_file.name(), so_file.name()); + if (config.debug) { + disas(so_file.name()); + } + so_lib.reset(new DynamicLibrary(so_file.name().c_str())); + #pragma GCC diagnostic ignored "-Wpedantic" + kernel = reinterpret_cast(so_lib->sym(name.c_str())); + #pragma GCC diagnostic pop +} + +} // namespace cpufuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cpu/fused_kernel.h b/torch/csrc/jit/fusers/cpu/fused_kernel.h new file mode 100644 index 00000000000000..d32f4c351733bf --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fused_kernel.h @@ -0,0 +1,45 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" +#include "torch/csrc/jit/fusers/cpu/dynamic_library.h" +#include "torch/csrc/jit/fusers/common/fused_kernel.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" + +#include "ATen/ATen.h" + +#include +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +struct CPUFusedKernel : public ::torch::jit::FusedKernel { + CPUFusedKernel( + const std::string& name + , AnnotatedGraph& agraph + , CPUFusionCompilerConfig& config); + +protected: + virtual at::Backend backend() const override { + return at::Backend::CPU; + } + + virtual uint64_t get_rand_offset(uint32_t numel) override { + return numel; + } + + virtual void launch_raw(uint32_t numel, void** arguments) override { + kernel(numel, arguments); + } + + std::unique_ptr so_lib; + void (*kernel)(uint32_t, void**) = nullptr; +}; + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/fusion_compiler.cpp b/torch/csrc/jit/fusers/cpu/fusion_compiler.cpp new file mode 100644 index 00000000000000..9285ae4caa179b --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fusion_compiler.cpp @@ -0,0 +1,85 @@ +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/passes/shape_analysis.h" // EraseShapeInformation +#include "torch/csrc/utils/functional.h" //fmap +#include "torch/csrc/jit/ivalue.h" // IValue +#include "torch/csrc/jit/code_template.h" +#include "torch/csrc/jit/assertions.h" + +#include 
+#include +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +CPUFusionCompiler& getFusionCompiler() { + static CPUFusionCompiler compiler; + return compiler; +} + +static const std::string check_exists_string = "which '${program}' > /dev/null"; +static bool programExists(const std::string& program) { + TemplateEnv env; + env.s("program", program); + std::string cmd = format(check_exists_string, env); + return 0 == system(cmd.c_str()); +} + +CPUFusionCompiler::CPUFusionCompiler() { + const char* cxx_env = getenv("CXX"); + if (cxx_env != nullptr) { + config_.cxx = cxx_env; + } + + if (!programExists(config_.cxx)) { + config_.cxx = ""; + } + + const char* debug_env = getenv("PYTORCH_FUSION_DEBUG"); + config_.debug = debug_env && atoi(debug_env) != 0; +} + +std::shared_ptr CPUFusionCompiler::getFusionHandle(Node* fusion_group) { + int device = fusion_group->i(attr::device); + JIT_ASSERT(device == kCPUDevice); + auto graph = fusion_group->g(attr::Subgraph)->copy(); + EraseShapeInformation(*graph); + std::stringstream key; + key << "device " << device << "\n"; + key << *graph << "\n"; + std::string key_ = key.str(); + auto it = cache_map.find(key_); + if (it == cache_map.end()) { + std::tie(it, std::ignore) = cache_map.emplace(key_, std::make_shared(graph, device)); + } + return it->second; +} + +std::vector CPUFusionCompiler::debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + auto wrapper_graph = std::make_shared(); + Node* fusion_group = wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device)); + fusion_group->g_(attr::Subgraph, graph.copy()); + for (size_t i = 0; i < graph.inputs().size(); ++i) { + fusion_group->addInput(wrapper_graph->addInput()); + } + for (size_t i = 0; i < graph.outputs().size(); ++i) { + wrapper_graph->registerOutput(fusion_group->addOutput()); + } + auto cache = getFusionHandle(fusion_group); + Stack stack = fmap(inputs); + cache->run(stack); + return fmap(stack, [](const IValue& iv) { return iv.toTensor(); }); +} + + + +} // namespace cpufuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cpu/fusion_compiler.h b/torch/csrc/jit/fusers/cpu/fusion_compiler.h new file mode 100644 index 00000000000000..d41fbc7de4b550 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/fusion_compiler.h @@ -0,0 +1,55 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +struct CPUFusionCompilerConfig { + std::string cxx = "g++"; // compiler location + bool debug = false; // emit debugging information about fusions + bool openmp = true; +}; + +struct CPUFusionCompiler { + TH_DISALLOW_COPY_AND_ASSIGN(CPUFusionCompiler); + + CPUFusionCompiler(); + + ~CPUFusionCompiler() = default; + + std::shared_ptr getFusionHandle(Node* fusion_group); + + std::vector debugLaunchGraph( + Graph& graph + , int device + , at::ArrayRef inputs); + + CPUFusionCompilerConfig& getConfig() { + return config_; + } + +private: + CPUFusionCompilerConfig config_; + std::unordered_map> cache_map; +}; + +CPUFusionCompiler& getFusionCompiler(); + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/interface.h 
b/torch/csrc/jit/fusers/cpu/interface.h new file mode 100644 index 00000000000000..01f6feb8e2dd8d --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/interface.h @@ -0,0 +1,32 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/cpu/fusion_compiler.h" + +#include "ATen/ATen.h" + +#include "torch/csrc/jit/ir.h" + +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +inline std::shared_ptr getFusionHandle(Node* fusion_group) { + return getFusionCompiler().getFusionHandle(fusion_group); +} + +std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + return getFusionCompiler().debugLaunchGraph(graph, device, inputs); +} + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/resource_strings.h b/torch/csrc/jit/fusers/cpu/resource_strings.h new file mode 100644 index 00000000000000..60c1c0faaa4fea --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/resource_strings.h @@ -0,0 +1,56 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace cpufuser { + +/*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. +Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, +so typedefs help it handle those cases*/ + +auto type_declarations_template = CodeTemplate(R"( +typedef ${IndexType} IndexType; +template +struct TensorInfo { + T* data; + IndexType sizes[N]; + IndexType strides[N]; +}; +template +struct TensorInfo { + T * data; +}; +)"); + +auto cpu_compilation_unit_template = CodeTemplate(R"( +#include +#include +#include +${type_declarations} + +#define OMP_THRESHOLD 100000 +static void ${kernelName}_kernel(IndexType totalElements, ${formals}) { + #pragma omp parallel for if(totalElements > OMP_THRESHOLD) + for (IndexType linearIndex = 0; + linearIndex < totalElements; + linearIndex += 1) { + // Convert `linearIndex` into an offset of tensor: + ${tensorOffsets} + // calculate the results + ${kernelBody} + } +} + +extern "C" +void ${kernelName}(IndexType totalElements, void ** args) { + ${kernelName}_kernel(totalElements ${,argument_loads}); +} +)"); + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cpu/temp_file.h b/torch/csrc/jit/fusers/cpu/temp_file.h new file mode 100644 index 00000000000000..2b2c21290c9ff5 --- /dev/null +++ b/torch/csrc/jit/fusers/cpu/temp_file.h @@ -0,0 +1,70 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CPU_FUSER +#pragma once + +#include "torch/csrc/jit/assertions.h" + +#include "torch/csrc/WindowsTorchApiMacro.h" +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + + +#include "unistd.h" + +#include +#include + +namespace torch { namespace jit { namespace cpufuser { + +struct TempFile { + TH_DISALLOW_COPY_AND_ASSIGN(TempFile); + + TempFile(const std::string& t, int suffix) { + // mkstemps edits its first argument in places + // so we make a copy of the string here, including null terminator + std::vector tt(t.c_str(), t.c_str() + t.size() + 1); + int fd = mkstemps(tt.data(), suffix); + JIT_ASSERT(fd != -1); + file_ = fdopen(fd, "r+"); + + // - 1 becuase tt.size() includes the null terminator, + // but std::string does not expect one + name_ 
= std::string(tt.begin(), tt.end() - 1); + } + + const std::string& name() const { + return name_; + } + + void sync() { + fflush(file_); + } + + void write(const std::string & str) { + size_t result = fwrite(str.c_str(), 1, str.size(), file_); + JIT_ASSERT(str.size() == result); + } + + FILE* file() { + return file_; + } + + ~TempFile() { + if (file_ != nullptr) { + // unlink first to ensure another mkstemps doesn't + // race between close and unlink + unlink(name_.c_str()); + fclose(file_); + } + } +private: + FILE* file_ = nullptr; + std::string name_; +}; + +} // namespace cpufuser +} // namespace jit +} // namespace torch + +#endif // USE_CPU_FUSER diff --git a/torch/csrc/jit/fusers/cuda/fused_kernel.cpp b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp new file mode 100644 index 00000000000000..90624307123b56 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp @@ -0,0 +1,105 @@ +#include "torch/csrc/jit/fusers/cuda/fused_kernel.h" + +#include "torch/csrc/jit/resource_guard.h" + +#include "ATen/cuda/CUDAContext.h" +#include "THC/THC.h" +#include "THC/THCGenerator.hpp" +#include "torch/csrc/cuda/cuda_check.h" + +#include "nvrtc.h" +#include "cuda.h" +#include "cuda_runtime.h" + +#include +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +void checkCUDAVersion(const cudaDeviceProp& prop) { + if ((prop.major >= 6 && CUDA_VERSION < 8000) || + (prop.major >= 7 && CUDA_VERSION < 9000)) { + std::stringstream err_string; + err_string << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: " + << CUDA_VERSION << " for the current GPU device " << prop.name + << " with device capability " << prop.major << "." << prop.minor; + throw std::runtime_error(err_string.str()); + } +} + +CUDAFusedKernel::CUDAFusedKernel( + const std::string& name +, AnnotatedGraph& agraph) +: FusedKernel(name, agraph) { + at::DeviceGuard device_guard(agraph.device); + + TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device)); + checkCUDAVersion(prop); + + std::stringstream cu; + std::tie(chunk_desc, concat_desc, has_random) = emitCompilationUnit(cu, name, agraph, true); + compilation_unit = cu.str(); + nvrtcProgram program; + TORCH_NVRTC_CHECK(nvrtcCreateProgram(&program, compilation_unit.c_str(), nullptr, 0, nullptr, nullptr)); + + std::string compute = "--gpu-architecture=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); + std::vector args = {"--std=c++11", compute.c_str(), "-default-device"}; + nvrtcResult result = nvrtcCompileProgram(program, args.size(), args.data()); + if (result == NVRTC_ERROR_COMPILATION) { + size_t logsize; + nvrtcGetProgramLogSize(program, &logsize); + std::vector log(logsize); + nvrtcGetProgramLog(program, log.data()); + cu << log.data(); + throw std::runtime_error(cu.str()); + } + ResourceGuard holdProgram([&] { + TORCH_NVRTC_CHECK(nvrtcDestroyProgram(&program)); + }); + TORCH_NVRTC_CHECK(result); + + size_t ptx_size; + TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size)); + ptx.resize(ptx_size); + TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data())); + + TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data())); + TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str())); + + TORCH_CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor( + &maxBlocks, function, 128, 0)); + maxBlocks *= prop.multiProcessorCount; +} + +void CUDAFusedKernel::launch_raw(uint32_t numel, void** arguments) { + int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); + + //std::cout << "maxBlocks = " << 
maxBlocks << " needed blocks: " << ceilDiv(numel,blockSize) + // << " numblocks = " << numBlocks; + + // it is possible that this is the first cuda call on this thread + // so make sure we initialize the Driver API's context + // cudaFree(0) accomplishes this. + CUcontext pctx = 0; + TORCH_CU_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + std::unique_lock cudaFreeMutexLock( + *(THCCachingAllocator_getCudaFreeMutex())); + cudaFree(0); + } + CUstream stream = at::cuda::getCurrentCUDAStream(); + TORCH_CU_CHECK(cuLaunchKernel( + function, + numBlocks, 1, 1, + blockSize, 1, 1, + 0, stream, + arguments, + nullptr)); +} + +} // namespace cudafuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cuda/fused_kernel.h b/torch/csrc/jit/fusers/cuda/fused_kernel.h new file mode 100644 index 00000000000000..bd6a0ec8b95606 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fused_kernel.h @@ -0,0 +1,59 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/common/fused_kernel.h" +#include "torch/csrc/jit/fusers/common/annotated_graph.h" + +#include "ATen/ATen.h" + +#include "nvrtc.h" +#include "cuda.h" +#include "cuda_runtime.h" + +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +struct CUDAFusedKernel : public ::torch::jit::FusedKernel { + CUDAFusedKernel(const std::string& name, AnnotatedGraph& agraph); + + virtual ~CUDAFusedKernel() override { + cuModuleUnload(module); + } + +protected: + virtual at::Backend backend() const override { + return at::Backend::CUDA; + } + + int ceilDiv(int a, int b) { + return (a + b - 1) / b; + } + + virtual uint64_t get_rand_offset(uint32_t numel) override { + int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); + return 4 * (ceil(numel/(4 * blockSize * numBlocks)) + 1); + } + + virtual void launch_raw(uint32_t numel, void ** arguments) override; + + std::vector ptx; + CUmodule module; + CUfunction function; + + // we record prop/device so if they are availiable for launch heuristics + // querying at launch is too slow for device properties. 
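The launch configuration above fixes blockSize at 128 and caps the grid at the occupancy-derived maxBlocks (cuOccupancyMaxActiveBlocksPerMultiprocessor times the SM count), so very large tensors loop inside the grid-stride kernel rather than launching ever more blocks. The launch path also calls cudaFree(0) when no driver context is current yet, a conventional way to force lazy CUDA context initialization before using the driver API. A standalone sketch of the sizing arithmetic (the 16 blocks/SM and 20-SM device below are made-up numbers, not taken from the patch):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Mirrors CUDAFusedKernel's grid sizing: enough blocks to cover numel,
// but never more than the occupancy-derived cap.
inline int ceilDiv(int a, int b) { return (a + b - 1) / b; }

int numBlocksFor(uint32_t numel, int blockSize, int maxBlocks) {
  return std::min(maxBlocks, ceilDiv(static_cast<int>(numel), blockSize));
}

int main() {
  const int blockSize = 128;
  // Suppose occupancy allows 16 resident blocks per SM on a 20-SM device.
  const int maxBlocks = 16 * 20;  // 320
  std::cout << numBlocksFor(1 << 12, blockSize, maxBlocks) << '\n';  // 32 blocks cover 4096 elements
  std::cout << numBlocksFor(1 << 20, blockSize, maxBlocks) << '\n';  // capped at 320; the kernel grid-strides
}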
+ int device; + cudaDeviceProp prop; + int blockSize = 128; + int maxBlocks; +}; + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER diff --git a/torch/csrc/jit/fusers/cuda/fusion_compiler.cpp b/torch/csrc/jit/fusers/cuda/fusion_compiler.cpp new file mode 100644 index 00000000000000..3add518bdeaf21 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fusion_compiler.cpp @@ -0,0 +1,71 @@ +#include "torch/csrc/jit/fusers/cuda/fusion_compiler.h" + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/passes/shape_analysis.h" // EraseShapeInformation +#include "torch/csrc/utils/functional.h" //fmap +#include "torch/csrc/jit/ivalue.h" // IValue + +#include "torch/csrc/jit/assertions.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { +CUDAFusionCompiler& getFusionCompiler() { + static CUDAFusionCompiler compiler; + return compiler; +} + +std::shared_ptr CUDAFusionCompiler::getFusionHandle( + Node* fusion_group) { + // verifies on GPU + const auto device = fusion_group->i(attr::device); + JIT_ASSERT(device != kCPUDevice); + + auto graph = fusion_group->g(attr::Subgraph)->copy(); + EraseShapeInformation(*graph); + std::stringstream key; + key << "device " << device << "\n"; + key << *graph << "\n"; + std::string key_ = key.str(); + auto it = cache_map.find(key_); + if (it == cache_map.end()) { + std::tie(it, std::ignore) = + cache_map.emplace( + key_ + , std::make_shared(graph, device)); + } + + return it->second; +} + +std::vector CUDAFusionCompiler::debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + auto wrapper_graph = std::make_shared(); + Node* fusion_group = + wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device)); + fusion_group->g_(attr::Subgraph, graph.copy()); + + for (size_t i = 0; i < graph.inputs().size(); ++i) { + fusion_group->addInput(wrapper_graph->addInput()); + } + + for (size_t i = 0; i < graph.outputs().size(); ++i) { + wrapper_graph->registerOutput(fusion_group->addOutput()); + } + + auto cache = getFusionHandle(fusion_group); + Stack stack = fmap(inputs); + cache->run(stack); + return fmap(stack, [](const IValue& iv) { return iv.toTensor(); }); +} + +} // namespace cudafuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/cuda/fusion_compiler.h b/torch/csrc/jit/fusers/cuda/fusion_compiler.h new file mode 100644 index 00000000000000..800ea2150dfecc --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/fusion_compiler.h @@ -0,0 +1,44 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/common/fusion_handle_impl.h" + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/utils/disallow_copy.h" + +#include "ATen/ATen.h" + +#include +#include +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +struct CUDAFusionCompiler { + TH_DISALLOW_COPY_AND_ASSIGN(CUDAFusionCompiler); + + CUDAFusionCompiler() = default; + + ~CUDAFusionCompiler() = default; + + std::shared_ptr getFusionHandle(Node* fusion_group); + + std::vector debugLaunchGraph( + Graph& graph + , int device + , at::ArrayRef inputs); + +private: + std::unordered_map> cache_map; +}; + +CUDAFusionCompiler& getFusionCompiler(); + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER diff --git 
a/torch/csrc/jit/fusers/cuda/interface.h b/torch/csrc/jit/fusers/cuda/interface.h new file mode 100644 index 00000000000000..4a6ee3f8e5aa36 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/interface.h @@ -0,0 +1,32 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/fusers/interface.h" +#include "torch/csrc/jit/fusers/cuda/fusion_compiler.h" + +#include "torch/csrc/jit/ir.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace torch { namespace jit { namespace cudafuser { + +inline std::shared_ptr getFusionHandle(Node* fusion_group) { + return getFusionCompiler().getFusionHandle(fusion_group); +} + +std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + return getFusionCompiler().debugLaunchGraph(graph, device, inputs); +} + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER \ No newline at end of file diff --git a/torch/csrc/jit/fusers/cuda/resource_strings.h b/torch/csrc/jit/fusers/cuda/resource_strings.h new file mode 100644 index 00000000000000..0063288721d727 --- /dev/null +++ b/torch/csrc/jit/fusers/cuda/resource_strings.h @@ -0,0 +1,197 @@ +#include "torch/csrc/jit/fusers/Config.h" +#if USE_CUDA_FUSER +#pragma once + +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace cudafuser { + +/*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. +Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, +so typedefs help it handle those cases*/ + +auto type_declarations_template = CodeTemplate(R"( +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef short int int16_t; +typedef long long int int64_t; +${HalfHeader} +${RandHeader} + +typedef ${IndexType} IndexType; +template +struct TensorInfo { + T* data; + IndexType sizes[N]; + IndexType strides[N]; +}; +template +struct TensorInfo { + T * data; +}; +)"); + +// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the +// curand header correctly. 
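The inlined Philox class that follows ends with a uniform() helper: the 32-bit draw is scaled by M_RAN_INVM32 = 2.3283064e-10f, which is 2^-32 = 1/4294967296 ≈ 2.3283064365e-10, so counter outputs land in [0, 1). A quick host-side check of that constant (standalone, not part of the generated kernel string):

#include <cstdint>
#include <cstdio>

// 2^-32 as a float literal; this is the M_RAN_INVM32 used by the generated kernel.
constexpr float kInv2Pow32 = 2.3283064e-10f;

float uniform(uint32_t x) { return x * kInv2Pow32; }

int main() {
  std::printf("%.10e\n", 1.0 / 4294967296.0);  // 2.3283064365e-10, i.e. 2^-32
  std::printf("%f\n", uniform(1u << 31));      // 0.500000: half the counter range maps to 0.5
}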
+constexpr auto rand_support_literal = R"( + + class Philox { + public: + __device__ inline Philox(unsigned long long seed, + unsigned long long subsequence, + unsigned long long offset) { + key.x = (unsigned int)seed; + key.y = (unsigned int)(seed >> 32); + counter = make_uint4(0, 0, 0, 0); + counter.z = (unsigned int)(subsequence); + counter.w = (unsigned int)(subsequence >> 32); + STATE = 0; + incr_n(offset / 4); + } + + __device__ inline unsigned long operator()() { + if(STATE == 0) { + uint4 counter_ = counter; + uint2 key_ = key; + for(int i = 0; i < 9; i++) { + counter_ = single_round(counter_, key_); + key_.x += (kPhilox10A); key_.y += (kPhilox10B); + } + output = single_round(counter_, key_); + incr(); + } + unsigned long ret; + switch(STATE) { + case 0: ret = output.x; break; + case 1: ret = output.y; break; + case 2: ret = output.z; break; + case 3: ret = output.w; break; + } + STATE = (STATE + 1) % 4; + return ret; + } + + private: + uint4 counter; + uint4 output; + uint2 key; + unsigned int STATE; + __device__ inline void incr_n(unsigned long long n) { + unsigned int nlo = (unsigned int)(n); + unsigned int nhi = (unsigned int)(n >> 32); + counter.x += nlo; + if (counter.x < nlo) + nhi++; + counter.y += nhi; + if (nhi <= counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + __device__ inline void incr() { + if (++counter.x) + return; + if (++counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, + unsigned int *result_high) { + *result_high = __umulhi(a, b); + return a*b; + } + + __device__ inline uint4 single_round(uint4 ctr, uint2 key) { + unsigned int hi0; + unsigned int hi1; + unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); + unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); + + uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; + return ret; + } + + static const unsigned long kPhilox10A = 0x9E3779B9; + static const unsigned long kPhilox10B = 0xBB67AE85; + static const unsigned long kPhiloxSA = 0xD2511F53; + static const unsigned long kPhiloxSB = 0xCD9E8D57; + }; + + // Inverse of 2^32. + #define M_RAN_INVM32 2.3283064e-10f + __device__ __inline__ float uniform(unsigned int x) { + return x * M_RAN_INVM32; + } +)"; + +constexpr auto rand_param = ",unsigned long long seed, unsigned long long offset"; + +constexpr auto rand_init = R"( + int idx = blockIdx.x*blockDim.x + threadIdx.x; + Philox rnd(seed, idx, offset); +)"; + +auto cuda_compilation_unit_template = CodeTemplate(R"( +${type_declarations} + +extern "C" __global__ +void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) { + ${RandInit} + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x) { + // Convert `linearIndex` into an offset of tensor: + ${tensorOffsets} + // calculate the results + ${kernelBody} + } +} +)"); + + +// This snippet enables half support in the jit. Following the pattern for +// reductions, fp16 input data is immediately upconverted to float +// with __half2float(). All mathematical operations are done on float +// values, and if needed the intermediate float representation is +// converted to half with __float2half() when writing to a half tensor. 
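The half-support header below implements exactly the pattern described in the comment above. For comparison, the same load-as-half / compute-in-float / store-as-half pattern written against the regular CUDA toolkit header cuda_fp16.h (a sketch for nvcc; the nvrtc-compiled kernel string defines its own __half instead, presumably because toolkit headers are not readily resolvable under nvrtc, as with the curand case above):

#include <cuda_fp16.h>

// Elementwise y = x * 2 + 1 on half data: load as half, upconvert to float,
// do the math in float, downconvert on the store.
__global__ void scale_shift_half(const __half* x, __half* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  for (; i < n; i += gridDim.x * blockDim.x) {  // grid-stride loop, as in the fused kernels
    float v = __half2float(x[i]);
    y[i] = __float2half(v * 2.0f + 1.0f);
  }
}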
+constexpr auto half_support_literal = R"( +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) +#if defined(__cplusplus) + struct __align__(2) __half { + __host__ __device__ __half() { } + + protected: + unsigned short __x; + }; + + /* All intrinsic functions are only available to nvcc compilers */ + #if defined(__CUDACC__) + /* Definitions of intrinsics */ + __device__ __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; + } + + __device__ float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + return val; + } + #endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ +#undef __HALF_TO_US +#undef __HALF_TO_CUS + +typedef __half half; +)"; + +} // namespace cudafuser +} // namespace jit +} // namespace torch + +#endif // USE_CUDA_FUSER \ No newline at end of file diff --git a/torch/csrc/jit/fusers/interface.cpp b/torch/csrc/jit/fusers/interface.cpp new file mode 100644 index 00000000000000..0b175730f8cfde --- /dev/null +++ b/torch/csrc/jit/fusers/interface.cpp @@ -0,0 +1,81 @@ +#include "torch/csrc/jit/fusers/interface.h" + +#include "torch/csrc/jit/fusers/Config.h" + +#if USE_CPU_FUSER + #include "torch/csrc/jit/fusers/cpu/interface.h" +#endif // USE_CPU_FUSER + +#if USE_CUDA_FUSER + #include "torch/csrc/jit/fusers/cuda/interface.h" +#endif // USE_CUDA_FUSER + +#include + +namespace torch { namespace jit { + +namespace detail { + +bool cpu_fuser_enabled = false; + +} // namespace detail + +// Pure virtual destructor definition +FusionHandle::~FusionHandle() { } + +std::shared_ptr getFusionHandle(Node* fusion_group) { + const auto device = fusion_group->i(attr::device); + if (device == kCPUDevice) { + #if USE_CPU_FUSER + return cpufuser::getFusionHandle(fusion_group); + #endif + throw std::runtime_error("CPU fusion is not supported on this build."); + } + + #if USE_CUDA_FUSER + return cudafuser::getFusionHandle(fusion_group); + #endif // USE_CUDA_FUSER + + throw std::runtime_error("CUDA fusion is not supported on this build."); +} + +bool canFuseOnCPU() { + #if USE_CPU_FUSER + return detail::cpu_fuser_enabled; + #endif // USE_CPU_FUSER + + return false; +} + +bool canFuseOnGPU() { + #if USE_CUDA_FUSER + return true; + #endif // USE_CUDA_FUSER + + return false; +} + +void overrideCanFuseOnCPU(bool value) { + detail::cpu_fuser_enabled = value; +} + +std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs) { + if (device == kCPUDevice) { + #if USE_CPU_FUSER + return cpufuser::debugLaunchGraph(graph, device, inputs); + #endif // USE_CPU_FUSER + throw std::runtime_error("CPU fusion is not supported on this build."); + } + + #if USE_CUDA_FUSER + return cudafuser::debugLaunchGraph(graph, device, inputs); + #endif // USE_CUDA_FUSER + + throw std::runtime_error("CUDA fusion is not supported on this build."); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusers/interface.h b/torch/csrc/jit/fusers/interface.h new file mode 100644 index 00000000000000..cd4956a8870d87 --- /dev/null +++ b/torch/csrc/jit/fusers/interface.h @@ -0,0 +1,37 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/stack.h" + +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include "ATen/ATen.h" + +#include +#include + +namespace torch { namespace jit { + +constexpr int kCPUDevice = -1; + +struct TORCH_API FusionHandle { + virtual 
void run(Stack& inputs) = 0; + + virtual ~FusionHandle() = 0; +}; + +TORCH_API std::shared_ptr getFusionHandle(Node* fusion_group); + +TORCH_API bool canFuseOnCPU(); +TORCH_API bool canFuseOnGPU(); + +// CPU fuser is disabled by default, but we still want to test it. +TORCH_API void overrideCanFuseOnCPU(bool value); + +TORCH_API std::vector debugLaunchGraph( + Graph& graph +, int device +, at::ArrayRef inputs); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp deleted file mode 100644 index e21d00f00f40de..00000000000000 --- a/torch/csrc/jit/fusion_compiler.cpp +++ /dev/null @@ -1,1615 +0,0 @@ -#ifndef _WIN32 -#include "torch/csrc/jit/fusion_compiler.h" - -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/code_template.h" -#include "torch/csrc/jit/resource_guard.h" -#include "torch/csrc/jit/constants.h" -#include "torch/csrc/jit/passes/shape_analysis.h" -#include "torch/csrc/jit/custom_operator.h" - -#include "torch/csrc/utils/disallow_copy.h" -#include "torch/csrc/variable_tensor_functions.h" -#include "torch/csrc/utils/hash.h" -#include - -#include "ATen/ATen.h" -#include "ATen/ExpandUtils.h" -#include "ATen/WrapDimUtils.h" - -#ifdef USE_CUDA -#include "ATen/cuda/CUDAContext.h" -#include "THC/THC.h" -#include -#include "torch/csrc/cuda/cuda_check.h" -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef USE_CUDA -THCGenerator* THCRandom_getGenerator(THCState* state); -#endif - -namespace torch { namespace jit { - -std::vector TensorDesc::findContiguous( - const at::IntList& sizes, - const at::IntList& strides) { - JIT_ASSERT(sizes.size() == strides.size()); - std::vector cont(sizes.size()); - for(size_t i = 0; i < sizes.size(); ++i) { - int64_t expected_stride = (i + 1 < sizes.size()) ? 
sizes[i+1]*strides[i+1] : 1; - cont[i] = strides[i] == expected_stride; - } - return cont; -} - -// Descriptor for chunk-ing an input tensor into subtensors -// OR concat-ing an output tensor from subtensors -struct PartitionDesc { - size_t nSubtensors; // == 1 for tensors that should not be operated on via chunk/cat - size_t dim; // dimension along which the chunk/concat occurs - std::unique_ptr subtensorDesc; // descriptor for the subtensor, if it exists - PartitionDesc() - : nSubtensors(1), dim(0) {} - - PartitionDesc(const TensorDesc & desc, size_t nSubtensors, size_t dim) - : nSubtensors(nSubtensors), dim(dim) { - JIT_ASSERT(nSubtensors > 1); - std::vector cont = desc.contiguity; - if(dim > 0) { - // when we narrow the concatenated output/chunked input - // we make the size[dim] smaller while keeping the stride[dim] the same, - // meaning: stride[dim - 1] != stride[dim]*size[dim] - // so dim - 1 is no longer contiguous - cont[dim - 1] = false; - } - subtensorDesc.reset(new TensorDesc(desc.scalar_type, cont)); - } - - bool isNoop() const { - return nSubtensors == 1; - } -}; - -struct FusedKernel { - TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel); - - FusedKernel(const std::string & name, AnnotatedGraph & agraph); - virtual ~FusedKernel() = default; - - // expects outputs to be pre-allocated - void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); - - // creates new tensors for outputs - void launch(at::ArrayRef inputs, std::vector & outputs); - const std::vector & outputDescriptors() const { - return output_desc; - } -protected: - virtual at::Backend backend() const = 0; - - // arguments is a list of pointers to the arguments for the compiled CUDA/CPU - // code. - // The format of arguments is suitable for directly passing to a call to - // cuLaunchKernel as the kernel arguments. - // Currently the first argument is a pointer to numel (for passing to - // CUDA code), and the remainder are pointers to the TensorInfo structs - // that compiled code uses to load Tensor data. - // launch_with_tensors handles packing at::Tensors into this arguments array. - // CPU code uses the same convension so that launch_with_tensors can be shared. - virtual void launch_raw(uint32_t numel, void ** arguments) = 0; - - virtual uint64_t get_rand_offset(uint32_t numel) = 0; - bool has_random; - std::string name; - // We keep these around for debugging - std::string compilation_unit; - std::vector input_desc; - std::vector output_desc; - - // same size as output_desc, describes whether - // an output is actually a concatenation of - // many subtensors that the fusion group produces - std::vector concat_desc; - - // same size as input_desc, describes whether an - // input should be broken into subtensors (chunks) - // to be consumed by the fusion group - std::vector chunk_desc; -}; - - -namespace { - -#ifdef USE_CUDA - -static int ceilDiv(int a, int b) { - return (a + b - 1) / b; -} - -#endif - -Node* usedInFusedChunk(Value * input) { - auto uses = input->uses(); - if (uses.size() == 1) { - Node *user = uses[0].user; - if (user->kind() == prim::ConstantChunk) { - return user; - } - } - return nullptr; -} - -//////////////////////////////////////////////////////////////////////////////// -// Code generation - -namespace codegen { - -/*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. 
-Correct code for this case is generated, however, nvrtc does not know how to handle int*_t integer types, -so typedefs help it handle those cases*/ - -auto type_declarations_template = CodeTemplate(R"( -#if defined(__CUDACC_RTC__) -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef short int int16_t; -typedef long long int int64_t; -${HalfHeader} -${RandHeader} -#endif -typedef ${IndexType} IndexType; -template -struct TensorInfo { - T * data; - IndexType sizes[N]; - IndexType strides[N]; -}; -template -struct TensorInfo { - T * data; -}; -)"); - -// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the -// curand header correctly. -constexpr auto rand_support_literal = R"( - - class Philox { - public: - __device__ inline Philox(unsigned long long seed, - unsigned long long subsequence, - unsigned long long offset) { - key.x = (unsigned int)seed; - key.y = (unsigned int)(seed >> 32); - counter = make_uint4(0, 0, 0, 0); - counter.z = (unsigned int)(subsequence); - counter.w = (unsigned int)(subsequence >> 32); - STATE = 0; - incr_n(offset / 4); - } - - __device__ inline unsigned long operator()() { - if(STATE == 0) { - uint4 counter_ = counter; - uint2 key_ = key; - for(int i = 0; i < 9; i++) { - counter_ = single_round(counter_, key_); - key_.x += (kPhilox10A); key_.y += (kPhilox10B); - } - output = single_round(counter_, key_); - incr(); - } - unsigned long ret; - switch(STATE) { - case 0: ret = output.x; break; - case 1: ret = output.y; break; - case 2: ret = output.z; break; - case 3: ret = output.w; break; - } - STATE = (STATE + 1) % 4; - return ret; - } - - private: - uint4 counter; - uint4 output; - uint2 key; - unsigned int STATE; - __device__ inline void incr_n(unsigned long long n) { - unsigned int nlo = (unsigned int)(n); - unsigned int nhi = (unsigned int)(n >> 32); - counter.x += nlo; - if (counter.x < nlo) - nhi++; - counter.y += nhi; - if (nhi <= counter.y) - return; - if (++counter.z) - return; - ++counter.w; - } - __device__ inline void incr() { - if (++counter.x) - return; - if (++counter.y) - return; - if (++counter.z) - return; - ++counter.w; - } - __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, - unsigned int *result_high) { - *result_high = __umulhi(a, b); - return a*b; - } - - __device__ inline uint4 single_round(uint4 ctr, uint2 key) { - unsigned int hi0; - unsigned int hi1; - unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); - unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); - - uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; - return ret; - } - - static const unsigned long kPhilox10A = 0x9E3779B9; - static const unsigned long kPhilox10B = 0xBB67AE85; - static const unsigned long kPhiloxSA = 0xD2511F53; - static const unsigned long kPhiloxSB = 0xCD9E8D57; - }; - - // Inverse of 2^32. 
- #define M_RAN_INVM32 2.3283064e-10f - __device__ __inline__ float uniform(unsigned int x) { - return x * M_RAN_INVM32; - } -)"; - -constexpr auto rand_param = ",unsigned long long seed, unsigned long long offset"; -constexpr auto rand_init = R"( - int idx = blockIdx.x*blockDim.x + threadIdx.x; - Philox rnd(seed, idx, offset); -)"; -auto cuda_compilation_unit_template = CodeTemplate(R"( -${type_declarations} - -extern "C" __global__ -void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) { - ${RandInit} - for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < totalElements; - linearIndex += gridDim.x * blockDim.x) { - // Convert `linearIndex` into an offset of tensor: - ${tensorOffsets} - // calculate the results - ${kernelBody} - } -} -)"); - -auto cpu_compilation_unit_template = CodeTemplate(R"( -#include -#include -#include -${type_declarations} - -#define OMP_THRESHOLD 100000 -static void ${kernelName}_kernel(IndexType totalElements, ${formals}) { - #pragma omp parallel for if(totalElements > OMP_THRESHOLD) - for (IndexType linearIndex = 0; - linearIndex < totalElements; - linearIndex += 1) { - // Convert `linearIndex` into an offset of tensor: - ${tensorOffsets} - // calculate the results - ${kernelBody} - } -} - -extern "C" -void ${kernelName}(IndexType totalElements, void ** args) { - ${kernelName}_kernel(totalElements ${,argument_loads}); -} -)"); - -// This snippet enables half support in the jit. Following the pattern for -// reductions, fp16 input data is immediately upconverted to float -// with __half2float(). All mathematical operations are done on float -// values, and if needed the intermediate float representation is -// converted to half with __float2half() when writing to a half tensor. 
-constexpr auto half_support_literal = R"( -#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) -#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) -#if defined(__cplusplus) - struct __align__(2) __half { - __host__ __device__ __half() { } - - protected: - unsigned short __x; - }; - - /* All intrinsic functions are only available to nvcc compilers */ - #if defined(__CUDACC__) - /* Definitions of intrinsics */ - __device__ __half __float2half(const float f) { - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; - } - - __device__ float __half2float(const __half h) { - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); - return val; - } - #endif /* defined(__CUDACC__) */ -#endif /* defined(__cplusplus) */ -#undef __HALF_TO_US -#undef __HALF_TO_CUS - -typedef __half half; -)"; - -// curDimIndex = linearId % sizes[i]; // % sizes[i] is not needed for d == 0, because we already guard for numel outside the index calculation -// offset += curDimIndex*strides[i]; // *strides[i] is optional if list_is_cont becaause strides.back() == 1 -// linearId /= sizes[i]; -auto dim_calc = CodeTemplate(R"( -//printf("tensor ${tensor} sizes[${d}] = %d, strides[${d}] = %d\n", ${tensor}.sizes[${d}],${tensor}.strides[${d}]); -size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes}; -${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride}; -)"); - -static void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) { - TemplateEnv env; - env.s("tensor",tensor); - out << format("IndexType ${tensor}_offset = 0;\n",env); - out << format("IndexType ${tensor}_linearIndex = linearIndex;\n",env); - for(int d = ndim - 1; d >= 0; --d) { - env.d("d",d); - env.s("mod_sizes", d > 0 ? format("% ${tensor}.sizes[${d}]",env) : ""); - env.s("times_stride",(d < ndim - 1 || !last_is_cont) ? - format("* ${tensor}.strides[${d}]",env) : ""); - out << dim_calc.format(env); - if(d > 0) { - out << format("${tensor}_linearIndex /= ${tensor}.sizes[${d}];\n",env); - } - } -} - -static std::string valueName(Value * n) { - return "n" + std::to_string(n->unique()); -} - -static std::string scalarValue(int64_t v) { - return std::to_string(v); -} - -static std::string scalarValue(double v) { - std::ostringstream out; - out << std::scientific << v << "f"; - return out.str(); -} - -static const char * scalarTypeName(at::ScalarType type) { - if (type == at::ScalarType::Half) { - return "half"; - } - - switch(type) { - #define DEFINE_CASE(ctype,name,_) \ - case at::ScalarType::name: return #ctype; - AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(DEFINE_CASE) - #undef DEFINE_CASE - default: - throw std::runtime_error("unknown scalar type"); - } -} - -std::string encodeRHS(Node * n) { - static std::unordered_map simple_map_ops = { - // unary - {aten::abs, "absf(${0})"}, - {aten::sigmoid, "1.f / (1.f + expf(-${0}))"}, - {aten::relu, "${0} < 0 ? 
0.f : ${0} "}, - {aten::log, "logf(${0})"}, - {aten::log10, "log10f(${0})"}, - {aten::log1p, "log1pf(${0})"}, - {aten::log2, "log2f(${0})"}, - {aten::lgamma, "lgammaf(${0})"}, - {aten::exp, "expf(${0})"}, - {aten::expm1, "expm1f(${0})"}, - {aten::cos, "cosf(${0})"}, - {aten::acos, "acosf(${0})"}, - {aten::cosh, "coshf(${0})"}, - {aten::sin, "sinf(${0})"}, - {aten::asin, "asinf(${0})"}, - {aten::sinh, "sinhf(${0})"}, - {aten::tan, "tanf(${0})"}, - {aten::atan, "atanf(${0})"}, - {aten::tanh, "tanhf(${0})"}, - {aten::sqrt, "sqrtf(${0})"}, - {aten::rsqrt, "rsqrtf(${0})"}, - {aten::ceil, "ceilf(${0})"}, - {aten::floor, "floorf(${0})"}, - {aten::round, "roundf(${0})"}, - {aten::trunc, "truncf(${0})"}, - {aten::frac, "fracf(${0})"}, - {aten::reciprocal, "reciprocalf(${0})"}, - {aten::neg, "-${0}"}, - //simple binary - {aten::atan2, "atan2(${0}, ${1})"}, - {aten::min, "fminf(${0}, ${1})"}, - {aten::max, "fmaxf(${0}, ${1})"}, - - //binary with other - // TODO: some of these ops will not get generated because - // we only work on float inputs/outputs, but they are here to record - // that they are valid mappable ops once we handle more type - {aten::__and__, "${0} && ${1}"}, - {aten::__lshift__, "${0} << ${1}"}, - {aten::__or__, "${0} || ${1}"}, - {aten::__rshift__, "${0} >> ${1}"}, - {aten::__xor__, "${0} ^ ${1}"}, - {aten::div, "${0} / ${1}"}, - {aten::eq, "${0} == ${1}"}, - {aten::fmod, "fmodf(${0}, ${1})"}, - {aten::ge, "(${0} >= ${1})"}, - {aten::gt, "${0} > ${1}"}, - {aten::le, "(${0} <= ${1})"}, - {aten::lt, "${0} < ${1}"}, - {aten::type_as, "(${0})"}, //everything is implicitly convertible to float - {aten::mul, "${0} * ${1}"}, - {aten::ne, "${0} != ${1}"}, - {aten::remainder, "remainderf(${0}, ${1})"}, - {aten::pow, "powf(${0}, ${1})"}, - - //alpha - {aten::add, "${0} + ${2}*${1}"}, - {aten::sub, "(${0} - ${2}*${1})"}, - {aten::rand_like, "uniform(rnd())"}, - - // simple derivatives - {aten::_sigmoid_backward, "${0} * ${1} * (1.f - ${1})"}, - {aten::_tanh_backward, "${0} * (1.f - ${1} * ${1})"}, - }; - - if (n->kind() == prim::Constant) { - auto val = toIValue(n->output()).value(); - if (val.isDouble()) { - return scalarValue(val.toDouble()); - } else { - JIT_ASSERT(val.isInt()); - return scalarValue(val.toInt()); - } - } - - TemplateEnv env; - size_t i = 0; - for(auto in : n->inputs()) { - env.s(std::to_string(i++), valueName(in)); - } - - const auto & str = simple_map_ops.at(n->kind()); - return format(str, env); -} - -// Returns: (input chunk metadata, output concat metadata, is_random) -std::tuple,std::vector,bool> emitCompilationUnit( - std::ostream& out, - const std::string& name, - AnnotatedGraph& agraph, - bool use_cuda) { - bool has_random = false; - Graph& subgraph = *agraph.graph; - TemplateEnv env; - env.s("kernelName",name); - // TODO: handle cases where we need to generate > 2^32 element tensors - env.s("IndexType","unsigned int"); //avoiding slow header includes to get uint32_t - - std::stringstream body; - std::stringstream tensorOffsets; - std::vector formals; - std::vector argument_loads; - auto emitFormal = [&](Value * n, const TensorDesc & desc) { - std::string tensor = "t" + std::to_string(formals.size()); //can't be unique() because Param may be an output - size_t nDim = desc.nDim(); - emitIndexingFor(tensorOffsets, tensor, nDim, desc.lastIsContiguous()); - env.s("tensor",tensor); - env.d("formal_index", formals.size() + 1); // + 1 because the first argument is the linearIndex - env.d("nDim",nDim); - env.s("scalar_type",scalarTypeName(desc.scalar_type)); - 
formals.push_back(format("TensorInfo<${scalar_type},${nDim}> ${tensor}",env)); - argument_loads.push_back(format("*static_cast*>(args[${formal_index}])",env)); - }; - - std::vector chunk_desc; - std::vector> flat_inputs; - { - size_t input_index = 0; - for(auto p : subgraph.inputs()) { - if (Node * chunk = usedInFusedChunk(p)) { - int64_t dim = chunk->i(attr::dim); - int64_t chunks = chunk->i(attr::chunks); - chunk_desc.emplace_back(agraph.input_desc[input_index++], chunks, dim); - for (auto * o : chunk->outputs()) { - flat_inputs.emplace_back(o, *chunk_desc.back().subtensorDesc); - } - } else { - chunk_desc.emplace_back(); - flat_inputs.emplace_back(p, agraph.input_desc[input_index++]); - } - } - for (auto & input : flat_inputs) { - emitFormal(input.first, input.second); - } - } - - std::vector concat_desc; - std::vector> flat_output_nodes; - { - size_t i = 0; - for(auto o : subgraph.outputs()) { - auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != prim::FusedConcat) { - emitFormal(o, desc); - concat_desc.emplace_back(); - flat_output_nodes.emplace_back(o, desc); - } else { - auto cat = o->node(); - concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); - for(auto c : cat->inputs()) { - emitFormal(c, *concat_desc.back().subtensorDesc); - flat_output_nodes.emplace_back(c, desc); - } - } - } - } - - bool has_half_tensor = false; - size_t formal_count = 0; - for(auto input : flat_inputs) { - auto p = input.first; - env.s("node",valueName(p)); - env.d("formal",formal_count++); - - // Acquires and converts (if needed) inputs - bool is_half = input.second.scalar_type == at::ScalarType::Half; - if (is_half) { - AT_ASSERT(use_cuda); - env.s( - "access" - , format("__half2float(t${formal}.data[t${formal}_offset])", env)); - has_half_tensor = true; - } else { - env.s("access", format("t${formal}.data[t${formal}_offset]", env)); - } - - //TODO: actual type propagation rather than relying on auto.. 
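This code generator (deleted here; the same emitCompilationUnit moves under torch/csrc/jit/fusers/ elsewhere in the patch) builds the kernel source by filling string templates: each input becomes a TensorInfo formal plus a generated load such as auto n3 = t0.data[t0_offset];. A minimal stand-in for that substitution style (CodeTemplate/TemplateEnv from torch/csrc/jit/code_template.h do the real work; the fill helper below is mine):

#include <iostream>
#include <map>
#include <string>

// Tiny stand-in for CodeTemplate/TemplateEnv: replace every ${key} with env[key].
std::string fill(std::string tmpl, const std::map<std::string, std::string>& env) {
  for (const auto& kv : env) {
    const std::string pat = "${" + kv.first + "}";
    for (auto pos = tmpl.find(pat); pos != std::string::npos; pos = tmpl.find(pat, pos)) {
      tmpl.replace(pos, pat.size(), kv.second);
      pos += kv.second.size();
    }
  }
  return tmpl;
}

int main() {
  // One generated load line per kernel input, as in emitCompilationUnit.
  std::cout << fill("auto ${node} = ${access};\n",
                    {{"node", "n3"},
                     {"access", "t0.data[t0_offset]"}});
  // Prints: auto n3 = t0.data[t0_offset];
}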
- body << format("auto ${node} = ${access};\n",env); - } - - for(auto n : subgraph.nodes()) { - // FusedConcat nodes work by narrowing the output Tensors before the kernel runs - if (n->kind() == prim::FusedConcat) - continue; - if (n->kind() == prim::ConstantChunk) - continue; - if(n->kind() == aten::rand_like) { - has_random = true; - if(!use_cuda) - throw std::runtime_error("Fusion doesn't support rand on CPU"); - } - env.s("node",valueName(n->output())); - env.s("rhs", encodeRHS(n)); - body << format("auto ${node} = ${rhs};\n",env); - } - - for(auto output : flat_output_nodes) { - auto o = output.first; - env.d("formal",formal_count++); - env.s("access",format("t${formal}.data[t${formal}_offset]",env)); - env.s("node",valueName(o)); - - // Acquires and converts (if needed) outputs - bool is_half = output.second.scalar_type == at::ScalarType::Half; - if (is_half) { - AT_ASSERT(use_cuda); - body << format("${access} = __float2half(${node});\n",env); - has_half_tensor = true; - } else { - body << format("${access} = ${node};\n",env); - } - } - - // Includes half support if any half tensors are involved - if (has_half_tensor) { - env.s("HalfHeader", half_support_literal); - } else { - env.s("HalfHeader", ""); - } - - if (has_random) { - env.s("RandHeader", rand_support_literal); - env.s("RandParam", rand_param); - env.s("RandInit", rand_init); - } else { - env.s("RandHeader", ""); - env.s("RandParam", ""); - env.s("RandInit", ""); - } - - env.s("tensorOffsets",tensorOffsets.str()); - env.s("kernelBody",body.str()); - env.v("formals",formals); - env.v("argument_loads",argument_loads); - env.s("type_declarations", type_declarations_template.format(env)); - if(use_cuda) { - out << cuda_compilation_unit_template.format(env); - } else { - out << cpu_compilation_unit_template.format(env); - } - - return std::make_tuple(std::move(chunk_desc), std::move(concat_desc), has_random); -} - -//////////////////////////////////////////////////////////////////////////////// - -} // codegen namespace -} // anonymous namespace - -//////////////////////////////////////////////////////////////////////////////// -// CompiledFunctionFunction - -// Host-side view of TensorInfo (that visivle for the kernel is defined above). -// Note dims[0] - we need to dynamically allocate the dims. -struct TensorInfo { - void * data; -#pragma GCC diagnostic ignored "-Wpedantic" - uint32_t sizes_strides[0]; -#pragma GCC diagnostic pop - - uint32_t* sizes(size_t nDim) { return &sizes_strides[0]; } - uint32_t* strides(size_t nDim) { return &sizes_strides[nDim]; } -}; - -FusedKernel::FusedKernel(const std::string & name, AnnotatedGraph & agraph) - : name(name) - , input_desc(agraph.input_desc) - , output_desc(agraph.output_desc) {} - -namespace { - -// Tries to compress sizes and strides according to cont. Emits the result t -// c_sizes, c_strides and throws an error on failure (if can't compress) -void compressContiguous( - at::IntList sizes, - at::IntList strides, - const std::vector & cont, - uint32_t * c_sizes, - uint32_t * c_strides) { - size_t compressed_dims = 0; - size_t cur = 0; - size_t ndim = sizes.size(); - while(cur < ndim) { - size_t total_size = sizes[cur]; - cur++; - while(cont[cur-1] && cur < ndim) { - JIT_ASSERT(strides[cur-1] == sizes[cur]*strides[cur]); - total_size *= sizes[cur]; - cur++; - } - // cur starts pointing at the beginning of run to compress - // cur ends one _after_ the terminating false or end of list. - // total_size is the size of all dimensions [begin,end) - // examples: - // f = not cont. 
- // t = cont. - // x = don't care, including past end of list - // s = start of cur - // e = end of cur - - - // f x x x - // s e - - // t f x x - // s e - - // t t f x - // s e - - c_sizes[compressed_dims] = total_size; - c_strides[compressed_dims] = strides[cur-1]; - compressed_dims++; - } - if (ndim > 0) { - JIT_ASSERT(!cont.back() || strides.back() == 1); - } -} - -} // anonymous namespace - -// XXX: Assumes that after at::chunk, all inputs are the same size -static std::vector computeMapSize( - const at::Tensor& tensor, - const PartitionDesc& chunkDesc) { - std::vector sizes(tensor.sizes().begin(), tensor.sizes().end()); - // Should have been checked in graph fuser - JIT_ASSERT(sizes[chunkDesc.dim] % chunkDesc.nSubtensors == 0); - sizes[chunkDesc.dim] /= chunkDesc.nSubtensors; - return sizes; -} - -// XXX: this code assumes that inputs are 32-bit addressable -static uint32_t computeNumel(at::ArrayRef sizes) { - uint32_t result = 1; - if (sizes.size() == 0) { - return 1; // scalar tensor - } - for (int64_t size : sizes) { - result *= size; - } - return result; -} - -void FusedKernel::launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs) { - at::DeviceGuard device_guard(inputs); - JIT_ASSERT(inputs.size() == input_desc.size()); - JIT_ASSERT(outputs.size() == output_desc.size()); - size_t flat_inputs_size = 0; - size_t flat_outputs_size = 0; - for(auto & c : chunk_desc) - flat_inputs_size += c.nSubtensors; - for(auto & c : concat_desc) - flat_outputs_size += c.nSubtensors; - // XXX: this code assumes that inputs are 32-bit addressable - // XXX: this code assumes that all inputs are of the same size - JIT_ASSERT(inputs[0].numel() <= std::numeric_limits::max()); - - // Compute map_size, numel from the first input - at::IntList map_size; - uint32_t numel; - std::vector keep_alive_size; - if (chunk_desc[0].isNoop()) { - map_size = inputs[0].sizes(); - numel = inputs[0].numel(); - } else { - keep_alive_size = computeMapSize(inputs[0], chunk_desc[0]); - map_size = keep_alive_size; - numel = computeNumel(map_size); - } - - // Compute the storage needed to store TensorInfo structs for inputs and outputs. - size_t uncompressedDim = input_desc.at(0).contiguity.size(); - size_t maxPossibleTensorInfoSize = sizeof(TensorInfo) + 2 * sizeof(uint32_t) * uncompressedDim; - size_t maxPossibleBufferSize = maxPossibleTensorInfoSize * (flat_inputs_size + flat_outputs_size); - std::vector buffer(maxPossibleBufferSize); - char * buffer_next = buffer.data(); - // A vector of arguments to the kernel. It's (numel, *input_descs, *output_descs) - std::vector arguments; - arguments.reserve(3 + flat_inputs_size + flat_outputs_size); - auto addTensorInfoRaw = [&](TensorDesc & desc, void* data_ptr, at::IntList sizes, at::IntList strides) { - size_t nDim = desc.nDim(); // NOTE: this is the compressed dim - JIT_ASSERT(nDim <= uncompressedDim); // We'd overflow the space otherwise - auto ti = reinterpret_cast(buffer_next); - ti->data = data_ptr; - compressContiguous(sizes, strides, desc.contiguity, ti->sizes(nDim), ti->strides(nDim)); - buffer_next += maxPossibleTensorInfoSize; - arguments.push_back(ti); - }; - // Asserts that t's dims can be compressed in the same way as in desc - // (that's what the kernel assumes), and appends it to the arguments vector. 
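  // A concrete illustration (numbers are only an example): a contiguous float
  // tensor with sizes [2, 3, 4] and strides [12, 4, 1] has contiguity
  // [true, true, true], so compressContiguous collapses all three dims into a
  // single run with c_sizes = {24} and c_strides = {1}; if the outermost dim
  // were padded (say strides [16, 4, 1]) the run would break there, giving
  // c_sizes = {2, 12} and c_strides = {16, 1}.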
- auto addTensorInfo = [&](TensorDesc & desc, const at::Tensor & t) { - addTensorInfoRaw(desc, t.data_ptr(), t.sizes(), t.strides()); - }; - arguments.push_back(&numel); - for (size_t i = 0; i < input_desc.size(); ++i) { - auto & chunk = chunk_desc[i]; - const at::Tensor& tensor = inputs[i]; - if (chunk.isNoop()) { - addTensorInfo(input_desc[i], tensor); - } else { - size_t chunk_offset = map_size[chunk.dim] * tensor.stride(chunk.dim) * elementSize(tensor.type().scalarType()); - char * data_ptr = reinterpret_cast(tensor.data_ptr()); - for (size_t chunks = 0; chunks < chunk.nSubtensors; ++chunks) { - addTensorInfoRaw(*chunk.subtensorDesc, data_ptr, map_size, tensor.strides()); - data_ptr += chunk_offset; - } - } - } - for (size_t i = 0; i < output_desc.size(); ++i) { - auto & c = concat_desc[i]; - at::Tensor o = outputs[i]; - if(c.isNoop()) { - o.resize_(map_size); - addTensorInfo(output_desc[i], outputs[i]); - } else { - size_t small_size = map_size[c.dim]; - std::vector concat_size(map_size.begin(), map_size.end()); - concat_size[c.dim] = small_size * c.nSubtensors; - o.resize_(concat_size); - size_t offset = 0; - for(size_t j = 0; j < c.nSubtensors; ++j) { - // because the concatenated_output stays live, the underlying data - // in this view remains live through the end of this function - // so there is not need to hold onto this tensor - auto view = o.narrow(c.dim, offset, small_size); - addTensorInfo(*c.subtensorDesc, view); - offset += small_size; - } - } - } - - // If the kernel call contains a random op, we need to pass in random seeds as - // well. - #ifdef USE_CUDA - if(has_random && this->backend() == at::Backend::CUDA) { - auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); - uint64_t offset = - gen_->state.philox_seed_offset.fetch_add(this->get_rand_offset(numel)); - arguments.push_back(&gen_->state.initial_seed); - arguments.push_back(&offset); - } - #endif - - launch_raw(numel, arguments.data()); -} - -void FusedKernel::launch(at::ArrayRef inputs, std::vector & outputs) { - at::DeviceGuard guard(inputs.back()); - JIT_ASSERT(inputs.size() > 0); - auto & ref_type = inputs[0].type(); - outputs.clear(); - outputs.reserve(outputDescriptors().size()); - for(auto & od : outputDescriptors()) { - outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor()); - } - launch_with_tensors(inputs, outputs); -} - -//////////////////////////////////////////////////////////////////////////////// -// CUDAFusedKernel - -#ifdef USE_CUDA - -void checkCUDAVersion(const cudaDeviceProp & prop) { - if ((prop.major >= 6 && CUDA_VERSION < 8000) || - (prop.major >= 7 && CUDA_VERSION < 9000)) { - std::stringstream err_string; - err_string << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: " - << CUDA_VERSION << " for the current GPU device " << prop.name - << " with device capability " << prop.major << "." 
<< prop.minor; - throw std::runtime_error(err_string.str()); - } -} - -struct CUDAFusedKernel : public FusedKernel { - CUDAFusedKernel(const std::string & name, AnnotatedGraph & agraph) - : FusedKernel(name, agraph) { - at::DeviceGuard device_guard(agraph.device); - - TORCH_CUDA_CHECK(cudaGetDeviceProperties(&prop, agraph.device)); - checkCUDAVersion(prop); - - std::stringstream cu; - std::tie(chunk_desc, concat_desc, has_random) = codegen::emitCompilationUnit(cu, name, agraph, true); - compilation_unit = cu.str(); - nvrtcProgram program; - TORCH_NVRTC_CHECK(nvrtcCreateProgram(&program, compilation_unit.c_str(), nullptr, 0, nullptr, nullptr)); - - std::string compute = "--gpu-architecture=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); - std::vector args = {"--std=c++11", compute.c_str(), "-default-device"}; - nvrtcResult result = nvrtcCompileProgram(program, args.size(), args.data()); - if (result == NVRTC_ERROR_COMPILATION) { - size_t logsize; - nvrtcGetProgramLogSize(program, &logsize); - std::vector log(logsize); - nvrtcGetProgramLog(program, log.data()); - cu << log.data(); - throw std::runtime_error(cu.str()); - } - ResourceGuard holdProgram([&] { - TORCH_NVRTC_CHECK(nvrtcDestroyProgram(&program)); - }); - TORCH_NVRTC_CHECK(result); - - size_t ptx_size; - TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size)); - ptx.resize(ptx_size); - TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data())); - - TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data())); - TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str())); - - TORCH_CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor( - &maxBlocks, function, 128, 0)); - maxBlocks *= prop.multiProcessorCount; - } - virtual ~CUDAFusedKernel() override { - TORCH_CU_CHECK(cuModuleUnload(module)); - } -protected: - virtual at::Backend backend() const override { - return at::Backend::CUDA; - } - virtual uint64_t get_rand_offset(uint32_t numel) override { - int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); - return 4 * (ceil(numel/(4 * blockSize * numBlocks)) + 1); - } - virtual void launch_raw(uint32_t numel, void ** arguments) override { - int numBlocks = std::min(maxBlocks, ceilDiv(numel, blockSize)); - - //std::cout << "maxBlocks = " << maxBlocks << " needed blocks: " << ceilDiv(numel,blockSize) - // << " numblocks = " << numBlocks; - - // it is possible that this is the first cuda call on this thread - // so make sure we initialize the Driver API's context - // cudaFree(0) accomplishes this. - CUcontext pctx = 0; - TORCH_CU_CHECK(cuCtxGetCurrent(&pctx)); - if (!pctx) { - std::unique_lock cudaFreeMutexLock( - *(THCCachingAllocator_getCudaFreeMutex())); - cudaFree(0); - } - CUstream stream = at::cuda::getCurrentCUDAStream(); - TORCH_CU_CHECK(cuLaunchKernel( - function, - numBlocks, 1, 1, - blockSize, 1, 1, - 0, stream, - arguments, - nullptr)); - } - std::vector ptx; - CUmodule module; - CUfunction function; - - // we record prop/device so if they are availiable for launch heuristics - // querying at launch is too slow for device properties. 
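  // Gloss on the members below: blockSize is fixed at 128 threads per block,
  // and maxBlocks is computed once in the constructor from
  // cuOccupancyMaxActiveBlocksPerMultiprocessor scaled by the SM count, so
  // launch_raw only needs to clamp ceilDiv(numel, blockSize) against it.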
- int device; - cudaDeviceProp prop; - int blockSize = 128; - int maxBlocks; -}; - -#endif - -//////////////////////////////////////////////////////////////////////////////// -// CPUFusedKernel - -struct TempFile { - TH_DISALLOW_COPY_AND_ASSIGN(TempFile); - TempFile(const std::string & t, int suffix) { - // mkstemps edits its first argument in places - // so we make a copy of the string here, including null terminator - std::vector tt(t.c_str(), t.c_str() + t.size() + 1); - int fd = mkstemps(tt.data(), suffix); - JIT_ASSERT(fd != -1); - file_ = fdopen(fd, "r+"); - - // - 1 becuase tt.size() includes the null terminator, - // but std::string does not expect one - name_ = std::string(tt.begin(), tt.end() - 1); - } - const std::string & name() const { - return name_; - } - void sync() { - fflush(file_); - } - void write(const std::string & str) { - size_t result = fwrite(str.c_str(), 1, str.size(), file_); - JIT_ASSERT(str.size() == result); - } - FILE* file() { - return file_; - } - ~TempFile() { - if(file_ != nullptr) { - // unlink first to ensure another mkstemps doesn't - // race between close and unlink - unlink(name_.c_str()); - fclose(file_); - } - } -private: - FILE * file_ = nullptr; - std::string name_; -}; - -static void* checkDL(void * x) { - if(!x) { - AT_ERROR("error in dlopen or dlsym: ", dlerror()); - } - return x; -} - -struct DynamicLibrary { - TH_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); - DynamicLibrary(const char * name) { - handle = checkDL(dlopen(name, RTLD_LOCAL | RTLD_NOW)); - } - void * sym(const char * name) { - JIT_ASSERT(handle); - return checkDL(dlsym(handle, name)); - } - ~DynamicLibrary() { - if(!handle) return; - dlclose(handle); - } -private: - void * handle = nullptr; -}; - -static const std::string so_template = "/tmp/pytorch_fuserXXXXXX.so"; -static const std::string cpp_template = "/tmp/pytorch_fuserXXXXXX.cpp"; - -// NB: -march=native not supported on PPC64 g++. It's a bit annoying -// to do a configure-style test to decide whether or not the g++ -// actually supports it or not, so we heuristically use the host -// compiler to predict if the runtime compiler supports the option we -// want. This probably won't work if you're cross-compiling. -// NB: -march=native is disabled because it has caused problems where -// compiler and assembler do not agree on what native instruction they -// understand for AVX512. When we need better CPU performance this -// optimization can be re-enabled by tracking down the platforms where -// this error occurs and only selectively disabling it. -static const std::string compile_string = - "\"${cxx}\" -O3 -g " -#ifndef __PPC64__ -// "-march=native " -#endif - "-std=c++11 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm"; - -static void runCompiler(FusionCompilerConfig & config, const std::string & cpp_file, const std::string & so_file) { - TemplateEnv env; - env.s("cxx", config.cxx); - env.s("fopenmp", config.openmp ? 
"-fopenmp" : ""); - env.s("cpp_file",cpp_file); - env.s("so_file",so_file); - std::string result = format(compile_string,env); - int r = system(result.c_str()); - if(config.openmp && r != 0) { - std::cerr << "warning: pytorch jit fuser failed to compile with openmp, trying without it...\n"; - config.openmp = false; // disable for future compiles - return runCompiler(config, cpp_file, so_file); - } - JIT_ASSERTM(r == 0, "Failed to compile a fused CPU kernel"); -} - - -static const std::string disas_string = - "objdump -M intel -d \"${so_file}\""; -static void disas(const std::string & so_file) { - TemplateEnv env; - env.s("so_file", so_file); - std::string cmd = format(disas_string, env); - int r = system(cmd.c_str()); - JIT_ASSERT(r == 0); -} - -struct CPUFusedKernel : public FusedKernel { - CPUFusedKernel(const std::string & name, AnnotatedGraph & agraph, FusionCompilerConfig & config) - : FusedKernel(name, agraph) { - TempFile so_file(so_template, 3); - TempFile cpp_file(cpp_template, 4); - - std::stringstream cu; - std::tie(chunk_desc, concat_desc, has_random) = codegen::emitCompilationUnit(cu, name, agraph, false); - JIT_ASSERT(!has_random); - compilation_unit = cu.str(); - cpp_file.write(compilation_unit); - cpp_file.sync(); - runCompiler(config, cpp_file.name(), so_file.name()); - if(config.debug) { - disas(so_file.name()); - } - so_lib.reset(new DynamicLibrary(so_file.name().c_str())); -#pragma GCC diagnostic ignored "-Wpedantic" - kernel = reinterpret_cast(so_lib->sym(name.c_str())); -#pragma GCC diagnostic pop - } -protected: - virtual at::Backend backend() const override { - return at::Backend::CPU; - } - virtual uint64_t get_rand_offset(uint32_t numel) override { - return numel; - } - virtual void launch_raw(uint32_t numel, void ** arguments) override { - kernel(numel, arguments); - } - std::unique_ptr so_lib; - void (*kernel)(uint32_t, void**) = nullptr; -}; - -//////////////////////////////////////////////////////////////////////////////// -// FusedKernelCache - -// Note [Run-time shape checking code] -// There are multiple assumptions that our codegen makes, which we can't check -// in the fusion pass, because we don't have the shape information. Most notably, -// that all values (post-input-chunk, and pre-output-concat) have the same shape -// (hereinafter referred to as map size). One way to check this would be to run -// shape propagation for every size configuration we get as an input, but that -// requires a full graph traversal, and might incur unnecessary overhead. The code -// below uses a few nice properties of broadcasting rules and their interactions with -// pointwise operations, and takes a smarter approach, to quickly verify validity of -// the kernel. -// -// Notation: -// - a.s when a is a tensor is a shorthand for a.shape. -// - B is a shorthand for the broadcasting/expanding function. It is used as a -// vararg function. -// - E is a shorthand for expand function. -// - Every pointwise operation can be equivalently rewritten as -// f(a, b) = f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), -// where f^ is a non-broadcasting verison of f. -// - A set of inputs that are used to produce a certain graph output is referred to -// as the output's broadcasting group (see Lemma 2. for explanation why). -// -// Lemma 1. Set of lists of integers (shapes) + { _|_ (bottom/error marker) }, with the -// operation of broadcasting (returning bottom upon shape mismatch) forms a monoid. -// In simpler terms: broadcasting is associative, i.e. B(a, B(b, c)) == B(B(a, b), c). 
-// -// Proof. Satisfies all monoid laws: -// - Closed under broadcasting (trivial) -// - Empty shape is the identity element: B(a, []) == B([], a) == a -// - Associativity: A simple visual proof is that you can expand 3 tensors -// at the same time by stacking their sizes (with alignment to the right), -// just as you'd do in the case of 2 tensors, but with an intermediate -// (the algorithm ends up being pretty much the same). -// -// Lemma 2. Shape of an output of an arbitrary DAG of pointwise ops depends only on the set -// of inputs used in this DAG and is equal to B([i.shape for i in used_inputs]). -// -// Proof. Let G be any DAG of pointwise ops and < be any valid topological -// ordering on nodes of G. Proof by induction over <. -// Base case (graph input): -// Trivial (input is also an output). -// Step (n = f(q, r)): -// Let QS (RS) be the set of shapes of inputs that q (r) depends on. -// Note that the set of inputs that n depends on is exactly QS + RS. -// shape(n) == shape(f(q, r)) -// (def of f) -// == shape(f^(E(q, B(q.s, r.s)), E(r, B(q.s, r.s)))) -// (output shape of f^ is equal to either of argument shapes) -// == shape(E(q, B(q.s, r.s))) -// (property of expand) -// == B(q.s, r.s) -// (induction assumption) -// == B(B(QS...), B(RS...)) -// (Lemma 1.) -// == B(QS..., RS...) -// (repeated shapes don't matter for broadcasting) -// == B((QS + RS)...) -// -// Lemma 3. Expands are distributive over pointwise ops, i.e. E(f(a, b), s) = f(E(a, s), E(b, s)) -// Lemma 4. Expands can be collapsed, i.e. E(E(x, s1), s2) = E(x, B(s1, s2)). -// Proof. A simple exercise for the reader :) -// -// Theorem. If all (pre-concat-)outputs have equal shapes, then we can push the expands to -// (post-chunk-)inputs, and have all intermediates of the same shape -// (no broadcasting happening in the body). -// -// Proof. Using the above lemmas we can easily show that a graph with a single output -// can be easily rewritten by taking the shape given by B applied to all input -// shapes, expanding inputs to it, and using only non-broadcasting operations. -// Example: -// -// let d = f(a, b) in -// let e = h(b, c) in -// g(d, e) -// -// (By def. of broadcasting pointwise ops applied to g, f and h) -// (Lemma 2. for a closed formula for the size of g = gs) -// -// let gs = B(a.s, b.s, c.s) in -// let d' = E(f^(E(a, B(a.s, b.s)), E(b, B(a.s, b.s))), gs) in -// let e' = E(h^(E(b, B(b.s, c.s)), E(c, B(b.s, c.s))), gs) in -// g^(d', e') -// -// (Lemma 3.) -// -// let gs = B(a.s, b.s, c.s) in -// let d' = f^(E(E(a, B(a.s, b.s)), gs), E(E(b, B(a.s, b.s)), gs)) in -// let e' = h^(E(E(b, B(b.s, c.s)), gs), E(E(c, B(b.s, c.s)), gs)) in -// g^(d', e') -// -// (Lemma 4. + Lemma 1. to simplify broadcasting function) -// -// let gs = B(a.s, b.s, c.s) in -// let d' = f^(E(a, gs), E(b, gs)) in -// let e' = h^(E(b, gs), E(c, gs)) in -// g^(d', e') -// -// (Simple rewrite) -// -// let gs = B(a.s, b.s, c.s) in -// let a' = E(a, gs) in -// let b' = E(b, gs) in -// let c' = E(c, gs) in -// let d' = f^(a', b') in -// let e' = h^(b', c') in -// g^(d', e') -// -// This example can be easily formalized to arbitrary DAGs using induction -// over topological ordering, similar to Lemma 2. Now, if broadcasting groups -// for all outputs have the same shape, then performing an expand to this size -// on all inputs will ensure that all intermediates on all paths to outputs -// will have the same shape, proving that the body of the kernel is valid. -// -// This shows the part until post-chunk-inputs. 
Extending it to pre-chunk-inputs -// is straightforward (needs a simple lemma for moving expands through chunks). - -// Register implementations of fused operators, so that we can reuse the fused graph -// to generate fallback code. -RegisterOperators reg_fused_operators({ - Operator( - prim::FusedConcat, - [](Node* node) { - int64_t dim = node->i(attr::dim); - int64_t num_inputs = node->inputs().size(); - return [dim, num_inputs](Stack& stack) { - auto result = at::cat( - fmap(last(stack, num_inputs), [](const IValue& i) { return i.toTensor(); }), - dim - ); - drop(stack, num_inputs); - pack(stack, std::move(result)); - return 0; - }; - }) -}); - -FusedKernelCache::FusedKernelCache(FusionCompiler& compiler, std::shared_ptr _graph, int device) - : device(device) - , fallback_code(_graph) - , compiler(compiler) - , graph(std::move(_graph)) - , input_broadcast_groups(getInputBroadcastGroups()) - , input_chunks(getInputChunkDescriptors()) - , kernels() {} - -std::atomic FusedKernelCache::next_kernel_id {0}; - -auto FusedKernelCache::getInputChunkDescriptors() -> std::vector { - std::vector descs; - descs.reserve(graph->inputs().size()); - for (Value * input : graph->inputs()) { - if (Node * chunk = usedInFusedChunk(input)) { - descs.emplace_back(chunk->i(attr::chunks), chunk->i(attr::dim)); - } else { - descs.emplace_back(1, 0); - } - } - return descs; -} - -// NB: this vector is really a set, but we want to keep it contiguous in memory for faster access -static std::vector getInputDependencies(Value* output) { - // Run a DFS traversal to find all inputs that affect a given output value - std::vector queue { output }; - std::unordered_set inputs; - std::unordered_set seen; - while (!queue.empty()) { - Value * val = queue.back(); queue.pop_back(); - Node * producer = val->node(); - if (producer->kind() == prim::Param) { - inputs.insert(val); - continue; - } - for (Value * input : producer->inputs()) { - if (/*bool inserted = */seen.insert(input).second) { - queue.push_back(input); - } - } - } - - // Convert Value* into offsets into the graph's input list - std::vector offsets; - offsets.reserve(inputs.size()); - for (Value * input : inputs) { - offsets.push_back(input->offset()); - } - std::sort(offsets.begin(), offsets.end()); - return offsets; -} - -std::vector> FusedKernelCache::getInputBroadcastGroups() { - std::unordered_set, torch::hash>> broadcast_groups; - for (Value * output : graph->outputs()) { - broadcast_groups.insert(getInputDependencies(output)); - } - return std::vector>{ broadcast_groups.begin(), broadcast_groups.end() }; -} - -void FusedKernelCache::run(Stack& stack) { - int64_t num_inputs = graph->inputs().size(); - auto args = fmap(last(stack, num_inputs), [](const IValue& i) { - return i.toTensor(); - }); - - auto maybe_map_size = canRunKernel(args); - if (!maybe_map_size) { - return runFallback(stack); - } - expandArgs(args, *maybe_map_size); - - FusedKernelArgSpec spec { args }; - auto it = kernels.find(spec); - if (it == kernels.end()) { - std::tie(it, std::ignore) = kernels.emplace(spec, compileSpec(spec, *maybe_map_size)); - } - auto & fn = it->second; - - std::vector outputs; - fn->launch(args, outputs); - drop(stack, num_inputs); - stack.insert(stack.end(), std::make_move_iterator(outputs.begin()), - std::make_move_iterator(outputs.end())); -} - -at::optional> FusedKernelCache::getMapSize(at::TensorList args, at::IntList arg_subset) { - int64_t dim_after_broadcast = 0; - for (int64_t arg_idx : arg_subset) { - dim_after_broadcast = std::max(dim_after_broadcast, 
args[arg_idx].dim()); - } - // TODO: this keeps reallocating map_size at every iteration, but we know - // exactly how much storage do we need, so this could be fixed in-place at - // every step. We're just missing a few functions for ATen, but the fix - // should be straightforward. - // NB: we leave this uninitialized, because an empty size is trivially - // broadcastable to any other size. - std::vector map_size; - for (size_t i = 0; i < arg_subset.size(); ++i) { - auto & arg = args.at(arg_subset[i]); - auto & chunk_desc = input_chunks.at(arg_subset[i]); - if (chunk_desc.nSubtensors == 1) { - try { - map_size = at::infer_size(map_size, arg.sizes()); - } catch (std::exception& e) { - return at::nullopt; - } - } else { - auto tensor_sizes = arg.sizes().vec(); - int64_t num_chunks = chunk_desc.nSubtensors; - int64_t dim = at::maybe_wrap_dim(chunk_desc.dim, tensor_sizes.size()); - if (tensor_sizes[dim] % num_chunks != 0) { - return at::nullopt; - } - tensor_sizes[dim] /= num_chunks; - try { - map_size = at::infer_size(map_size, tensor_sizes); - } catch (std::exception& e) { - return at::nullopt; - } - } - } - - return {map_size}; -} - -// See Note [Run-time shape checking code] for more explanation on the algorithm. -at::optional> FusedKernelCache::canRunKernel(at::TensorList args) { - AT_CHECK(args.size() == input_chunks.size(), - "Expected ", input_chunks.size(), " arguments, but got ", args.size()); - - at::optional> map_size; - for (const auto & broadcast_group : input_broadcast_groups) { - if (!map_size) { - map_size = getMapSize(args, broadcast_group); - if (!map_size) { - return at::nullopt; - } - } else { - auto group_map_size = getMapSize(args, broadcast_group); - // NB: this checks that group_map_size is defined AND equal to map_size - if (map_size != group_map_size) { - return at::nullopt; - } - } - } - return map_size; -} - -void FusedKernelCache::runFallback(Stack& stack) { - InterpreterState(fallback_code).runOneStage(stack); -} - -// NB: args are mutated in this call. map_size is mutated too, but is restored to its original -// value before this function returns (it's an optimization). 
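// Concretely (sizes are illustrative): with map_size = [2, 4] and an argument
// that feeds a 3-way chunk along dim 0, the chunk dim is scaled back up so the
// tensor is expanded against [6, 4]; map_size is then divided by 3 again,
// restoring [2, 4] before the next argument is processed.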
-void FusedKernelCache::expandArgs(std::vector& args, std::vector& map_size) { - for (size_t i = 0; i < args.size(); ++i) { - auto & arg = args[i]; - auto & pdesc = input_chunks[i]; - if (pdesc.nSubtensors == 1) { - if (arg.sizes().equals(map_size)) continue; - arg = arg.expand(map_size); - } else { - map_size.at(pdesc.dim) *= pdesc.nSubtensors; - if (!arg.sizes().equals(map_size)) { - arg = arg.expand(map_size); - } - map_size.at(pdesc.dim) /= pdesc.nSubtensors; - } - } -} - -std::unique_ptr FusedKernelCache::compileSpec( - const FusedKernelArgSpec& spec, const std::vector& map_size) { - AnnotatedGraph agraph {*graph, device}; - - agraph.input_desc = spec.descs(); - // XXX: this assumes that fused kernels only operate on floating-point values inside - at::optional scalar_type; - for (TensorDesc& desc : agraph.input_desc) { - if (isFloatingType(desc.scalar_type)) { - scalar_type = desc.scalar_type; - break; - } - } - JIT_ASSERT(scalar_type); - - for (Value * output : graph->outputs()) { - std::vector sizes = map_size; - if (output->node()->kind() == prim::FusedConcat) { - sizes.at(output->node()->i(attr::dim)) *= output->node()->inputs().size(); - } - auto type = CompleteTensorType::create(*scalar_type, device, sizes); - agraph.output_desc.emplace_back(std::move(type)); - } - - std::string name = "kernel_" + std::to_string(next_kernel_id++); - FusedKernel * raw_func; - if (device != kCPUDevice) { -#ifdef USE_CUDA - raw_func = new CUDAFusedKernel(name, agraph); -#else - throw std::runtime_error("cannot compile a CUDA fusion group, CUDA is not enabled."); -#endif - } else { - JIT_ASSERT(compiler.canCompileOnCPU()); - raw_func = new CPUFusedKernel(name, agraph, compiler.config_); - } - return std::unique_ptr(raw_func); -} - -//////////////////////////////////////////////////////////////////////////////// -// FusionCompiler - -std::shared_ptr FusionCompiler::getOrCompile(Node* fusion_group) { - int device = fusion_group->i(attr::device); - if (device == kCPUDevice) { - JIT_ASSERT(canCompileOnCPU()); - } else { -#ifndef USE_CUDA - throw std::runtime_error("cannot compile a CUDA fusion group - CUDA is not enabled."); -#endif - } - auto graph = fusion_group->g(attr::Subgraph)->copy(); - EraseShapeInformation(*graph); - std::stringstream key; - key << "device " << device << "\n"; - key << *graph << "\n"; - std::string key_ = key.str(); - auto it = cache_map.find(key_); - if (it == cache_map.end()) { - std::tie(it, std::ignore) = cache_map.emplace(key_, std::make_shared(*this, graph, device)); - } - return it->second; -} - -std::vector FusionCompiler::debugLaunchGraph(Graph & graph, int device, at::ArrayRef inputs) { - auto wrapper_graph = std::make_shared(); - Node * fusion_group = wrapper_graph->insertNode(wrapper_graph->createFusionGroup(device)); - fusion_group->g_(attr::Subgraph, graph.copy()); - for (size_t i = 0; i < graph.inputs().size(); ++i) { - fusion_group->addInput(wrapper_graph->addInput()); - } - for (size_t i = 0; i < graph.outputs().size(); ++i) { - wrapper_graph->registerOutput(fusion_group->addOutput()); - } - auto cache = getOrCompile(fusion_group); - Stack stack = fmap(inputs); - cache->run(stack); - return fmap(stack, [](const IValue& iv) { return iv.toTensor(); }); -} - -static const std::string check_exists_string = - "which '${program}' > /dev/null"; - -static bool programExists(const std::string & program) { - TemplateEnv env; - env.s("program", program); - std::string cmd = format(check_exists_string, env); - return 0 == system(cmd.c_str()); -} - 
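// Tying the Note [Run-time shape checking code] back to the code above, a
// rough worked example (shapes are illustrative): for a broadcast group whose
// inputs have sizes [4, 1, 3] and [1, 5, 3], getMapSize folds them with
//   at::infer_size({4, 1, 3}, {1, 5, 3})  ->  {4, 5, 3}
// and canRunKernel only admits a compiled kernel if every broadcast group
// folds to that same map size; associativity of broadcasting (Lemma 1) is what
// makes the folding order irrelevant.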
-FusionCompiler::FusionCompiler() { - const char * cxx_env = getenv("CXX"); - if(cxx_env != nullptr) { - config_.cxx = cxx_env; - } - if(!programExists(config_.cxx)) { - config_.cxx = ""; - } - const char * debug_env = getenv("PYTORCH_FUSION_DEBUG"); - config_.debug = debug_env && atoi(debug_env) != 0; -} - -//TODO: thread safety -FusionCompiler & sharedFusionCompiler() { - static FusionCompiler compiler; - return compiler; -} - -}} - -# else -// dummy implementations for windows - -#include "torch/csrc/jit/fusion_compiler.h" -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/code_template.h" -#include "torch/csrc/jit/resource_guard.h" -#include "torch/csrc/utils/disallow_copy.h" -#include "ATen/ATen.h" -#ifdef USE_CUDA -#include "torch/csrc/cuda/cuda_check.h" -#include -#include -#include -#endif -#include -#include -#include -#include -#include -#include - -namespace torch { namespace jit { - -struct FusedKernel { - char padding; -}; - -FusedKernelCache::FusedKernelCache(FusionCompiler& compiler, std::shared_ptr graph, int device) - : compiler(compiler) {} -void FusedKernelCache::run(Stack& inputs) {} -void FusedKernelCache::runFallback(Stack& stack) {} -void FusedKernelCache::expandArgs(std::vector& args, std::vector& map_size) {} -at::optional> FusedKernelCache::canRunKernel(at::TensorList args) { return at::nullopt; } -at::optional> FusedKernelCache::getMapSize(at::TensorList args, at::IntList arg_subset) { return at::nullopt; } -std::vector> FusedKernelCache::getInputBroadcastGroups() { return {}; } -auto FusedKernelCache::getInputChunkDescriptors() -> std::vector { return {}; } -std::unique_ptr FusedKernelCache::compileSpec( - const FusedKernelArgSpec& spec, const std::vector& map_size) { return nullptr; } -std::atomic FusedKernelCache::next_kernel_id {0}; - -FusionCompiler::FusionCompiler() {} -std::shared_ptr FusionCompiler::getOrCompile(Node* fusion_group) { return nullptr; } -std::vector FusionCompiler::debugLaunchGraph(Graph & graph, int device, at::ArrayRef inputs) { return {}; } - -FusionCompiler & sharedFusionCompiler() { - throw std::runtime_error("NYI: fuser is not supported on Windows."); -} - -}} - -# endif diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h deleted file mode 100644 index a24fe0817e0517..00000000000000 --- a/torch/csrc/jit/fusion_compiler.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -#include -#include "torch/csrc/utils/disallow_copy.h" -#include "torch/csrc/utils/hash.h" -#include -#include -#include -#include - -#include "ATen/ATen.h" -#include -#include -#include -#include -#include - -namespace torch { namespace jit { - -struct FusedKernel; -struct FusionCompiler; - -// type information needed by the compiler for input/outputs -// contiguity[i] is true if the dim i is contiguous with dim i + 1. -// contiguity.back() == true means strides.back() == 1. -struct TensorDesc { - at::ScalarType scalar_type; - std::vector contiguity; - - TensorDesc(const at::ScalarType& type, const std::vector& contiguity) - : scalar_type(type), contiguity(contiguity) { - if (contiguity.size() == 0) { - nDim_ = 0; - } else { - nDim_ = std::count(contiguity.begin(), contiguity.end(), false) + (lastIsContiguous() ? 
1 : 0); - } - } - - TensorDesc(const at::ScalarType& type, const at::IntList& sizes, const at::IntList& strides) - : TensorDesc(type, TensorDesc::findContiguous(sizes, strides)) {} - TensorDesc(const at::Tensor& t) - : TensorDesc(t.type().scalarType(), t.sizes(), t.strides()) {} - TensorDesc(CompleteTensorTypePtr type) - : TensorDesc(type->scalarType(), type->sizes(), type->strides()) {} - - // number of dimensions after contiguity compression - size_t nDim() const { - return nDim_; - } - - // do we have inner stride == 1? - bool lastIsContiguous() const { - return contiguity.size() == 0 || contiguity.back(); - } - - static std::vector findContiguous( - const at::IntList& sizes, - const at::IntList& strides); - - bool operator==(const TensorDesc & desc) const { - return scalar_type == desc.scalar_type && contiguity == desc.contiguity; - } - bool operator!=(const TensorDesc & desc) const { - return !(*this == desc); - } - static size_t hash(const TensorDesc& spec) { - return torch::get_hash(spec.scalar_type, spec.nDim_, std::hash>{}(spec.contiguity)); - } - -private: - size_t nDim_; -}; - -inline std::ostream& operator<<(std::ostream & out, const TensorDesc & d) { - out << d.scalar_type << "["; - for(auto b : d.contiguity) - out << b << ";"; - out << "]"; - return out; -} - -struct FusedKernelArgSpec { - FusedKernelArgSpec(at::TensorList inputs) - : descs_(fmap(inputs)) - , hash_code_(torch::get_hash(inputs.size(), descs_)) {} - - bool operator==(const FusedKernelArgSpec & spec) const { - return hash_code_ == spec.hash_code_ && descs_ == spec.descs_; - } - bool operator!=(const FusedKernelArgSpec & spec) const { - return !(*this == spec); - } - static size_t hash(const FusedKernelArgSpec& spec) { - return spec.hash_code_; - } - const std::vector& descs() const { - return descs_; - } - -private: - std::vector descs_; - size_t hash_code_; -}; - -constexpr int kCPUDevice = -1; -struct AnnotatedGraph { - // short-term storage only, so it borrows Graph. - AnnotatedGraph(Graph & graph, int device) - : graph(&graph), device(device) {} - Graph* graph = nullptr; // TODO: this should really be const - int device = kCPUDevice; - std::vector input_desc; - std::vector output_desc; -}; - -// FusionCompiler has very limited shape information available at the time getOrCompile -// is called, and this is why it can't really prepare the kernels at that time. Instead, -// it returns this object, which will take care of matching the run-time shapes to whatever -// kernels we have compiled already. -// -// Two configurations are considered eligible for the same fused kernel if: -// - the shapes satisfy graph invariants for our fused code (e.g. that all intermediate shapes -// are the same - see fusion_compiler.cpp for more details). 
-// - their FusedKernelArgSpecs compare equal -struct FusedKernelCache { - FusedKernelCache(FusionCompiler& compiler, std::shared_ptr graph, int device); - - void run(Stack& inputs); -private: - struct PartitionInfo { - PartitionInfo(int64_t nsub, int64_t dim) - : nSubtensors(nsub), dim(dim) {}; - int64_t nSubtensors; - int64_t dim; - }; - - void runFallback(Stack& stack); - void expandArgs(std::vector& args, std::vector& map_size); - at::optional> canRunKernel(at::TensorList args); - at::optional> getMapSize(at::TensorList args, at::IntList arg_subset); - std::vector> getInputBroadcastGroups(); - std::vector getInputChunkDescriptors(); - std::unique_ptr compileSpec( - const FusedKernelArgSpec& spec, const std::vector& map_size); - - static std::atomic next_kernel_id; - - int device; - Code fallback_code; - FusionCompiler& compiler; - std::shared_ptr graph; - std::vector> input_broadcast_groups; - std::vector input_chunks; - std::unordered_map, torch::hash> kernels; -}; - -struct FusionCompilerConfig { - std::string cxx = "g++"; // compiler location - bool debug = false; // emit debugging information about fusions - bool openmp = true; -}; - -// caching compiler -struct FusionCompiler { - friend struct FusedKernelCache; - - FusionCompiler(); - TH_DISALLOW_COPY_AND_ASSIGN(FusionCompiler); - - // uses type annotations in fusion_group to create Annotated graph - std::shared_ptr getOrCompile(Node * fusion_group); - - // debugging function that lets you do everything from compilation to execution - // in one step. - // this should not be used in the hot path of execution because it has to serialize - // the graph each time - std::vector debugLaunchGraph(Graph & graph, int device, at::ArrayRef inputs); - bool canCompileOnCPU() const { - return config_.cxx.size() > 0; - } -private: - FusionCompilerConfig config_; - std::unordered_map> cache_map; -}; - -FusionCompiler & sharedFusionCompiler(); - -}} diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 5484c04716c406..ab9a9ee4a81518 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -33,6 +33,7 @@ #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/serialization.h" #include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/fusers/interface.h" #include @@ -115,7 +116,8 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_onnx_block", BlockToONNX) .def("_jit_pass_fixup_onnx_loops", FixupONNXLoops) .def("_jit_pass_canonicalize_ops", CanonicalizeOps) - .def("_jit_pass_specialize_undef", specializeUndef) + .def("_jit_pass_specialize_undef", specializeUndef) + .def("_jit_override_can_fuse_on_cpu", &overrideCanFuseOnCPU) .def("_jit_differentiate", [](Graph &g) { // the python binding slightly differs in semantics // it makes a copy of the input Graph, and works on that diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 6864d8ad797087..d7e8c31f8cb5cf 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -6,7 +6,6 @@ #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/fusion_compiler.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/ivalue.h" diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 81a1768f48a302..e8123690789b24 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -1,7 +1,7 @@ #include 
"torch/csrc/jit/passes/graph_fuser.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" #include "torch/csrc/jit/symbolic_variable.h" -#include "torch/csrc/jit/fusion_compiler.h" +#include "torch/csrc/jit/fusers/interface.h" #include "torch/csrc/jit/autodiff.h" #include "torch/csrc/jit/assertions.h" #include "ATen/ExpandUtils.h" @@ -343,7 +343,7 @@ struct GraphFuser { // is that if we're compiling on CPU, the fusion compiler works. if (consumer_device.type() == DeviceType::CPU || producer_device.type() == DeviceType::CPU) { - return sharedFusionCompiler().canCompileOnCPU(); + return canFuseOnCPU(); } return true; } diff --git a/torch/csrc/jit/python_interpreter.cpp b/torch/csrc/jit/python_interpreter.cpp index 85f0c37c4ea7e3..5c2115a20413d8 100644 --- a/torch/csrc/jit/python_interpreter.cpp +++ b/torch/csrc/jit/python_interpreter.cpp @@ -5,7 +5,6 @@ #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/fusion_compiler.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/jit/custom_operator.h" #include "torch/csrc/jit/graph_executor.h" diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 13f5f6dbbad275..42184ac318ad58 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -3,7 +3,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/fusion_compiler.h" +#include "torch/csrc/jit/fusers/interface.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/operator.h" @@ -53,10 +53,10 @@ RegisterOperators reg({ Operator( prim::FusionGroup, [](Node* node) { - auto kernel_cache = sharedFusionCompiler().getOrCompile(node); - return [kernel_cache](Stack& stack) { + auto handle = getFusionHandle(node); + return [handle](Stack& stack) { autograd::profiler::RecordFunction record("FusionGroup"); - kernel_cache->run(stack); + handle->run(stack); return 0; }; }), diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 7a652970243f34..28bf95847d634f 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -12,7 +12,7 @@ using Catch::StartsWith; #endif #include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/fusion_compiler.h" +#include "torch/csrc/jit/fusers/interface.h" #include "torch/csrc/jit/code_template.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/attributes.h" @@ -136,8 +136,6 @@ Value * appendNewNode(NodeKind kind, Graph& graph, ArrayRef inputs) { static void fusionTests() { - FusionCompiler comp; - auto testSimple = [&] { Graph graph; Var i0 = Var::asNewInput(graph); @@ -147,7 +145,7 @@ static void fusionTests() { auto a = at::rand({3,4}, at::kCUDA); auto b = at::rand({4,3}, at::kCUDA).transpose(0,1); auto o = at::zeros({3,4}, at::kCUDA); - auto outputs = comp.debugLaunchGraph(graph, 0, {a,b}); + auto outputs = debugLaunchGraph(graph, 0, {a,b}); REQUIRE(outputs.size() == 1); auto o2 = a*b; float max_diff = (o2 - outputs[0]).abs().max().toCDouble(); @@ -201,7 +199,7 @@ static void fusionTests() { auto t5 = out1.tanh(); auto out0 = t16*t5; - auto outputs = comp.debugLaunchGraph(graph, 0, inputs); + auto outputs = debugLaunchGraph(graph, 0, inputs); REQUIRE(outputs.size() == graph.outputs().size()); REQUIRE(out0.is_same_size(outputs.front())); float max_diff = 
(outputs.front() - out0).abs().max().toCDouble(); @@ -235,7 +233,7 @@ static void fusionTests() { auto o_r = a*b; auto o2_r = at::cat({a, o_r}, dim); - auto outputs = comp.debugLaunchGraph(graph, 0, {a,b}); + auto outputs = debugLaunchGraph(graph, 0, {a,b}); REQUIRE(outputs.size() == 2); float max_diff = (o_r - outputs[0]).abs().max().toCDouble(); diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index e4543a6be12b87..5f6a1c1e011167 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -315,6 +315,7 @@ def __init__(self, graph_diff_error, tensor_compare_error, extra_msg=None): # Check the traced module against a set of user-provided validation inputs +@torch.no_grad() def _check_trace(check_inputs, func, executor_options, module, check_tolerance): for inputs in check_inputs: if isinstance(inputs, torch.Tensor): From 8e3f8c52e85ee91995f8a9137a4649f5c39dfc3a Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Fri, 14 Sep 2018 15:06:12 -0700 Subject: [PATCH 167/237] Document the Sequential module (#11648) Summary: 1. Document the Sequential module in the C++ API at a high, why-does-this-exist, and low, how-to-use, level 2. Change the Sequential tests to be in a style that makes them easier to convert to gtest. No code changes. ebetica ezyang apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11648 Differential Revision: D9834526 Pulled By: goldsborough fbshipit-source-id: 39f2f5c6cbbf8ed5a1b69986978c8ef127036de1 --- docs/cpp/Doxyfile | 4 +- test/cpp/api/sequential.cpp | 541 +++++++++--------- torch/csrc/api/include/torch/jit.h | 2 +- torch/csrc/api/include/torch/nn/module.h | 8 +- torch/csrc/api/include/torch/nn/modules/any.h | 8 +- .../api/include/torch/nn/modules/sequential.h | 123 +++- 6 files changed, 396 insertions(+), 290 deletions(-) diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile index a21dcf8184ad52..3c942a9025c2dd 100644 --- a/docs/cpp/Doxyfile +++ b/docs/cpp/Doxyfile @@ -419,7 +419,7 @@ EXTRACT_ALL = YES # be included in the documentation. # The default value is: NO. -EXTRACT_PRIVATE = YES +EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. @@ -799,7 +799,7 @@ INPUT_ENCODING = UTF-8 # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, # *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. -FILE_PATTERNS = *.h *.cpp +FILE_PATTERNS = *.h # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. 
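The restructured tests below double as usage documentation for Sequential; a
minimal sketch of the pattern they exercise (layer sizes are illustrative):

    #include <torch/torch.h>

    void sequential_example() {
      // Chain a few standard modules; forward() threads the input through
      // them in order and returns the last module's output.
      torch::nn::Sequential sequential(
          torch::nn::Linear(10, 3),
          torch::nn::Functional(torch::relu),
          torch::nn::Linear(3, 5));

      auto x = torch::randn({8, 10});
      auto y = sequential->forward(x);  // y has shape [8, 5]
    }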
diff --git a/test/cpp/api/sequential.cpp b/test/cpp/api/sequential.cpp index 389393fe480b21..aef1332720e4a4 100644 --- a/test/cpp/api/sequential.cpp +++ b/test/cpp/api/sequential.cpp @@ -1,11 +1,16 @@ #include #include +#include +#include +#include #include +#include #include #include #include +#include #include #include @@ -16,298 +21,304 @@ using namespace torch::test; using Catch::StartsWith; -TEST_CASE("sequential") { - SECTION("construction from shared pointer") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int value; - int forward() { - return value; - } - }; - Sequential sequential( - std::make_shared(1), std::make_shared(2), std::make_shared(3)); - REQUIRE(sequential->size() == 3); - } - SECTION("construction from concrete type") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int value; - int forward() { - return value; - } - }; - - Sequential sequential(M(1), M(2), M(3)); - REQUIRE(sequential->size() == 3); - } - SECTION("construction from module holders") { - struct MImpl : torch::nn::Module { - explicit MImpl(int value_) : value(value_) {} - int forward() { - return value; - } - int value; - }; - - struct M : torch::nn::ModuleHolder { - using torch::nn::ModuleHolder::ModuleHolder; - using torch::nn::ModuleHolder::get; - }; - - Sequential sequential(M(1), M(2), M(3)); - REQUIRE(sequential->size() == 3); - } - SECTION("push_back") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int forward() { - return value; - } - int value; - }; - Sequential sequential; - REQUIRE(sequential->size() == 0); - REQUIRE(sequential->is_empty()); - sequential->push_back(Linear(3, 4)); - REQUIRE(sequential->size() == 1); - sequential->push_back(std::make_shared(1)); - REQUIRE(sequential->size() == 2); - sequential->push_back(M(2)); - REQUIRE(sequential->size() == 3); - } - SECTION("access") { - struct M : torch::nn::Module { - explicit M(int value_) : value(value_) {} - int forward() { - return value; - } - int value; - }; - std::vector> modules = { - std::make_shared(1), std::make_shared(2), std::make_shared(3)}; - - Sequential sequential; - for (auto& module : modules) { - sequential->push_back(module); - } - REQUIRE(sequential->size() == 3); - - SECTION("at()") { - SECTION("returns the correct module for a given index") { - for (size_t i = 0; i < modules.size(); ++i) { - REQUIRE(&sequential->at(i) == modules[i].get()); - } - } - SECTION("throws for a bad index") { - REQUIRE_THROWS_WITH( - sequential->at(modules.size() + 1), - StartsWith("Index out of range")); - REQUIRE_THROWS_WITH( - sequential->at(modules.size() + 1000000), - StartsWith("Index out of range")); - } +TEST_CASE("Sequential/ConstructsFromSharedPointer") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int value; + int forward() { + return value; } + }; + Sequential sequential( + std::make_shared(1), std::make_shared(2), std::make_shared(3)); + REQUIRE(sequential->size() == 3); +} - SECTION("ptr()") { - SECTION("returns the correct module for a given index") { - for (size_t i = 0; i < modules.size(); ++i) { - REQUIRE(sequential->ptr(i).get() == modules[i].get()); - REQUIRE(sequential[i].get() == modules[i].get()); - REQUIRE(sequential->ptr(i).get() == modules[i].get()); - } - } - SECTION("throws for a bad index") { - REQUIRE_THROWS_WITH( - sequential->ptr(modules.size() + 1), - StartsWith("Index out of range")); - REQUIRE_THROWS_WITH( - sequential->ptr(modules.size() + 1000000), - StartsWith("Index out of 
range")); - } +TEST_CASE("Sequential/ConstructsFromConcreteType") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int value; + int forward() { + return value; } - } - SECTION("forward") { - SECTION("calling forward() on an empty sequential is disallowed") { - Sequential empty; - REQUIRE_THROWS_WITH( - empty->forward(), - StartsWith("Cannot call forward() on an empty Sequential")); + }; + + Sequential sequential(M(1), M(2), M(3)); + REQUIRE(sequential->size() == 3); +} +TEST_CASE("Sequential/ConstructsFromModuleHolder") { + struct MImpl : torch::nn::Module { + explicit MImpl(int value_) : value(value_) {} + int forward() { + return value; } + int value; + }; - SECTION("calling forward() on a non-empty sequential chains correctly") { - struct MockModule : torch::nn::Module { - explicit MockModule(int value) : expected(value) {} - int expected; - int forward(int value) { - REQUIRE(value == expected); - return value + 1; - } - }; + struct M : torch::nn::ModuleHolder { + using torch::nn::ModuleHolder::ModuleHolder; + using torch::nn::ModuleHolder::get; + }; - Sequential sequential(MockModule{1}, MockModule{2}, MockModule{3}); + Sequential sequential(M(1), M(2), M(3)); + REQUIRE(sequential->size() == 3); +} - REQUIRE(sequential->forward(1) == 4); +TEST_CASE("Sequential/PushBackAddsAnElement") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int forward() { + return value; } + int value; + }; + Sequential sequential; + REQUIRE(sequential->size() == 0); + REQUIRE(sequential->is_empty()); + sequential->push_back(Linear(3, 4)); + REQUIRE(sequential->size() == 1); + sequential->push_back(std::make_shared(1)); + REQUIRE(sequential->size() == 2); + sequential->push_back(M(2)); + REQUIRE(sequential->size() == 3); +} - SECTION("calling forward() with the wrong return type throws") { - struct M : public torch::nn::Module { - int forward() { - return 5; - } - }; - - Sequential sequential(M{}); - REQUIRE(sequential->forward() == 5); - REQUIRE_THROWS_WITH( - sequential->forward(), - StartsWith("The type of the return value " - "is int, but you asked for type float")); +TEST_CASE("Sequential/AccessWithAt") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int forward() { + return value; } + int value; + }; + std::vector> modules = { + std::make_shared(1), std::make_shared(2), std::make_shared(3)}; + + Sequential sequential; + for (auto& module : modules) { + sequential->push_back(module); + } + REQUIRE(sequential->size() == 3); - SECTION("The return type of forward() defaults to Tensor") { - struct M : public torch::nn::Module { - torch::Tensor forward(torch::Tensor v) { - return v; - } - }; - - Sequential sequential(M{}); - auto variable = torch::ones({3, 3}, torch::requires_grad()); - REQUIRE(sequential->forward(variable).equal(variable)); - } + // returns the correct module for a given index + for (size_t i = 0; i < modules.size(); ++i) { + REQUIRE(&sequential->at(i) == modules[i].get()); } - SECTION("returns the last value") { - torch::manual_seed(0); - Sequential sequential(Linear(10, 3), Linear(3, 5), Linear(5, 100)); + // throws for a bad index + REQUIRE_THROWS_WITH( + sequential->at(modules.size() + 1), StartsWith("Index out of range")); + REQUIRE_THROWS_WITH( + sequential->at(modules.size() + 1000000), + StartsWith("Index out of range")); +} - auto x = torch::randn({1000, 10}, torch::requires_grad()); - auto y = sequential->forward(x); - REQUIRE(y.ndimension() == 2); - REQUIRE(y.size(0) == 1000); - 
REQUIRE(y.size(1) == 100); +TEST_CASE("Sequential/AccessWithPtr") { + struct M : torch::nn::Module { + explicit M(int value_) : value(value_) {} + int forward() { + return value; + } + int value; + }; + std::vector> modules = { + std::make_shared(1), std::make_shared(2), std::make_shared(3)}; + + Sequential sequential; + for (auto& module : modules) { + sequential->push_back(module); } + REQUIRE(sequential->size() == 3); - SECTION("can hold other important modules") { - Sequential sequential( - Linear(10, 3), - Conv2d(1, 2, 3), - Dropout(0.5), - BatchNorm(5), - Embedding(4, 10), - LSTM(4, 5)); + // returns the correct module for a given index + for (size_t i = 0; i < modules.size(); ++i) { + REQUIRE(sequential->ptr(i).get() == modules[i].get()); + REQUIRE(sequential[i].get() == modules[i].get()); + REQUIRE(sequential->ptr(i).get() == modules[i].get()); } - SECTION("converts at::Tensor to torch::Tensor correctly") { - struct M : torch::nn::Module { - torch::Tensor forward(torch::Tensor input) { - return input; - } - }; - - Sequential sequential(M{}); - torch::Tensor variable = torch::ones(5); - REQUIRE(sequential->forward(variable).sum().toCFloat() == 5); - - at::Tensor tensor_that_is_actually_a_variable = variable * 2; - REQUIRE( - sequential->forward(tensor_that_is_actually_a_variable) - .sum() - .toCFloat() == 10); - } - SECTION("extend() pushes modules from other Sequential") { - struct A : torch::nn::Module { - int forward(int x) { - return x; - } - }; - struct B : torch::nn::Module { - int forward(int x) { - return x; - } - }; - struct C : torch::nn::Module { - int forward(int x) { - return x; - } - }; - struct D : torch::nn::Module { - int forward(int x) { - return x; - } - }; - Sequential a(A{}, B{}); - Sequential b(C{}, D{}); - a->extend(*b); - - REQUIRE(a->size() == 4); - REQUIRE(a[0]->as()); - REQUIRE(a[1]->as()); - REQUIRE(a[2]->as()); - REQUIRE(a[3]->as()); - - REQUIRE(b->size() == 2); - REQUIRE(b[0]->as()); - REQUIRE(b[1]->as()); - - std::vector> c = {std::make_shared(), - std::make_shared()}; - b->extend(c); - - REQUIRE(b->size() == 4); - REQUIRE(b[0]->as()); - REQUIRE(b[1]->as()); - REQUIRE(b[2]->as()); - REQUIRE(b[3]->as()); - } - SECTION("has reference semantics") { - Sequential first(Linear(2, 3), Linear(4, 4), Linear(4, 5)); - Sequential second(first); - - REQUIRE(first.get() == second.get()); - REQUIRE(first->size() == second->size()); - REQUIRE(std::equal( - first->begin(), - first->end(), - second->begin(), - [](const AnyModule& first, const AnyModule& second) { - return &first == &second; - })); - } - SECTION("Is cloneable") { - Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3)); - Sequential clone = - std::dynamic_pointer_cast(sequential->clone()); - REQUIRE(sequential->size() == clone->size()); - - for (size_t i = 0; i < sequential->size(); ++i) { - // The modules should be the same kind (type). - REQUIRE(sequential[i]->name() == clone[i]->name()); - // But not pointer-equal (distinct objects). 
- REQUIRE(sequential[i] != clone[i]); + // throws for a bad index + REQUIRE_THROWS_WITH( + sequential->ptr(modules.size() + 1), StartsWith("Index out of range")); + REQUIRE_THROWS_WITH( + sequential->ptr(modules.size() + 1000000), + StartsWith("Index out of range")); +} + +TEST_CASE("Sequential/CallingForwardOnEmptySequentialIsDisallowed") { + Sequential empty; + REQUIRE_THROWS_WITH( + empty->forward(), + StartsWith("Cannot call forward() on an empty Sequential")); +} + +TEST_CASE("Sequential/CallingForwardChainsCorrectly") { + struct MockModule : torch::nn::Module { + explicit MockModule(int value) : expected(value) {} + int expected; + int forward(int value) { + REQUIRE(value == expected); + return value + 1; } + }; - // Verify that the clone is deep, i.e. parameters of modules are cloned too. + Sequential sequential(MockModule{1}, MockModule{2}, MockModule{3}); - torch::NoGradGuard no_grad; + REQUIRE(sequential->forward(1) == 4); +} - auto params1 = sequential->parameters(); - auto params2 = clone->parameters(); - REQUIRE(params1.size() == params2.size()); - for (auto& param : params1) { - REQUIRE(!pointer_equal(param.value, params2[param.key])); - REQUIRE(param->device() == params2[param.key].device()); - REQUIRE(param->allclose(params2[param.key])); - param->add_(2); +TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") { + struct M : public torch::nn::Module { + int forward() { + return 5; } - for (auto& param : params1) { - REQUIRE(!param->allclose(params2[param.key])); + }; + + Sequential sequential(M{}); + REQUIRE(sequential->forward() == 5); + REQUIRE_THROWS_WITH( + sequential->forward(), + StartsWith("The type of the return value " + "is int, but you asked for type float")); +} + +TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") { + struct M : public torch::nn::Module { + torch::Tensor forward(torch::Tensor v) { + return v; } + }; + + Sequential sequential(M{}); + auto variable = torch::ones({3, 3}, torch::requires_grad()); + REQUIRE(sequential->forward(variable).equal(variable)); +} + +TEST_CASE("Sequential/ForwardReturnsTheLastValue") { + torch::manual_seed(0); + Sequential sequential(Linear(10, 3), Linear(3, 5), Linear(5, 100)); + + auto x = torch::randn({1000, 10}, torch::requires_grad()); + auto y = sequential->forward(x); + REQUIRE(y.ndimension() == 2); + REQUIRE(y.size(0) == 1000); + REQUIRE(y.size(1) == 100); +} + +TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") { + Sequential sequential( + Linear(10, 3), + Conv2d(1, 2, 3), + Dropout(0.5), + BatchNorm(5), + Embedding(4, 10), + LSTM(4, 5)); +} + +TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") { + struct A : torch::nn::Module { + int forward(int x) { + return x; + } + }; + struct B : torch::nn::Module { + int forward(int x) { + return x; + } + }; + struct C : torch::nn::Module { + int forward(int x) { + return x; + } + }; + struct D : torch::nn::Module { + int forward(int x) { + return x; + } + }; + Sequential a(A{}, B{}); + Sequential b(C{}, D{}); + a->extend(*b); + + REQUIRE(a->size() == 4); + REQUIRE(a[0]->as()); + REQUIRE(a[1]->as()); + REQUIRE(a[2]->as()); + REQUIRE(a[3]->as()); + + REQUIRE(b->size() == 2); + REQUIRE(b[0]->as()); + REQUIRE(b[1]->as()); + + std::vector> c = {std::make_shared(), + std::make_shared()}; + b->extend(c); + + REQUIRE(b->size() == 4); + REQUIRE(b[0]->as()); + REQUIRE(b[1]->as()); + REQUIRE(b[2]->as()); + REQUIRE(b[3]->as()); +} + +TEST_CASE("Sequential/HasReferenceSemantics") { + Sequential first(Linear(2, 3), Linear(4, 4), Linear(4, 
5)); + Sequential second(first); + + REQUIRE(first.get() == second.get()); + REQUIRE(first->size() == second->size()); + REQUIRE(std::equal( + first->begin(), + first->end(), + second->begin(), + [](const AnyModule& first, const AnyModule& second) { + return &first == &second; + })); +} + +TEST_CASE("Sequential/IsCloneable") { + Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3)); + Sequential clone = + std::dynamic_pointer_cast(sequential->clone()); + REQUIRE(sequential->size() == clone->size()); + + for (size_t i = 0; i < sequential->size(); ++i) { + // The modules should be the same kind (type). + REQUIRE(sequential[i]->name() == clone[i]->name()); + // But not pointer-equal (distinct objects). + REQUIRE(sequential[i] != clone[i]); } + + // Verify that the clone is deep, i.e. parameters of modules are cloned too. + + torch::NoGradGuard no_grad; + + auto params1 = sequential->parameters(); + auto params2 = clone->parameters(); + REQUIRE(params1.size() == params2.size()); + for (auto& param : params1) { + REQUIRE(!pointer_equal(param.value, params2[param.key])); + REQUIRE(param->device() == params2[param.key].device()); + REQUIRE(param->allclose(params2[param.key])); + param->add_(2); + } + for (auto& param : params1) { + REQUIRE(!param->allclose(params2[param.key])); + } +} + +TEST_CASE("Sequential/RegistersElementsAsSubmodules") { + Sequential sequential(Linear(10, 3), Conv2d(1, 2, 3), FeatureDropout(0.5)); + + auto modules = sequential->modules(); + REQUIRE(modules.size() == sequential->children().size()); + + REQUIRE(modules[0]->as()); + REQUIRE(modules[1]->as()); + REQUIRE(modules[2]->as()); } -TEST_CASE("sequential/clone-to-device", "[cuda]") { +TEST_CASE("Sequential/CloneToDevice", "[cuda]") { Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3)); torch::Device device(torch::kCUDA, 0); Sequential clone = diff --git a/torch/csrc/api/include/torch/jit.h b/torch/csrc/api/include/torch/jit.h index 5a4b262101ab68..d43a22c3e9f50d 100644 --- a/torch/csrc/api/include/torch/jit.h +++ b/torch/csrc/api/include/torch/jit.h @@ -17,7 +17,7 @@ namespace jit { /// /// For example: /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// auto module = torch::jit::compile(R"JIT( /// def relu_script(a, b): /// return torch.relu(a + b) diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index ccfa10a90ad2a4..2cb71e3470574e 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -225,8 +225,8 @@ class Module { /// } /// } /// - /// MyModule module; - /// module->modules().apply(initialize_weights); + /// MyModule module; + /// module->modules().apply(initialize_weights); /// \endrst template typename ModuleType::ContainedType* as() noexcept; @@ -244,8 +244,8 @@ class Module { /// } /// } /// - /// MyModule module; - /// module->modules().apply(initialize_weights); + /// MyModule module; + /// module->modules().apply(initialize_weights); /// \endrst template < typename ModuleType, diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 626149469b5035..512e6d2cc1c5af 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -38,7 +38,7 @@ namespace nn { /// Example: /// /// \rst -/// .. code-block:: +/// .. 
code-block:: cpp /// struct GenericTrainer { /// torch::nn::AnyModule module; /// @@ -57,7 +57,7 @@ namespace nn { /// `AnyModule` will compile, but throw an exception at runtime: /// /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); /// // Linear takes a tensor as input, but we are passing an integer. /// // This will compile, but throw a `torch::Error` exception at runtime. @@ -79,7 +79,7 @@ namespace nn { /// for example. /// /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); /// auto output = module.forward(torch::ones({2, 3})); /// @@ -97,7 +97,7 @@ namespace nn { /// using `.get()` where `T` is the concrete module type. /// /// \rst -/// .. code-block:: +/// .. code-block:: cpp /// torch::nn::AnyModule module(torch::nn::Linear(3, 4)); /// std::shared_ptr ptr = module.ptr(); /// torch::nn::Linear linear(module.get()); diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index 384afd1b7814bb..e4839ac41a910b 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -18,18 +18,81 @@ namespace torch { namespace nn { -/// A `Sequential` module is a container for any number of other modules. Its -/// `forward()` method chains outputs to inputs and returns the final output. -/// The `Sequential` class reference semantics. + +/// A list of `Module`s that acts as a `Module` itself. +/// +/// A `Sequential` is fundamentally a list of `Module`s, each with a `forward()` +/// method. `Sequential` provides a `forward()` method of its own, which accepts +/// any input and forwards it to the first module it stores. It then "chains" +/// outputs to inputs sequentially for each subsequent module, finally returning +/// the output of the last module. For example: +/// +/// \rst +/// .. code-block:: cpp +/// +/// torch::nn::Sequential seq( +/// torch::nn::Linear(3, 4), +/// torch::nn::BatchNorm(4), +/// torch::nn::Dropout(0.5) +/// ); +/// +/// auto output = seq->forward(torch::ones(3)); +/// +/// \endrst +/// +/// This can conceptually be thought of as the following loop (using Python as +/// pseudocode): +/// +/// \rst +/// .. code-block:: python +/// +/// def forward(sequential, input): +/// for module in sequential: +/// input = module(input) +/// return input +/// +/// \endrst +/// +/// Why should you use `Sequential` instead of a simple `std::vector`? The value +/// a `Sequential` provides over manually calling a sequence of modules is that +/// it allows treating the whole container *as a single module*, such that +/// performing a transformation on the `Sequential` applies to each of the +/// modules it stores (which are each a registered submodule of the +/// `Sequential`). For example, calling +/// `.to(torch::kCUDA)` on a `Sequential` will move each module in the list to +/// CUDA memory. For example: +/// +/// \rst +/// .. code-block:: cpp +/// +/// torch::nn::Sequential seq( +/// torch::nn::Linear(3, 4), +/// torch::nn::BatchNorm(4), +/// torch::nn::Dropout(0.5) +/// ); +/// +/// // Convert all modules to CUDA. +/// seq->to(torch::kCUDA); +/// +/// \endrst +/// +/// Finally, `Sequential` provides a lightweight container API, such as allowing +/// iteration over submodules, positional access, adding a new module after +/// construction via `push_back`, as well as joining two `Sequential`s via +/// `extend`. +/// +/// \rst +/// .. 
attention:: +/// One current limitation of `Sequential` is that all except the first module +/// must accept a single argument. If your modules need to take multiple +/// arguments, you should define them to take and return tuples. +/// \endrst class SequentialImpl : public Cloneable { public: using Iterator = std::vector::iterator; using ConstIterator = std::vector::const_iterator; - /// Constructs the `Sequential` from a pack of modules. Each module can either - /// be a plain value (e.g. `Linear`) or a boxed value (e.g. - /// `shared_ptr`). Unboxed modules will be moved into `shared_ptr`s - /// internally. + /// Constructs the `Sequential` from a variadic list of modules. template explicit SequentialImpl(Modules&&... modules) { modules_.reserve(sizeof...(Modules)); @@ -47,19 +110,43 @@ class SequentialImpl : public Cloneable { return clone; } - /// `reset()` is empty for `Sequential`, since it does not have parameter of + /// `reset()` is empty for `Sequential`, since it does not have parameters of /// its own. void reset() override {} - /// Feeds the `inputs` to the first module, then chains the output of each - /// module with the input of the next, in order of construction. - template - ReturnType forward(ArgumentTypes&&... arguments) { + /// Feeds `inputs` to the first module and then chains outputs to inputs, + /// returning the last output. + /// + /// Conceptually the following loop in Python: + /// + /// \rst + /// .. code-block:: python + /// + /// def forward(sequential, input): + /// for module in sequential: + /// input = module(input) + /// return input + /// + /// \endrst + /// + /// The return type is taken as the first template parameter. It defaults to + /// `Tensor`. If the last module in the `Sequential` returns another type `T`, + /// you should call `forward(inputs)` instead of just `forward(inputs)`: + /// + /// \rst + /// .. code-block:: cpp + /// + /// torch::Tensor tensor = sequential1->forward(inputs); + /// int integer = sequential2->forward(inputs); + /// float value = sequential3->forward(inputs); + /// + /// \endrst + template + ReturnType forward(InputTypes&&... inputs) { AT_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); auto iterator = modules_.begin(); - auto input = - iterator->any_forward(std::forward(arguments)...); + auto input = iterator->any_forward(std::forward(inputs)...); for (++iterator; iterator != modules_.end(); ++iterator) { input = iterator->any_forward(std::move(input)); @@ -126,6 +213,8 @@ class SequentialImpl : public Cloneable { Iterator begin() { return modules_.begin(); } + + /// Returns a const iterator to the start of the `Sequential`. ConstIterator begin() const { return modules_.begin(); } @@ -134,6 +223,8 @@ class SequentialImpl : public Cloneable { Iterator end() { return modules_.end(); } + + /// Returns a const iterator to the end of the `Sequential`. ConstIterator end() const { return modules_.end(); } @@ -227,6 +318,10 @@ class SequentialImpl : public Cloneable { std::vector modules_; }; +/// A `ModuleHolder` subclass for `SequentialImpl`. +/// See the documentation for `SequentialImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. 
TORCH_MODULE(Sequential); } // namespace nn } // namespace torch From d24bcfd9304f0e38c4024620de4da8f2379f6956 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Fri, 14 Sep 2018 15:32:17 -0700 Subject: [PATCH 168/237] Suppress hiprand "duplicate-decl-specifier" warning (#11698) Summary: Otherwise each build produces 65MB of warnings log, which makes the CI hard to debug. iotamudelta Jorghi12 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11698 Differential Revision: D9840356 Pulled By: bddppq fbshipit-source-id: b69bf6a5c38a97b188221f9c084c608ffc9b37c8 --- cmake/Dependencies.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 551a84180b087a..dbec0d07ebf6e3 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -542,6 +542,7 @@ if(NOT BUILD_ATEN_MOBILE) set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-negative") set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-overflow") set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-unused-command-line-argument") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-duplicate-decl-specifier") set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) From 8e76dcf173f7fb28c2736b86b38a829531496bab Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Fri, 14 Sep 2018 15:55:35 -0700 Subject: [PATCH 169/237] Prevent raising KeyboardInterrupt in worker (#11718) Summary: Current behavior is that each process (main and workers) will print trace from `KeyboardInterrupt`. And the main process will also print ``` RuntimeError: DataLoader worker (pid 46045) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with nm_workers=0 may give better error trace. ``` due to our SIGCLD handler. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11718 Differential Revision: D9840844 Pulled By: SsnL fbshipit-source-id: 1a05060bb02907fef5aac3f274d2c84f9f42d187 --- torch/utils/data/dataloader.py | 76 ++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 0874fd1185028d..4618e731406e68 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -73,49 +73,53 @@ def is_alive(self): def _worker_loop(dataset, index_queue, data_queue, done_event, collate_fn, seed, init_fn, worker_id): - global _use_shared_memory - _use_shared_memory = True + try: + global _use_shared_memory + _use_shared_memory = True - # Intialize C side signal handlers for SIGBUS and SIGSEGV. Python signal - # module's handlers are executed after Python returns from C low-level - # handlers, likely when the same fatal signal happened again already. - # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1 - _set_worker_signal_handlers() + # Intialize C side signal handlers for SIGBUS and SIGSEGV. Python signal + # module's handlers are executed after Python returns from C low-level + # handlers, likely when the same fatal signal happened again already. + # https://docs.python.org/3/library/signal.html Sec. 18.8.1.1 + _set_worker_signal_handlers() - torch.set_num_threads(1) - random.seed(seed) - torch.manual_seed(seed) + torch.set_num_threads(1) + random.seed(seed) + torch.manual_seed(seed) - # Do not wait for putting thread to join when this worker exits. 
Otherwise, - # this worker may always be waiting to put and doesn't check index_queue - # and done_event for termination signal. - data_queue.cancel_join_thread() + # Do not wait for putting thread to join when this worker exits. + # Otherwise, this worker may always be waiting to put and doesn't check + # index_queue and done_event for termination signal. + data_queue.cancel_join_thread() - if init_fn is not None: - init_fn(worker_id) + if init_fn is not None: + init_fn(worker_id) - watchdog = ManagerWatchdog() + watchdog = ManagerWatchdog() - while True: - try: - r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL) - except queue.Empty: - if watchdog.is_alive() and not done_event.is_set(): - continue - else: + while True: + try: + r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL) + except queue.Empty: + if watchdog.is_alive() and not done_event.is_set(): + continue + else: + break + # use done_event so that we can get faster exiting signal even if there + # are still indices in index_queue + if r is None or done_event.is_set(): break - # use done_event so that we can get faster exiting signal even if there - # are still indices in index_queue - if r is None or done_event.is_set(): - break - idx, batch_indices = r - try: - samples = collate_fn([dataset[i] for i in batch_indices]) - except Exception: - data_queue.put((idx, ExceptionWrapper(sys.exc_info()))) - else: - data_queue.put((idx, samples)) - del samples + idx, batch_indices = r + try: + samples = collate_fn([dataset[i] for i in batch_indices]) + except Exception: + data_queue.put((idx, ExceptionWrapper(sys.exc_info()))) + else: + data_queue.put((idx, samples)) + del samples + except KeyboardInterrupt: + # Main process will raise KeyboardInterrupt anyways. + pass def _pin_memory_loop(in_queue, out_queue, done_event, pin_memory, device_id): From 2c8a1b957ea19c25d672ccc5e51047279e68d85d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 14 Sep 2018 16:34:15 -0700 Subject: [PATCH 170/237] Back out "Refactor Tensor/TensorImpl constructors." Summary: Original commit changeset: 7501b54fe5f3 Reviewed By: gchanan Differential Revision: D9838097 fbshipit-source-id: 093e4c47d5574ce99f706b0683ef369a89b62b38 --- caffe2/core/tensor.h | 76 +++++++++++---------------------------- caffe2/core/tensor_impl.h | 71 +++++++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 56 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index a167658060e7fc..c17667c006f54c 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -50,75 +50,45 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(Storage storage) - : impl_(c10::make_intrusive(std::move(storage))) {} - - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ + explicit Tensor(DeviceType type) + : impl_(c10::make_intrusive(type)) {} + explicit Tensor(const vector& dims, DeviceType type) - : Tensor(Storage(type)) { - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? 
- Resize(dims); - } + : impl_( + c10::make_intrusive(dims, type)) {} explicit Tensor(const vector& dims, DeviceType type) - : Tensor(Storage(type)) { - Resize(dims); - } + : impl_( + c10::make_intrusive(dims, type)) {} - /** - * context_for_copy is required to have the same DeviceType as src - */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : Tensor(Storage(type)) { - CopyFrom(src, context_for_copy); - } + : impl_(c10::make_intrusive( + *src.impl_, + context_for_copy, + type)) {} - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ Tensor(const Tensor& src, DeviceType type) - : Tensor(Storage(type)) { - CopyFrom(src); - } + : impl_(c10::make_intrusive( + *src.impl_, + type)) {} - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : Tensor(Storage(context->device_type(), TypeMeta::Make())) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); - context->CopyItemsFromCPU( - storage().dtype(), size(), values.data(), mutable_data()); - } + : impl_(c10::make_intrusive( + dims, + values, + context)) {} - /** - * @brief Creates a scalar tensor, and fills its content with the given value. - * The type of tensor will be decided by the context parameter - */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : Tensor(Storage(context->device_type(), TypeMeta::Make())) { - Resize(std::vector{}); - context->CopyItemsFromCPU( - storage().dtype(), size(), &value, mutable_data()); - } + : impl_(c10::make_intrusive( + value, + context)) {} Tensor Clone() const { Tensor x(GetDeviceType()); @@ -306,10 +276,6 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } - - const Storage& storage() { - return impl_->storage(); - } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 2e01c55318d88f..4cdf612a1fb091 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -88,8 +88,77 @@ inline int canonical_axis_index_(int axis_index, int ndims) { class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; + explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? 
+ explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) + : storage_(device_type) { + Resize(dims); + } + + /* Now we require that context_for_copy has the same device type as src since + * template is removed + */ + TensorImpl( + const TensorImpl& src, + at::BaseContext* context_for_copy, + at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src, context_for_copy); + } + + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ + TensorImpl(const TensorImpl& src, at::DeviceType device_type) + : storage_(device_type) { + CopyFrom(src); + } + + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ + template + TensorImpl( + const std::vector& dims, + const std::vector& values, + at::BaseContext* context) + : storage_(context->device_type(), TypeMeta::Make()) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, values.data(), mutable_data()); + } + + /** + * @brief Creates a scalar tensor, and fills its content with the given value. + * The type of tensor will be decided by the context parameter + */ + template < + typename T, + typename = typename std::enable_if::value>::type> + TensorImpl(const T& value, at::BaseContext* context) + : storage_(context->device_type(), TypeMeta::Make()) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage_.dtype(), numel_, &value, mutable_data()); + } TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; From f4d9fe395d59f140d1a52c6becf89f6ca59f303b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Fri, 14 Sep 2018 16:54:22 -0700 Subject: [PATCH 171/237] Remove intrusive_ptr::reclaim() in Storage (#11352) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11352 Pushing manual refcounting further back, making things safer. Reviewed By: ezyang Differential Revision: D9694327 fbshipit-source-id: befdbcac199225383a93520472ee7c6511a0e9cd --- aten/src/ATen/templates/TypeDefault.cpp | 2 +- aten/src/TH/THTensor.cpp | 2 +- aten/src/TH/generic/THTensor.cpp | 18 +++++++++++++++--- aten/src/THC/generic/THCTensor.cpp | 24 ++++++++++++++++++------ torch/csrc/utils/python_arg_parser.h | 2 +- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 5e68470ecc0abd..0891f6d9f4f492 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -120,7 +120,7 @@ Storage TypeDefault::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain && th_pointer) { c10::raw::intrusive_ptr::incref(static_cast(th_pointer)); } - return Storage(static_cast(th_pointer)); + return Storage(c10::intrusive_ptr::reclaim(static_cast(th_pointer))); } diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 2d1200eebd37aa..887b0dcf2dff97 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -202,5 +202,5 @@ void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { // Caffe2 might have tensors whose storages are null, but we // don't allow it in PyTorch. 
AT_ASSERT(storage); - tensor->storage_ = at::Storage(storage); + tensor->storage_ = at::Storage(c10::intrusive_ptr::reclaim(storage)); } diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 5eac1e319347d1..9a7bd0be736f25 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -54,13 +54,21 @@ scalar_t *THTensor_(data)(const THTensor *self) { /* Empty init */ THTensor *THTensor_(new)(void) { - return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), @@ -75,7 +83,11 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 483a1fbec593e7..a7779047863466 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -63,13 +63,21 @@ scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self) /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, THTensor_getStoragePtr(tensor), @@ -85,7 +93,11 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -594,13 +606,13 @@ int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) 
 }
 
     const int tensorDev = THCTensor_(getDevice)(state, tensor);
-
+
     // Skips CPU tensors
     if (tensorDev == -1) { continue; }
 
     // Checks all tensors are on the same device
-    if (tensorDev != curDev) {
-      valid = 0;
+    if (tensorDev != curDev) {
+      valid = 0;
       break;
     }
   }
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index 294819a5a11550..9ff25d2d4e5134 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -434,7 +434,7 @@ inline at::Generator* PythonArgs::generator(int i) {
 }
 
 inline at::Storage PythonArgs::storage(int i) {
-  if (!args[i]) return nullptr;
+  if (!args[i]) return at::Storage();
   return createStorage(args[i]);
 }
 
From 270fb22bd84e915d76fd6316bf4f22a8c7882da6 Mon Sep 17 00:00:00 2001
From: Sebastian Messmer
Date: Fri, 14 Sep 2018 16:54:22 -0700
Subject: [PATCH 172/237] Remove intrusive_ptr::reclaim() in Storage (2/2) (#11547)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11547
Pushing manual refcounting further back, making things safer.

Reviewed By: ezyang
Differential Revision: D9778042
fbshipit-source-id: c9572edc440c5ce5ea1b2355b5c54f87078ea28e
---
 aten/src/ATen/core/Storage.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h
index 23a3327da1cafc..ab201be88d630e 100644
--- a/aten/src/ATen/core/Storage.h
+++ b/aten/src/ATen/core/Storage.h
@@ -7,9 +7,7 @@ namespace at {
 struct AT_API Storage {
  public:
   Storage() {}
-  Storage(StorageImpl* storage_impl) : storage_impl_(c10::intrusive_ptr<StorageImpl>::reclaim(storage_impl)) {}
-  Storage(const c10::intrusive_ptr<StorageImpl>& ptr) : storage_impl_(ptr) {}
-  Storage(c10::intrusive_ptr<StorageImpl>&& ptr) : storage_impl_(std::move(ptr)) {}
+  Storage(c10::intrusive_ptr<StorageImpl> ptr) : storage_impl_(std::move(ptr)) {}
   Storage(
       caffe2::TypeMeta data_type,
       size_t size,

From 690c999bba2da3bb25370fe49181e7d0eb3e4b69 Mon Sep 17 00:00:00 2001
From: Sebastian Messmer
Date: Fri, 14 Sep 2018 16:54:26 -0700
Subject: [PATCH 173/237] Simplify union payload copying (#11353)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11353

Before, there was one extra member in the union that had to be at least as
large as the largest other member, because it was used for copying. Now, this
isn't needed anymore and we copy the union directly.
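As a rough illustration only (an editorial sketch, not code from this patch; the type and member names below are made up), copying the union object itself removes the need for a dedicated "largest member" that exists purely to implement the copy:

```cpp
#include <cstdint>

// Hypothetical tagged-value type; `Payload` mirrors the idea of a union whose
// copy constructor simply copies the union object as a whole.
struct Value {
  union Payload {
    int64_t as_int;
    double as_double;
    void* as_ptr;
  } payload;

  explicit Value(int64_t i) { payload.as_int = i; }
  // Copying the union directly copies whichever member is active; no extra
  // "at least as large as everything else" member is needed for this.
  Value(const Value& rhs) : payload(rhs.payload) {}
};

int main() {
  Value a(42);
  Value b(a);  // direct union copy
  return b.payload.as_int == 42 ? 0 : 1;
}
```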
Reviewed By: ezyang Differential Revision: D9694326 fbshipit-source-id: 42b2f7d51ac5d4ea5ebafea3a598b018e10fed68 --- torch/csrc/jit/ivalue.h | 47 +++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index d32a24b54917a8..ffff496fdd6386 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -69,7 +69,7 @@ using DoubleList = ConstantList; struct TORCH_API IValue final { IValue() - : payload(0) + : payload{0} , tag(Tag::None) , is_intrusive_ptr(false) {} IValue(const IValue& rhs) @@ -77,7 +77,7 @@ struct TORCH_API IValue final { tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(as_intrusive_ptr); + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); } } IValue(IValue&& rhs) noexcept : IValue() { @@ -85,7 +85,7 @@ struct TORCH_API IValue final { } ~IValue() { if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(as_intrusive_ptr); + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); } } IValue & operator=(IValue && rhs) & noexcept { @@ -113,19 +113,19 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. - as_tensor_impl = t.unsafeReleaseTensorImpl(); + payload.as_tensor_impl = t.unsafeReleaseTensorImpl(); } bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(c10::intrusive_ptr::reclaim(as_tensor_impl)); + at::Tensor t(c10::intrusive_ptr::reclaim(payload.as_tensor_impl)); clearToNone(); return t; } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - JIT_ASSERT(is_intrusive_ptr == (as_tensor_impl != at::UndefinedTensorImpl::singleton())); - auto tensor_impl = c10::intrusive_ptr::reclaim(as_tensor_impl); + JIT_ASSERT(is_intrusive_ptr == (payload.as_tensor_impl != at::UndefinedTensorImpl::singleton())); + auto tensor_impl = c10::intrusive_ptr::reclaim(payload.as_tensor_impl); if (is_intrusive_ptr) { c10::raw::intrusive_ptr::incref(tensor_impl.get()); } @@ -147,18 +147,18 @@ struct TORCH_API IValue final { // Double IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { - as_double = d; + payload.as_double = d; } bool isDouble() const { return Tag::Double == tag; } double toDouble() const { JIT_ASSERT(isDouble()); - return as_double; + return payload.as_double; } // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { - as_int = i; + payload.as_int = i; } // allow you to pass literals (3, 4) without ambiguity @@ -171,7 +171,7 @@ struct TORCH_API IValue final { int64_t toInt() const { JIT_ASSERT(isInt()); - return as_int; + return payload.as_int; } // IntList @@ -301,31 +301,28 @@ struct TORCH_API IValue final { template c10::intrusive_ptr moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(as_intrusive_ptr)); + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); clearToNone(); return t; } template c10::intrusive_ptr toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(as_intrusive_ptr)); + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); auto p = r; r.release(); return p; } void clearToNone() { - payload = 0; + payload.as_int = 0; tag = Tag::None; is_intrusive_ptr = false; } union { + int64_t as_int; + double as_double; at::TensorImpl* as_tensor_impl; c10::intrusive_ptr_target* as_intrusive_ptr; - double 
as_double; - int64_t as_int; - // this type should be as big as all the other types because it will - // be used to copy the union's value in certain cases - int64_t payload; - }; + } payload; Tag tag; bool is_intrusive_ptr; }; @@ -389,33 +386,33 @@ DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::IntList), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::vector v) : IValue(IntList::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::string v) : IValue(ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::DoubleList), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::TensorList), is_intrusive_ptr(true) { - as_intrusive_ptr = v.release(); + payload.as_intrusive_ptr = v.release(); } inline IValue::IValue(std::vector v) : IValue(TensorList::create(std::move(v))) {} From bb6f18c44fbad0926e1eaf963375a8f934bc3c6b Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Fri, 14 Sep 2018 16:54:29 -0700 Subject: [PATCH 174/237] Simplify IValue::toTensor() (#11355) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11355 There is no reason to implement refcounting manually in this case. Given the correct NullType, toIntrusivePtr() and moveToIntrusivePtr() will do the right thing. Reviewed By: ezyang Differential Revision: D9694918 fbshipit-source-id: 8aae4d66aec32ca5f85c438d66339bd80b72b656 --- torch/csrc/jit/ivalue.h | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index ffff496fdd6386..16af0c9b292d5f 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -113,23 +113,16 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. 
- payload.as_tensor_impl = t.unsafeReleaseTensorImpl(); + payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); } bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() && { JIT_ASSERT(isTensor()); - at::Tensor t(c10::intrusive_ptr::reclaim(payload.as_tensor_impl)); - clearToNone(); - return t; + return at::Tensor(moveToIntrusivePtr()); } at::Tensor toTensor() const & { JIT_ASSERT(isTensor()); - JIT_ASSERT(is_intrusive_ptr == (payload.as_tensor_impl != at::UndefinedTensorImpl::singleton())); - auto tensor_impl = c10::intrusive_ptr::reclaim(payload.as_tensor_impl); - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(tensor_impl.get()); - } - return at::Tensor(std::move(tensor_impl)); + return at::Tensor(toIntrusivePtr()); } // Tuple @@ -299,15 +292,15 @@ struct TORCH_API IValue final { #undef DEFINE_TAG }; - template - c10::intrusive_ptr moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + template> + c10::intrusive_ptr moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); clearToNone(); return t; } - template - c10::intrusive_ptr toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + template> + c10::intrusive_ptr toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); auto p = r; r.release(); return p; @@ -320,7 +313,6 @@ struct TORCH_API IValue final { union { int64_t as_int; double as_double; - at::TensorImpl* as_tensor_impl; c10::intrusive_ptr_target* as_intrusive_ptr; } payload; Tag tag; From f09054f8d09cca58e3ef63568b18977e66ca56a6 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Fri, 14 Sep 2018 17:45:51 -0700 Subject: [PATCH 175/237] Remove deprecate warning for Upsampling (#11568) Summary: Fixes #11452 . Based on the discussion with SsnL and soumith , we want to bring back Upsample as a module instead of introducing a new nn.interpolate module for now. If anyone want to do downsample, they should use `nn.functional.interpolate ` instead. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11568 Differential Revision: D9804359 Pulled By: ailzhang fbshipit-source-id: 2b232d55fc83c2b581bf336f1ee8d1cf1c1159ca --- torch/nn/modules/upsampling.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/nn/modules/upsampling.py b/torch/nn/modules/upsampling.py index e416326229a142..5b2307470a6060 100644 --- a/torch/nn/modules/upsampling.py +++ b/torch/nn/modules/upsampling.py @@ -50,8 +50,8 @@ class Upsample(Module): 0.3.1. Since then, the default behavior is ``align_corners = False``. See below for concrete examples on how this affects the outputs. - .. warning:: - This class is deprecated in favor of :func:`~nn.functional.interpolate`. + .. note:: + If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`. Examples:: @@ -121,7 +121,6 @@ def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=N self.align_corners = align_corners def forward(self, input): - warnings.warn("nn.Upsampling is deprecated. 
Use nn.functional.interpolate instead.") return F.interpolate(input, self.size, self.scale_factor, self.mode, self.align_corners) def extra_repr(self): From eb3c47bdd5136979f3ecb52a7c2f5c241515f2fe Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Sat, 15 Sep 2018 18:06:08 -0700 Subject: [PATCH 176/237] max -> fmaxf in cross_entropy kernel (#11733) Summary: Changing `max` to `fmaxf` in `LabelCrossEntropy` kernel for hip to work correctly. bddppq petrex Pull Request resolved: https://github.com/pytorch/pytorch/pull/11733 Differential Revision: D9846783 Pulled By: bddppq fbshipit-source-id: c1b394d2ba7ee0e819f7bf3b36b53d1962de5522 --- caffe2/operators/cross_entropy_op.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu index 7030e846b714e2..df7a124d29711f 100644 --- a/caffe2/operators/cross_entropy_op.cu +++ b/caffe2/operators/cross_entropy_op.cu @@ -13,7 +13,7 @@ __global__ void LabelCrossEntropyKernel( const float log_threshold, float* Ydata) { CUDA_1D_KERNEL_LOOP(i, N) { CUDA_KERNEL_ASSERT(labeldata[i] >= 0 && labeldata[i] < D); - Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold)); + Ydata[i] = -logf(fmaxf(Xdata[i * D + labeldata[i]], log_threshold)); } } __global__ void LabelCrossEntropyGradientKernel( @@ -21,7 +21,7 @@ __global__ void LabelCrossEntropyGradientKernel( const float* dYdata, const float log_threshold, float* dXdata) { CUDA_1D_KERNEL_LOOP(i, N) { int idx = i * D + labeldata[i]; - dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold); + dXdata[idx] = - dYdata[i] / fmaxf(Xdata[idx], log_threshold); } } } // namespace From b3e726042c1746824bb1cd0152d203a0ec00fd12 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Sat, 15 Sep 2018 18:14:50 -0700 Subject: [PATCH 177/237] Do not use FixedDivisor in ROCM order switch op (#11697) Summary: Fix the recent order_switch_test failure in ROCM CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/11697 Reviewed By: BIT-silence Differential Revision: D9831039 Pulled By: bddppq fbshipit-source-id: 2368fd1ac7b1bab335ff3377071246cfd3392f3f --- caffe2/operators/order_switch_ops.cu | 61 ++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index c213b7cc848d0d..f63a7d87fa88fe 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -8,20 +8,42 @@ namespace caffe2 { template __global__ void NHWC2NCHWCUDAKernel( const int size, +#ifndef __HIPCC__ const FixedDivisor C, const FixedDivisor HxW, +#else + const int C, + const int HxW, +#endif const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, size) { int n; int c; int hxw; + + int c_d; + int hxw_d; +#ifndef __HIPCC__ HxW.DivMod(i, &c, &hxw); C.DivMod(c, &n, &c); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + c = i / HxW; + hxw = i % HxW; + n = c / C; + c = c % C; + + c_d = C; + hxw_d = HxW; +#endif + #if __CUDA_ARCH__ >= 350 - Y[i] = __ldg(X + (n * HxW.d() + hxw) * C.d() + c); + Y[i] = __ldg(X + (n * hxw_d + hxw) * c_d + c); #else - Y[i] = X[(n * HxW.d() + hxw) * C.d() + c]; + Y[i] = X[(n * hxw_d + hxw) * c_d + c]; #endif } } @@ -29,20 +51,41 @@ __global__ void NHWC2NCHWCUDAKernel( template __global__ void NCHW2NHWCCUDAKernel( const int size, +#ifndef __HIPCC__ const FixedDivisor C, const FixedDivisor HxW, +#else + const int C, + const int HxW, +#endif const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, size) { int n; int c; int hxw; + + int c_d; + 
int hxw_d; +#ifndef __HIPCC__ C.DivMod(i, &hxw, &c); HxW.DivMod(hxw, &n, &hxw); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + hxw = i / C; + c = i % C; + n = hxw / HxW; + hxw = hxw % HxW; + + c_d = C; + hxw_d = HxW; +#endif #if __CUDA_ARCH__ >= 350 - Y[i] = __ldg(X + (n * C.d() + c) * HxW.d() + hxw); + Y[i] = __ldg(X + (n * c_d + c) * hxw_d + hxw); #else - Y[i] = X[(n * C.d() + c) * HxW.d() + hxw]; + Y[i] = X[(n * c_d + c) * hxw_d + hxw]; #endif } } @@ -71,8 +114,13 @@ bool NHWC2NCHWOp::RunOnDevice() { 0, context_.cuda_stream()>>>( size, +#ifndef __HIPCC__ FixedDivisor(C), FixedDivisor(HxW), +#else + C, + HxW, +#endif X.data(), Y->template mutable_data()); return true; @@ -102,8 +150,13 @@ bool NCHW2NHWCOp::RunOnDevice() { 0, context_.cuda_stream()>>>( size, +#ifndef __HIPCC__ FixedDivisor(C), FixedDivisor(HxW), +#else + C, + HxW, +#endif X.data(), Y->template mutable_data()); return true; From ca6f08f359d7dfc2da536baf3001433dff2c8f5b Mon Sep 17 00:00:00 2001 From: Jiyan Yang Date: Sat, 15 Sep 2018 23:31:42 -0700 Subject: [PATCH 178/237] Set correct dtype for fp16 op inference function (#11693) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11693 as desc. Reviewed By: hyuen Differential Revision: D9829061 fbshipit-source-id: 0f4c8a9d2b95d4cf5fa20a2aefd5671f273a8e76 --- caffe2/operators/half_float_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/operators/half_float_ops.h b/caffe2/operators/half_float_ops.h index e6698a0b7283d1..b8d5dacf69472b 100644 --- a/caffe2/operators/half_float_ops.h +++ b/caffe2/operators/half_float_ops.h @@ -76,7 +76,7 @@ inline std::vector Float16FillerTensorInference( vector out(1); ArgumentHelper helper(def); out[0].set_data_type(static_cast( - helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT))); + helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT16))); auto shape = helper.GetRepeatedArgument("shape"); for (int d : shape) { out[0].add_dims(d); From 10c29c8970a1468cd157d371a0d0b9879cb703a9 Mon Sep 17 00:00:00 2001 From: peter Date: Sun, 16 Sep 2018 07:55:15 -0700 Subject: [PATCH 179/237] Fix CUDA 8 build on Windows (#11729) Summary: Tested via https://github.com/pytorch/pytorch/pull/11374. Upstream PR: https://gitlab.kitware.com/cmake/cmake/merge_requests/2391 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11729 Differential Revision: D9847807 Pulled By: orionr fbshipit-source-id: 69af3e6c5bba0abcbc8830495e867a0b1b399c22 --- README.md | 5 ++--- cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4b996cc1b440a6..b23bc60aa19de6 100644 --- a/README.md +++ b/README.md @@ -201,9 +201,8 @@ set DISTUTILS_USE_SDK=1 REM The following two lines are needed for Python 2.7, but the support for it is very experimental. set MSSdk=1 set FORCE_PY27_BUILD=1 -REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following two lines. -set "PREBUILD_COMMAND=%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" -set PREBUILD_COMMAND_ARGS=x64 +REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following line. 
+set "CUDA_HOST_COMPILER=%VS140COMNTOOLS%\..\..\VC\bin\amd64\cl.exe" call "%VS150COMNTOOLS%\vcvarsall.bat" x64 -vcvars_ver=14.11 python setup.py install diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake index edc9b3ab3fda74..b2ca36a9677771 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake @@ -531,7 +531,9 @@ option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON) # Extra user settable flags cmake_initialize_per_config_variable(CUDA_NVCC_FLAGS "Semi-colon delimit multiple arguments.") -if(CMAKE_GENERATOR MATCHES "Visual Studio") +if(DEFINED ENV{CUDA_HOST_COMPILER}) + set(CUDA_HOST_COMPILER "$ENV{CUDA_HOST_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC") +elseif(CMAKE_GENERATOR MATCHES "Visual Studio") set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)Tools/MSVC/$(VCToolsVersion)/bin/Host$(Platform)/$(PlatformTarget)") if(MSVC_VERSION LESS 1910) set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)bin") From 6f6b03566ba3c4828f6ee87a772f9d161be0bae7 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Sun, 16 Sep 2018 20:30:14 -0700 Subject: [PATCH 180/237] Vectorize grid sample 2d CPU kernels (#10980) Summary: This PR vectorizes the CPU grid sample 2d forward and backward kernels. Specifically, 1. add `.data()` in `TensorAccessor` 2. support non-void return value for declaring CPU kernel stub 2. add `bool at:: geometry_is_contiguous(IntList sizes, IntList strides)` 1. The following vectorized CPU primitives are added: + `gather(baseaddr, vindex)`: `result[i] = baseaddr[vindex[i] * scale]` + `mask_gather(src, baseaddr, vindex, mask)`: `result[i] = mask[i] ? baseaddr[vindex[i] * scale] : src[i]`. + comparison ops + binary logical ops + `min(a, b)` + `cast(src_vec)`: changing dtype but keeping the bit representation + `blendv(a, b, mask)`: `result[i] = mask[i] ? b[i] : a[i]`. + ctor with multiple values (i.e., `setr`) + `arange(start = 0, step = 1)`: constructs a vector with values specified by the arange parameters + `convert_to_int_of_same_size(vec)`: convert floating point vector to corresponding integral type of same size + `interleave2(a, b)` & `deinterleave2(x, y)`: interleave or deinterleaves two vectors. E.g., for `interleave`: ``` inputs: {a0, a1, a2, a3, a4, a5, a6, a7} {b0, b1, b2, b3, b4, b5, b6, b7} outputs: {a0, b0, a1, b1, a2, b2, a3, b3} {a4, b4, a5, b5, a6, b6, a7, b7} ``` 2. Grid sample CPU kernel implementations are described in the following note (also in `GridSampleKernel.cpp`: ``` NOTE [ Grid Sample CPU Kernels ] Implementation of vectorized grid sample CPU kernels is divided into three parts: 1. `ComputeLocation` struct Transforms grid values into interpolation locations of the input tensor for a particular spatial dimension, basing on the size of that dimension in input tensor, and the padding mode. ``` ```cpp template struct ComputeLocation { using Vec = Vec256; // ctor ComputeLocation(int64_t size); // Given grid values `in`, return the interpolation locations after // un-normalization and padding mechanism (elementwise). Vec apply(const Vec &in) const; // Similar to `apply`, but also returns `d apply(in) / d in` // (elementwise). // this is often used in gradient computation. std::pair apply_get_grad(const Vec &in) const; }; ``` ``` 2. `ApplyGridSample` struct Owns N `ComputeLocation` structs, where N is the number of spatial dimensions. 
Given N input grid vectors (one for each spatial dimension) and spatial offset, it gets the interpolation locations from `ComputeLocation`s, applies interpolation procedure, and then writes to the output (or grad_input & grad_grid in backward). ``` ```cpp template struct ApplyGridSample { // ctor ApplyGridSample(const TensorAccessor& input); // Applies grid sampling (forward) procedure: // 1. computes interpolation locations from grid values `grid_x` and // `grid_y`, // 2. interpolates output values using the locations and input data // in `inp_slice`, and // 3. writes the first `len` values in the interpolated vector to // `out_slice` with spatial offset being `offset`. // // This assimes that `grid_x` and `grid_y` all contain valid grid // values \in [-1, 1], even at indices greater than `len`. // // The `*_slice` argument namess mean samples within a batch (i.e., // with the batch dimension sliced out). void forward(TensorAccessor& out_slice, const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const; // Applies grid sampling (backward) procedure. Arguments semantics // and strategy are similar to those of `forward`. void backward(TensorAccessor& gInp_slice, TensorAccessor& gGrid_slice, const TensorAccessor& gOut_slice, const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const; } ``` ``` 3. `grid_sample_2d_grid_slice_iterator` function Among the tensors we work with, we know that the output tensors are contiguous (i.e., `output` in forward, and `grad_input` & `grad_grid` in backward), we need to randomly read `input` anyways, and `grad_output` usually comes from autograd and is often contiguous. So we base our iterating strategy on the geometry of grid. `grid_sample_2d_grid_slice_iterator` function provides an abstract to efficiently iterates through a `grid` slice (without batch dimension). See comments of that function on the specific cases and strategies used. ``` ```cpp template void grid_sample_2d_grid_slice_iterator( const TensorAccessor& grid_slice, const ApplyFn &apply_fn); // `apply_fn` is a function/lambda that can be called as if it has // declaration: // void apply_fn(const Vec256& grid_x, // const Vec256& grid_y, // int64_t spatial_offset, int64_t len); ``` ``` `apply_fn` will be called multiple times, and together cover the entire output spatial space. 
Therefore, e.g., to implement forward 2d grid sample, we can do ``` ```cpp ApplyGridSample grid_sample(input_accessor); for (int n = 0; n < input_accessor.size(0); n++) { grid_sample_2d_grid_slice_iterator( grid_accessor[n], [&](const Vec256& grid_x, const Vec256& grid_y, int64_t spatial_offset, int64_t len) { grid_sample.forward(out_accessor[n], input_accessor[n], spatial_offset, grid_x, grid_y, len); }); } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/10980 Differential Revision: D9564867 Pulled By: SsnL fbshipit-source-id: 5b7c3c7ea63af00eec230ae9ee1c3e6c6c9679b4 --- aten/src/ATen/Parallel.h | 2 +- aten/src/ATen/TensorGeometry.cpp | 11 +- aten/src/ATen/TensorUtils.cpp | 20 + aten/src/ATen/TensorUtils.h | 7 + aten/src/ATen/core/TensorAccessor.h | 29 +- aten/src/ATen/cpu/vec256/intrinsics.h | 2 +- aten/src/ATen/cpu/vec256/vec256.h | 190 ++++ aten/src/ATen/cpu/vec256/vec256_base.h | 228 ++++- aten/src/ATen/cpu/vec256/vec256_double.h | 60 +- aten/src/ATen/cpu/vec256/vec256_float.h | 65 +- aten/src/ATen/cpu/vec256/vec256_int.h | 164 +++- aten/src/ATen/native/DispatchStub.h | 23 +- aten/src/ATen/native/GridSampler.cpp | 301 +----- aten/src/ATen/native/GridSampler.h | 2 + .../src/ATen/native/cpu/GridSamplerKernel.cpp | 890 ++++++++++++++++++ aten/src/ATen/native/cpu/GridSamplerKernel.h | 18 + test/test_nn.py | 80 +- 17 files changed, 1721 insertions(+), 371 deletions(-) create mode 100644 aten/src/ATen/native/cpu/GridSamplerKernel.cpp create mode 100644 aten/src/ATen/native/cpu/GridSamplerKernel.h diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 6aadd62eb1d3fd..a4c8b50abe8263 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -27,7 +27,7 @@ inline void parallel_for( const int64_t grain_size, const F& f) { #ifdef _OPENMP -#pragma omp parallel if ((end - begin) >= grain_size) +#pragma omp parallel if (!omp_in_parallel() && ((end - begin) >= grain_size)) { int64_t num_threads = omp_get_num_threads(); int64_t tid = omp_get_thread_num(); diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index ca3fcd961feda0..b11c7bb159900b 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -8,15 +9,7 @@ bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; } - int64_t dim = sizes_.size(); - int64_t expected_stride = 1; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes_[i] != 1 && strides_[i] != expected_stride) { - return false; - } - expected_stride *= sizes_[i]; - } - return true; + return at::geometry_is_contiguous(sizes_, strides_); } Tensor TensorGeometry::zeros_with_stride(const Type& type) const { diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 454ab9e91fd29e..4ec8e374c2e515 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -215,4 +215,24 @@ void * maybe_data_ptr(const Tensor& tensor) { void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } + +// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
+bool geometry_is_contiguous(IntList sizes, IntList strides) { + int64_t dim = sizes.size(); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + } diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index cc7453f77375f6..2443bde4b482cb 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -78,4 +78,11 @@ AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend back AT_API void * maybe_data_ptr(const Tensor& tensor); AT_API void * maybe_data_ptr(const TensorArg& tensor); +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. +AT_API bool geometry_is_contiguous(IntList sizes, IntList strides); + } diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index cbd130ed775d09..f686f0797b079d 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -44,14 +44,16 @@ class TensorAccessorBase { AT_HOSTDEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} - AT_HOST IntList sizes() { + AT_HOST IntList sizes() const { return IntList(sizes_,N); } - AT_HOST IntList strides() { + AT_HOST IntList strides() const { return IntList(strides_,N); } - AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } + AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOSTDEVICE T *data() { return data_; } + AT_HOSTDEVICE const T *data() const { return data_; } protected: PtrType data_; const int64_t* sizes_; @@ -73,6 +75,10 @@ class TensorAccessor : public TensorAccessorBase { AT_HOSTDEVICE TensorAccessor operator[](int64_t i) { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } + + AT_HOSTDEVICE const TensorAccessor operator[](int64_t i) const { + return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); + } }; template class PtrTraits> @@ -89,7 +95,7 @@ class TensorAccessor : public TensorAccessorBase { // PackedTensorAccessorBase and PackedTensorAccessor are used on for CUDA `Tensor`s on the host -// and as +// and as // In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) // in order to transfer them on the device when calling kernels. // On the device, indexing of multidimensional tensors gives to `TensorAccessor`s. 
@@ -106,8 +112,8 @@ class PackedTensorAccessorBase { std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); std::copy(strides_, strides_ + N, std::begin(this->strides_)); } - AT_HOSTDEVICE int64_t stride(int64_t i) { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) { return sizes_[i]; } + AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } protected: PtrType data_; int64_t sizes_[N]; @@ -127,6 +133,12 @@ class PackedTensorAccessor : public PackedTensorAccessorBase { int64_t* new_strides = this->strides_+1; return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); } + + AT_DEVICE const TensorAccessor operator[](int64_t i) const { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } }; template class PtrTraits> @@ -139,6 +151,9 @@ class PackedTensorAccessor : public PackedTensorAccessorBasedata_[this->strides_[0]*i]; } + AT_DEVICE const T& operator[](int64_t i) const { + return this->data_[this->strides_[0]*i]; + } }; } diff --git a/aten/src/ATen/cpu/vec256/intrinsics.h b/aten/src/ATen/cpu/vec256/intrinsics.h index 442e8fd0511fc7..76779aada7a0b3 100644 --- a/aten/src/ATen/cpu/vec256/intrinsics.h +++ b/aten/src/ATen/cpu/vec256/intrinsics.h @@ -19,7 +19,7 @@ /* GCC-compatible compiler, targeting ARM with WMMX */ #include #elif (defined(__GNUC__) || defined(__xlC__)) && \ - (defined(__VEC__) || defined(__ALTIVEC__)) + (defined(__VEC__) || defined(__ALTIVEC__)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include #elif defined(__GNUC__) && defined(__SPE__) diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h index 98f1158465f2f7..71688bd48e5090 100644 --- a/aten/src/ATen/cpu/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -32,4 +32,194 @@ std::ostream& operator<<(std::ostream& stream, const Vec256& vec) { return stream; } + +#if defined(__AVX__) && !defined(_MSC_VER) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castpd_ps(src); +} + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castps_pd(src); +} + +#if defined(__AVX2__) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#define DEFINE_FLOAT_INT_CAST(int_t, float_t, float_ch) \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castp ## float_ch ## _si256(src); \ +} \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castsi256_p ## float_ch (src); \ +} + +DEFINE_FLOAT_INT_CAST(int64_t, double, d) +DEFINE_FLOAT_INT_CAST(int32_t, double, d) +DEFINE_FLOAT_INT_CAST(int16_t, double, d) +DEFINE_FLOAT_INT_CAST(int64_t, float, s) +DEFINE_FLOAT_INT_CAST(int32_t, float, s) +DEFINE_FLOAT_INT_CAST(int16_t, float, s) + +#undef DEFINE_FLOAT_INT_CAST + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline gather(const double* base_addr, const Vec256& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +c10::guts::enable_if_t> +inline gather(const float* base_addr, const Vec256& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline 
mask_gather(const Vec256& src, const double* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, const float* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) + ); +} + +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + return _mm256_cvttps_epi32(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a3, a3} + // b = {b0, b1, b2, b3} + + // swap lanes: + // a_swapped = {a0, a1, b0, b1} + // b_swapped = {a2, a3, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + return std::make_pair(_mm256_permute4x64_pd(a_swapped, group_ctrl), + _mm256_permute4x64_pd(b_swapped, group_ctrl)); +} + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + + // swap lanes: + // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + + // group cols crossing lanes: + // a_grouped = {a0, a1, b0, b1} + // b_grouped = {a2, a3, b2, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + auto a_grouped = _mm256_permute4x64_pd(a, group_ctrl); + auto b_grouped = _mm256_permute4x64_pd(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 
4 bits apart + return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_b)); +} + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + + // group cols crossing lanes: + // a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl); + auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_b)); +} + +#endif // defined(__AVX2__) + +#endif // defined(__AVX__) && !defined(_MSC_VER) + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index fa3c4e550d6a32..f1eba7e2d3c428 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -3,8 +3,11 @@ #include #include #include +#include +#include #include "ATen/Utils.h" +#include "ATen/core/C++17.h" #if defined(__GNUC__) #define __at_align32__ __attribute__((aligned(32))) @@ -18,6 +21,21 @@ namespace at { namespace vec256 { namespace { +template struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ +template<> struct int_of_size { using type = int_t; } + +DEFINE_INT_OF_SIZE(int64_t); +DEFINE_INT_OF_SIZE(int32_t); +DEFINE_INT_OF_SIZE(int16_t); +DEFINE_INT_OF_SIZE(int8_t); + +#undef DEFINE_INT_OF_SIZE + +template +using int_same_size_t = typename int_of_size::type; + // NOTE: If you specialize on a type, you must define all operations! // emulates vectorized types @@ -33,8 +51,13 @@ struct Vec256 { values[i] = val; } } + template> + Vec256(Args... vals) { + values = { vals... 
}; + } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { int64_t mask = mask_; Vec256 vec; for (int64_t i = 0; i < size; i++) { @@ -47,7 +70,29 @@ struct Vec256 { } return vec; } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + Vec256 vec; + int_same_size_t buffer[size]; + mask.store(buffer); + for (int64_t i = 0; i < size; i++) { + if (buffer[i] & 0x01) + { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + } + return vec; + } + static Vec256 arange(T base = static_cast(0), T step = static_cast(1)) { + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + vec.values[i] = base + i * step; + } + return vec; + } + static Vec256 set(const Vec256& a, const Vec256& b, int64_t count = size) { Vec256 vec; for (int64_t i = 0; i < size; i++) { if (i < count) { @@ -173,9 +218,28 @@ struct Vec256 { } return ret; } +#define DEFINE_COMP(binary_pred) \ + Vec256 operator binary_pred(const Vec256 &other) const { \ + Vec256 vec; \ + for (int64_t i = 0; i != size; i++) { \ + if (values[i] binary_pred other.values[i]) { \ + std::memset(static_cast(vec.values + i), 0xFF, sizeof(T)); \ + } else { \ + std::memset(static_cast(vec.values + i), 0, sizeof(T)); \ + } \ + } \ + return vec; \ + } + DEFINE_COMP(==) + DEFINE_COMP(!=) + DEFINE_COMP(>=) + DEFINE_COMP(<=) + DEFINE_COMP(>) + DEFINE_COMP(<) +#undef DEFINE_COMP }; -template Vec256 operator+(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator+(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] + b[i]; @@ -183,7 +247,7 @@ template Vec256 operator+(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator-(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator-(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] - b[i]; @@ -191,7 +255,7 @@ template Vec256 operator-(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator*(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator*(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] * b[i]; @@ -199,7 +263,7 @@ template Vec256 operator*(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { +template Vec256 inline operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] / b[i]; @@ -207,7 +271,8 @@ template Vec256 operator/(const Vec256 &a, const Vec256 &b) _ return c; } -template Vec256 max(const Vec256 &a, const Vec256 &b) { + +template Vec256 inline max(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::max(a[i], b[i]); @@ -215,7 +280,7 @@ template Vec256 max(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 min(const Vec256 &a, const Vec256 &b) { +template Vec256 inline min(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::min(a[i], b[i]); @@ -223,9 +288,154 @@ template Vec256 min(const Vec256 &a, const Vec256 &b) { return c; } +#define DEFINE_BITWISE_OP(op) \ +template \ +Vec256 inline operator op(const Vec256 &a, const Vec256 &b) { \ + using iT = int_same_size_t; \ + iT 
buffer[Vec256::size]; \ + for (int64_t i = 0; i != Vec256::size; i++) { \ + auto a_val = a[i]; \ + auto b_val = b[i]; \ + iT *i_a_ptr = reinterpret_cast(&a_val); \ + iT *i_b_ptr = reinterpret_cast(&b_val); \ + buffer[i] = *i_a_ptr op *i_b_ptr; \ + } \ + return Vec256::loadu(buffer); \ +} +DEFINE_BITWISE_OP(&) +DEFINE_BITWISE_OP(|) +DEFINE_BITWISE_OP(^) +#undef DEFINE_BITWISE_OP + template -T fmadd(const T& a, const T& b, const T& c) { +inline T fmadd(const T& a, const T& b, const T& c) { return a * b + c; } +template +c10::guts::enable_if_t> +inline gather(T const* base_addr, const Vec256>& vindex) { + static constexpr int size = Vec256::size; + int_same_size_t index_arr[size]; + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } + return Vec256::loadu(static_cast(buffer)); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, T const* base_addr, + const Vec256>& vindex, Vec256& mask) { + static constexpr int size = Vec256::size; + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and + int_same_size_t index_arr[size]; + src.store(static_cast(src_arr)); + mask.store(static_cast(mask_arr)); + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + if (mask_arr[i] & 0x01) { // check highest bit + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } else { + buffer[i] = src_arr[i]; + } + } + mask = Vec256(); // "zero out" mask + return Vec256::loadu(static_cast(buffer)); +} + +// Cast a given vector to another type without changing the bits representation. +// So a Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). +namespace { + // There is a struct here because we don't have static_if and I can't + // partially specialize a templated function. 
+ template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + src_t src_arr[Vec256::size]; + src.store(static_cast(src_arr)); + return Vec256::loadu(static_cast(src_arr)); + } + }; + + template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + return src; + } + }; +} +template +Vec256 cast(const Vec256& src) { + return CastImpl::apply(src); +} + +template +inline Vec256> convert_to_int_of_same_size(const Vec256& src) { + static constexpr int size = Vec256::size; + T src_arr[size]; + src.store(static_cast(src_arr)); + int_same_size_t buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = static_cast>(src_arr[i]); + } + return Vec256>::loadu(static_cast(buffer)); +} + +// E.g., inputs: a Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// b Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +// returns: Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +deinterleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i] = a_arr[i * 2]; + buffer1[half_size + i] = b_arr[i * 2]; + buffer2[i] = a_arr[i * 2 + 1]; + buffer2[half_size + i] = b_arr[i * 2 + 1]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + +// inverse operation of deinterleave2 +// E.g., inputs: a Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// b Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +// returns: Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +interleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i * 2] = a_arr[i]; + buffer1[i * 2 + 1] = b_arr[i]; + buffer2[i * 2] = a_arr[half_size + i]; + buffer2[i * 2 + 1] = b_arr[half_size + i]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index 05c21634659754..ced6fa6a37b6aa 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -22,14 +22,25 @@ template <> class Vec256 { Vec256(double val) { values = _mm256_set1_pd(val); } + Vec256(double val1, double val2, double val3, double val4) { + values = _mm256_setr_pd(val1, val2, val3, val4); + } operator __m256d() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_pd(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_pd(a.values, b.values, mask.values); + } + static Vec256 arange(double base = 0., double step = 1.) 
{ + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -56,7 +67,7 @@ template <> class Vec256 { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_pd(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { double tmp_values[size]; _mm256_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(double)); @@ -154,6 +165,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powd4_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -186,9 +223,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_pd(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_pd(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_pd(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_pd(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_pd(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index c38fae11c24863..ebd0c10d2b62d1 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -16,20 +16,34 @@ template <> class Vec256 { private: __m256 values; public: - static constexpr int64_t size = 8; + static constexpr int size = 8; Vec256() {} Vec256(__m256 v) : values(v) {} Vec256(float val) { values = _mm256_set1_ps(val); } + Vec256(float val1, float val2, float val3, float val4, + float val5, float val6, float val7, float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } operator __m256() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_ps(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + static Vec256 arange(float base = 0.f, float step = 1.f) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -61,7 +75,7 @@ template <> 
class Vec256 { void store(void* ptr, int64_t count = size) const { if (count == size) { _mm256_storeu_ps(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { float tmp_values[size]; _mm256_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(float)); @@ -159,6 +173,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powf8_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -191,9 +231,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_ps(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_ps(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_ps(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_ps(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_ps(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index c9b643e7d4bb09..2ca4d614c21e7b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -26,6 +26,9 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } + Vec256(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { + values = _mm256_setr_epi64x(val1, val2, val3, val4); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int64_t tmp_values[size]; @@ -40,6 +43,13 @@ struct Vec256 : public Vec256i { tmp_values[3] = _mm256_extract_epi64(b.values, 3); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int64_t base = 0, int64_t step = 1) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { switch (count) { @@ -65,7 +75,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int64_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); @@ -79,6 +89,30 @@ struct Vec256 : public Vec256i { auto inverse = _mm256_xor_si256(values, is_larger); return _mm256_sub_epi64(inverse, is_larger); } + Vec256 operator==(const Vec256& other) 
const { + return _mm256_cmpeq_epi64(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi64(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi64(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi64(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi64(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi64(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -87,10 +121,23 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int32_t v) { values = _mm256_set1_epi32(v); } + Vec256(int32_t val1, int32_t val2, int32_t val3, int32_t val4, + int32_t val5, int32_t val6, int32_t val7, int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } template static Vec256 blend(Vec256 a, Vec256 b) { return _mm256_blend_epi32(a, b, mask); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int32_t base = 0, int32_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } static Vec256 set(Vec256 a, Vec256 b, int32_t count = size) { switch (count) { @@ -124,7 +171,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int32_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); @@ -135,6 +182,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi32(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi32(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi32(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi32(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi32(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi32(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi32(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -143,6 +214,13 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int16_t v) { values = _mm256_set1_epi16(v); } + Vec256(int16_t val1, int16_t val2, int16_t val3, int16_t val4, + int16_t val5, int16_t val6, int16_t val7, int16_t val8, + int16_t val9, int16_t val10, int16_t val11, int16_t val12, + int16_t val13, int16_t val14, int16_t val15, int16_t val16) { + values = _mm256_setr_epi16(val1, 
val2, val3, val4, val5, val6, val7, val8, + val9, val10, val11, val12, val13, val14, val15, val16); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int16_t tmp_values[size]; @@ -181,6 +259,17 @@ struct Vec256 : public Vec256i { tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int16_t base = 0, int16_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step, + base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, + base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); + } static Vec256 set(Vec256 a, Vec256 b, int16_t count = size) { switch (count) { @@ -230,7 +319,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int16_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); @@ -241,6 +330,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi16(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi16(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi16(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi16(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi16(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi16(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi16(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -258,6 +371,21 @@ Vec256 inline operator+(const Vec256& a, const Vec256 return _mm256_add_epi16(a, b); } +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi64(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi32(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi16(a, b); +} + // AVX2 has no intrinsic for int64_t multiply so it needs to be emulated // This could be implemented more efficiently using epi32 instructions // This is also technically avx compatible, but then we'll need AVX @@ -293,7 +421,7 @@ Vec256 inline operator*(const Vec256& a, const Vec256 } template -Vec256 intdiv_256(const Vec256& a, const Vec256& b) { +Vec256 inline intdiv_256(const Vec256& a, const Vec256& b) { T values_a[Vec256::size]; T values_b[Vec256::size]; a.store(values_a); @@ -304,20 +432,26 @@ Vec256 intdiv_256(const Vec256& a, const Vec256& b) { return Vec256::loadu(values_a); } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); +#define DEFINE_INTEGER_BINARY_OP(op, func) \ +template <> \ +Vec256 inline operator op(const Vec256& a, 
const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +DEFINE_INTEGER_BINARY_OP(/, intdiv_256) +DEFINE_INTEGER_BINARY_OP(&, _mm256_and_si256) +DEFINE_INTEGER_BINARY_OP(|, _mm256_or_si256) +DEFINE_INTEGER_BINARY_OP(^, _mm256_xor_si256) -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +#undef DEFINE_INTEGER_BINARY_OP #endif diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index dad05dcf8b47a8..42ef6a4f6bb5f1 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -22,7 +22,10 @@ // DEFINE_DISPATCH(stub); // // In native/cpu/MyKernel.cpp: -// void kernel(const Tensor& x) { ... } +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } // REGISTER_DISPATCH(stub, &kernel); // // To call: @@ -46,19 +49,22 @@ enum class CPUCapability { CPUCapability get_cpu_capability(); template -struct AT_API DispatchStub { - static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); +struct AT_API DispatchStub; + +template +struct AT_API DispatchStub { + using FnPtr = rT (*) (Args...); template - void operator()(DeviceType device_type, ArgTypes&&... args) { + rT operator()(DeviceType device_type, ArgTypes&&... args) { if (device_type == DeviceType::CPU) { if (!cpu_dispatch_ptr) { cpu_dispatch_ptr = choose_cpu_impl(); } - (*cpu_dispatch_ptr)(std::forward(args)...); + return (*cpu_dispatch_ptr)(std::forward(args)...); } else if (device_type == DeviceType::CUDA) { AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(std::forward(args)...); + return (*cuda_dispatch_ptr)(std::forward(args)...); } else { AT_ERROR("DispatchStub: unsupported device type", device_type); } @@ -103,6 +109,11 @@ struct RegisterDispatch { }; } // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. 
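// NOTE (editor's sketch, not part of this patch): the `using`-declaration
// workaround described above looks roughly like the following. The alias name
// `backward_2d_fn` is hypothetical; the pointer signature is inferred from the
// GridSampler.cpp call sites in this patch, with the leading DeviceType
// argument omitted because DispatchStub::operator() consumes it.
//
//   using backward_2d_fn = std::tuple<Tensor, Tensor> (*)(
//       const Tensor& grad_output, const Tensor& input, const Tensor& grid,
//       int64_t interpolation_mode, int64_t padding_mode);
//   DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);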
#define DECLARE_DISPATCH(fn, name) \ struct name : DispatchStub {}; \ extern AT_API struct name name diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 1547ab2c934053..4d09307bd640e0 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -3,8 +3,9 @@ #include "ATen/Device.h" #include "ATen/Error.h" #include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" #include "ATen/native/GridSampler.h" +#include "ATen/native/cpu/GridSamplerKernel.h" +#include "ATen/cpu/vml.h" #ifdef _OPENMP #include @@ -16,6 +17,7 @@ using at::native::detail::GridSamplerInterpolation; using at::native::detail::GridSamplerPadding; namespace { + template static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); @@ -117,121 +119,10 @@ namespace { } } - template - Tensor grid_sampler_2d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates(ix, inp_W); - iy = clip_coordinates(iy, inp_H); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates(ix, inp_W); - iy = reflect_coordinates(iy, inp_H); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - scalar_t 
*out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - *out_ptr_NCHW = static_cast(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } - } - } - return output; - } - template Tensor grid_sampler_3d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { int64_t N = input.size(0); int64_t C = input.size(1); int64_t inp_D = input.size(2); @@ -395,167 +286,12 @@ namespace { return output; } - template - std::tuple - grid_sampler_2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - // If interpolation mode is Nearest, then grad_grid is not filled in the - // loop below. 
- if (interpolation_mode == GridSamplerInterpolation::Nearest) { - grad_grid.zero_(); - } - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // multipliers for gradients on ix and iy - // E.g., 0 for out-of-bound indices when GridSamplerPadding::Border - scalar_t gix_mult, giy_mult; - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = clip_coordinates_set_grad(iy, inp_H, &giy_mult); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = reflect_coordinates_set_grad(iy, inp_H, &giy_mult); - } else { // padding_mode == GridSamplerPadding::Zeros - gix_mult = static_cast(1); - giy_mult = static_cast(1); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += 
gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW); - } - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - template std::tuple grid_sampler_3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { auto grad_input = at::zeros_like(input); auto grad_grid = at::empty_like(grid); // If interpolation mode is Nearest, then grad_grid is not filled in the @@ -783,18 +519,18 @@ namespace { } return std::make_tuple(grad_input, grad_grid); } -} + +} // namespace // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { - return grid_sampler_2d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_cpu_kernel(kCPU, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); + + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { @@ -809,14 +545,11 @@ Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, std::tuple grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler_2d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_backward_cpu_kernel(kCPU, grad_output, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. std::tuple grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index f39b4e996469fa..ac9c72002c66cc 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,3 +1,5 @@ +#pragma once + #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp new file mode 100644 index 00000000000000..648defd192e117 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -0,0 +1,890 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include + +namespace at { namespace native { namespace { + +/** NOTE [ Grid Sample CPU Kernels ] + * + * Implementation of vectorized grid sample CPU kernels is divided into three + * parts. More detailed description exist after this paragraph, but on a high + * level, they are + * 1. `ComputeLocation` struct + * + Computes the interpolation location basing on padding mode. + * 2. `ApplyGridSample` struct + * + Owns N (# spatial dims) `ComputeLocation` structs, and uses them to + * compute the interpolation locations. + * + Interpolates the values and writes to output. + * 3. `grid_sample_2d_grid_slice_iterator` function + * + Iterates over a slice of the grid tensor based on the geometry by the + * spatial ordering, i.e., the first iteration will process grid values + * grid[n, 0, 0, :], grid[n, 0, 1, :], grid[n, 0, 2, :], ... + * (Recall that, e.g., 2D grid has shape [N x H x W x 2], so grid[n, ...] + * is a slice, and grid[n, h, w, :] contains the values for a single + * output spatial location.) + * + Applies a given operator at each iteration, so we can use the same + * pattern for forward and backward. + * + * Putting everything together, we have, e.g., the forward kernel implemented + * as + * + * // `ApplyGridSample` struct that processes grid values, extracts and + * // interpolates input values, and write to output. + * ApplyGridSample grid_sample(input_accessor); + * + * // For each slice, we call `grid_sample_2d_grid_slice_iterator` with + * // 1. the grid slice, and + * // 2. a lambda that takes in + * // i. location vectors (x and y for 2D) extracted from grid + * // ii. `spatial_offset` as the spatial offset of these vectors + * // from the beginning of this slice. + * // iii. `len` as the number of valid locations in the vectors. + * // (There might not be enough near boundary.) 
+ * for (int n = 0; n < input_accessor.size(0); n++) { + * grid_sample_2d_grid_slice_iterator( + * grid_accessor[n], + * [&](const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len) { + * grid_sample.forward(out_accessor[n], input_accessor[n], + * spatial_offset, grid_x, grid_y, len); + * }); + * } + * + * Now we talk about details of each of these three parts: + * + * 1. `ComputeLocation` struct + * Transforms grid values into interpolation locations of the input tensor + * for a particular spatial dimension, based on the size of that dimension + * in input tensor, and the padding mode. + * + * template + * struct ComputeLocation { + * using Vec = Vec256; + * + * // ctor + * ComputeLocation(int64_t size); + * + * // Given grid values `in`, return the interpolation locations after + * // un-normalization and padding mechanism (elementwise). + * Vec apply(const Vec &in) const; + * + * // Similar to `apply`, but also returns `d apply(in) / d in` + * // (elementwise). + * // this is often used in gradient computation. + * std::pair apply_get_grad(const Vec &in) const; + * }; + * + * 2. `ApplyGridSample` struct + * Owns N `ComputeLocation` structs, where N is the number of spatial + * dimensions. Given N input grid vectors (one for each spatial dimension) + * and spatial offset, it gets the interpolation locations from + * `ComputeLocation`s, applies interpolation procedure, and then writes to + * the output (or grad_input & grad_grid in backward). + * + * template + * struct ApplyGridSample { + * + * // ctor + * ApplyGridSample(const TensorAccessor& input); + * + * // Applies grid sampling (forward) procedure: + * // 1. computes interpolation locations from grid values `grid_x` + * // and `grid_y`, + * // 2. interpolates output values using the locations and input + * // data in `inp_slice`, and + * // 3. writes the first `len` values in the interpolated vector to + * // `out_slice` with spatial offset being `offset`. + * // + * // This assimes that `grid_x` and `grid_y` all contain valid grid + * // values \in [-1, 1], even at indices greater than `len`. + * // + * // The `*_slice` argument namess mean samples within a batch (i.e., + * // with the batch dimension sliced out). + * void forward(TensorAccessor& out_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * + * // Applies grid sampling (backward) procedure. Arguments semantics + * // and strategy are similar to those of `forward`. + * void backward(TensorAccessor& gInp_slice, + * TensorAccessor& gGrid_slice, + * const TensorAccessor& gOut_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * }; + * + * 3. `grid_sample_2d_grid_slice_iterator` function + * Among the tensors we work with, we know that the output tensors are + * contiguous (i.e., `output` in forward, and `grad_input` & `grad_grid` in + * backward), we need to randomly read `input` anyways, and `grad_output` + * usually comes from autograd and is often contiguous. So we base our + * iterating strategy on the geometry of grid. + * `grid_sample_2d_grid_slice_iterator` function provides an abstraction to + * efficiently iterates through a `grid` slice (without batch dimension). + * See comments of that function on the specific cases and strategies used. 
+ * + * template + * void grid_sample_2d_grid_slice_iterator( + * const TensorAccessor& grid_slice, + * const ApplyFn &apply_fn); + * + * `apply_fn` is a function/lambda that takes in + * i. location vectors (x and y for 2D) extracted from grid + * ii. `spatial_offset` as the spatial offset of these vectors + * from the beginning of this slice. + * iii. `len` as the number of valid locations in the vectors. + * (There might not be enough near boundary.) + + * It should be callable as if it has declaration: + * void apply_fn(const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len); + * + * `apply_fn` will be called multiple times, and together cover the entire + * output spatial space. + * + * Now you should be able tp understand everything about the implementaion of + * 2D forward kernel shown at the beginning of this note. + * + **/ + + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; +using namespace at::vec256; + + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ComputeLocation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to compute interpolation location from grid values, and to apply +// padding mechanism (e.g., reflection). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +struct ComputeLocationBase { + using Vec = Vec256; + + const scalar_t half_max_val; + + ComputeLocationBase(int64_t size) + : half_max_val(static_cast(size - 1) / 2) {} + + inline Vec unnormalize(const Vec &in) const { + return (in + Vec(1)) * Vec(half_max_val); + } +}; + +template +struct ComputeLocation; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + using ComputeLocationBase::ComputeLocationBase; + + inline Vec apply(const Vec &in) const { + return unnormalize(in); + } + + inline std::pair apply_get_grad(const Vec &in) const { + return std::make_pair(unnormalize(in), Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + const scalar_t max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , max_val(static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + return min(Vec(max_val), max(unnormalize(in), Vec(0))); + } + inline std::pair apply_get_grad(const Vec &in) const { + using int_t = int_same_size_t; + Vec max_val_vec(max_val), zeros(0); + auto indices = unnormalize(in); + auto bounded_lo = max(indices, zeros); + // Integral type equality comparison is very very fast because it just looks + // at the bits. Casting is free too. So we use the following pattern instead + // of comparison + blendv. 
+ auto in_bound_lo = cast(cast(bounded_lo) == cast(indices)); + auto res = min(bounded_lo, max_val_vec); + auto in_bound_hi = cast(cast(res) == cast(indices)); + return std::make_pair(res, (in_bound_lo & in_bound_hi) & Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + bool unit_size; // whether size == 1, just return 0 in this case + const scalar_t double_max_val; + const scalar_t neg_half_max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , unit_size(size == 1) + , double_max_val(static_cast((size - 1) * 2)) + , neg_half_max_val(-0.5 * static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + if (unit_size) { + return Vec(0); + } + Vec double_max_val_vec(double_max_val); + auto abs_in = unnormalize(in).abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + auto extra = abs_in - double_flips * double_max_val_vec; + // Now we need to test if extra > max_val to find out if another flip is + // needed. The following comparison does that and returns the correct + // flipped value. + return min(extra, double_max_val_vec - extra); + } + + inline std::pair apply_get_grad(const Vec &in) const { + if (unit_size) { + return std::make_pair(Vec(0), Vec(0)); + } + Vec double_max_val_vec(double_max_val); + auto unnorm_in = unnormalize(in); + auto neg_in = unnorm_in < Vec(0); + auto abs_in = unnorm_in.abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + + auto extra = abs_in - double_flips * double_max_val_vec; + auto reflected_extra = double_max_val_vec - extra; + auto one_more_flip = extra > reflected_extra; + + return std::make_pair( + Vec::blendv(extra, reflected_extra, one_more_flip), + Vec::blendv(Vec(half_max_val), Vec(neg_half_max_val), one_more_flip ^ neg_in) + ); + } +}; + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ApplyGridSample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to apply grid sample (reading from input, interpolate, and write to +// output). +// See NOTE [ Grid Sample CPU Kernels ] for details. 
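
// Before the ApplyGridSample machinery below, a scalar restatement of what the
// three ComputeLocation specializations above compute per element may help.
// This sketch is not part of the patch; the helper names (loc_zeros,
// loc_border, loc_reflection) are illustrative only, it assumes the same
// unnormalization (x + 1) * (size - 1) / 2 used in ComputeLocationBase, and it
// would need <algorithm> and <cmath> if compiled:
//
//   float loc_zeros(float x, int64_t size) {           // Zeros padding:
//     return (x + 1) * (size - 1) / 2;                 // unnormalize only;
//   }                                                  // OOB reads are masked later
//
//   float loc_border(float x, int64_t size) {          // Border padding:
//     float u = (x + 1) * (size - 1) / 2;              // clip to [0, size - 1]
//     return std::min(std::max(u, 0.0f), float(size - 1));
//   }
//
//   float loc_reflection(float x, int64_t size) {      // Reflection padding:
//     if (size == 1) return 0.0f;
//     float m = 2.0f * (size - 1);                     // reflection period
//     float u = std::fabs((x + 1) * (size - 1) / 2);
//     float extra = u - std::trunc(u / m) * m;         // strip whole flips
//     return std::min(extra, m - extra);               // one more flip if needed
//   }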
+ +template +static inline void +mask_scatter_add(const scalar_t *src, scalar_t* base_addr, + const int_same_size_t *offsets, + const int_same_size_t *mask, int64_t len) { + #pragma unroll + for (int64_t i = 0; i < len; i++) { + if (mask[i] & 0x01) { + base_addr[offsets[i]] += src[i]; + } + } +} + +template +struct ApplyGridSample; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline std::tuple< + Vec, Vec, Vec, Vec, // distances to 4 sides + Vec, Vec, Vec, Vec, // interpolation weights wrt 4 corners + Vec, Vec, Vec, Vec, // in_bound masks + iVec, iVec // y_n and x_w + > + compute_interp_params(const Vec& x, const Vec& y) const { + // get NE, NW, SE, SW pixel values from (x, y) + // assuming we get exact integer representation and just use scalar_t + // if we don't, the weights will be garbage anyways. + auto x_w = x.floor(); + auto y_n = y.floor(); + + // get distances to each side + auto w = x - x_w; + auto e = Vec(1) - w; + auto n = y - y_n; + auto s = Vec(1) - n; + + // get interpolation weights for each neighbor + // e.g., for the nw corder, the weight is `dist_to_south * dist_to_east`. + auto nw = s * e; + auto ne = s * w; + auto sw = n * e; + auto se = n * w; + + auto i_x_w = convert_to_int_of_same_size(x_w); + auto i_y_n = convert_to_int_of_same_size(y_n); + auto i_x_e = i_x_w + iVec(1); + auto i_y_s = i_y_n + iVec(1); + + // Use int comparison because it is much faster than float comp with AVX2 + // (latency 1 cyc vs. 4 cyc on skylake) + // Avoid using the le and ge because those are not implemented in AVX2 and + // are actually simulated using multiple instructions. + auto w_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_x_w > iVec(-1)) & (i_x_w < iVec(inp_W)); + auto n_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_y_n > iVec(-1)) & (i_y_n < iVec(inp_H)); + auto e_mask = must_in_bound ? (i_x_e < iVec(inp_W)) + : (i_x_e > iVec(-1)) & (i_x_e < iVec(inp_W)); + auto s_mask = must_in_bound ? (i_y_s < iVec(inp_H)) + : (i_y_s > iVec(-1)) & (i_y_s < iVec(inp_H)); + auto nw_mask = cast(must_in_bound ? 
iVec(-1) : (w_mask & n_mask)); + auto ne_mask = cast(e_mask & n_mask); + auto sw_mask = cast(w_mask & s_mask); + auto se_mask = cast(e_mask & s_mask); + + return std::make_tuple( + n, s, w, e, + nw, ne, sw, se, + nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w); + } + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto interp_params = compute_interp_params(x, y); + + auto nw = std::get<4>(interp_params); + auto ne = std::get<5>(interp_params); + auto sw = std::get<6>(interp_params); + auto se = std::get<7>(interp_params); + + auto nw_mask = std::get<8>(interp_params); + auto ne_mask = std::get<9>(interp_params); + auto sw_mask = std::get<10>(interp_params); + auto se_mask = std::get<11>(interp_params); + + auto i_y_n = std::get<12>(interp_params); + auto i_x_w = std::get<13>(interp_params); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + auto interpolated = (nw_val * nw) + (ne_val * ne) + (sw_val * sw) + (se_val * se); + interpolated.store(out_slice[c].data() + offset, len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + Vec x, y, gx_mult, gy_mult; + std::tie(x, gx_mult) = compute_W.apply_get_grad(grid_x); + std::tie(y, gy_mult) = compute_H.apply_get_grad(grid_y); + + Vec n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask; + iVec i_y_n, i_x_w; + + std::tie( + n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w) = compute_interp_params(x, y); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + auto i_gInp_nw_offset = i_y_n * iVec(inp_W) + i_x_w; + auto i_gInp_ne_offset = i_gInp_nw_offset + iVec(1); + auto i_gInp_sw_offset = i_gInp_nw_offset + iVec(inp_W); + auto i_gInp_se_offset = i_gInp_sw_offset + iVec(1); + + // When reading input values, we used mask_gather. Unfortunately, there is + // no mask_scatter_add (the backward of mask_gather) in Intel intrinsics. + // So we store the necessary vectors to temporary arrays and use the helper + // mask_scatter_add defined above. 
+ + integer_t i_gInp_nw_offset_arr[iVec::size]; + integer_t i_gInp_ne_offset_arr[iVec::size]; + integer_t i_gInp_sw_offset_arr[iVec::size]; + integer_t i_gInp_se_offset_arr[iVec::size]; + i_gInp_nw_offset.store(i_gInp_nw_offset_arr); + i_gInp_ne_offset.store(i_gInp_ne_offset_arr); + i_gInp_sw_offset.store(i_gInp_sw_offset_arr); + i_gInp_se_offset.store(i_gInp_se_offset_arr); + + integer_t i_nw_mask_arr[iVec::size]; + integer_t i_ne_mask_arr[iVec::size]; + integer_t i_sw_mask_arr[iVec::size]; + integer_t i_se_mask_arr[iVec::size]; + nw_mask.store(i_nw_mask_arr); + ne_mask.store(i_ne_mask_arr); + sw_mask.store(i_sw_mask_arr); + se_mask.store(i_se_mask_arr); + + scalar_t gInp_corner_arr[Vec::size]; + + auto gx = Vec(0), gy = Vec(0); + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + auto gInp_slice_C_ptr = gInp_slice[c].data(); + auto gOut = Vec::loadu(gOut_slice[c].data() + offset, len); + + (nw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_nw_offset_arr, i_nw_mask_arr, len); + (ne * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_ne_offset_arr, i_ne_mask_arr, len); + (sw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_sw_offset_arr, i_sw_mask_arr, len); + (se * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_se_offset_arr, i_se_mask_arr, len); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + gx = gx + ((ne_val - nw_val) * s + (se_val - sw_val) * n) * gOut; + gy = gy + ((sw_val - nw_val) * e + (se_val - ne_val) * w) * gOut; + } + + gx = gx * gx_mult; + gy = gy * gy_mult; + + constexpr int64_t step = Vec::size; + auto interleaved_gGrid = interleave2(gx, gy); + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::get<0>(interleaved_gGrid).store(gGrid_ptr, + std::min(len * 2, step)); + std::get<1>(interleaved_gGrid).store(gGrid_ptr + step, + std::max(static_cast(0), len * 2 - step)); + } +}; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto 
i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + auto mask = cast(i_mask); + + auto i_offset = i_y_nearest * iVec(inp_sH) + i_x_nearest * iVec(inp_sW); + + auto out_ptr = out_slice.data() + offset; + auto out_sC = out_slice.stride(0); + auto inp_slice_ptr = inp_slice.data(); + #pragma unroll + for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) { + // mask_gather zeros out the mask, so we need to make a copy + auto mask_copy = mask; + auto inp_val = mask_gather(Vec(0), inp_slice_ptr, i_offset, mask_copy); + inp_val.store(static_cast(out_ptr), len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + + auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest; // gInp is contiguous + + integer_t mask_arr[iVec::size]; + i_mask.store(mask_arr); + integer_t gInp_offset_arr[iVec::size]; + i_gInp_offset.store(gInp_offset_arr); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(), + gInp_offset_arr, mask_arr, len); + } + + // grid has zero 0 gradient in Nearest mode + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::memset(gGrid_ptr, 0, sizeof(scalar_t) * len * 2); + } +}; + +// ~~~~~~~~~~~~~~~~~~ grid_sample_2d_grid_slice_iterator ~~~~~~~~~~~~~~~~~~~~~~ +// Function to apply a vectorized function on a grid slice tensor (without batch +// dimension). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +static inline void grid_sample_2d_grid_slice_iterator( + const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { + int64_t out_H = grid_slice.size(0); + int64_t out_W = grid_slice.size(1); + int64_t grid_sH = grid_slice.stride(0); + int64_t grid_sW = grid_slice.stride(1); + int64_t grid_sCoor = grid_slice.stride(2); + auto grid_ptr = grid_slice.data(); + + using Vec = Vec256; + using iVec = Vec256>; + constexpr int64_t step = Vec::size; + + // Loop over each output pixel in grid. + // We consider the following three cases (after slicing out the batch + // dimension). + // See detailed discussions under each if-case. + + if (at::geometry_is_contiguous({out_H, out_W, 2}, {grid_sH, grid_sW, grid_sCoor})) { + // Case 1: + // Grid is contiguous. + // Strategy: Sequentially load two vectors at the same time, and get, + // e.g., {x0, y0, x1, y1}, {x2, y2, x3, y3}. Then we use + // at::vec256::deinterleave2 to get x and y vectors. 
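    // (Illustration of the strategy described above, using hypothetical
    // values: a contiguous grid slice stores interleaved coordinate pairs
    //   grid_ptr: x0 y0 x1 y1 | x2 y2 x3 y3 | ...
    // so each iteration loads two vectors covering `step` output locations,
    // and deinterleave2({x0,y0,x1,y1}, {x2,y2,x3,y3}) yields
    // x = {x0,x1,x2,x3} and y = {y0,y1,y2,y3} -- shown here for step == 4;
    // the actual step is Vec256<scalar_t>::size.)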
+ auto total_size = out_H * out_W; + for (int64_t spatial_offset = 0; spatial_offset < total_size; spatial_offset += step) { + auto grid_offset = spatial_offset * 2; + auto len = std::min(step, total_size - spatial_offset); + auto vec1 = Vec::loadu(grid_ptr + grid_offset, + std::min(step, len * 2)); + auto vec2 = Vec::loadu(grid_ptr + grid_offset + step, + std::max(static_cast(0), len * 2 - step)); + auto vec_xy_pair = deinterleave2(vec1, vec2); + + auto x = std::get<0>(vec_xy_pair); + auto y = std::get<1>(vec_xy_pair); + + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, spatial_offset, len); + } + } else if (grid_sW == 1 || out_W == 1) { + // Case 2: + // The W dimension is contiguous. + // This can be common, e.g., grid is from a conv net output of shape + // [N, 2, H, W]. + // Strategy: Divide into two contiguous slices each of shape [H, W], and + // each containing x and y vectors. So we sequentially load a + // vector from each of them to get x and y vector + + // Function to apply along a contiguous W dimension (or flattened H x W). + auto line_fn = [&](const scalar_t *grid_ptr_x, const scalar_t *grid_ptr_y, + int64_t out_base_offset, int64_t total_size) { + for (int64_t i = 0; i < total_size; i += step) { + auto len = std::min(step, total_size - i); + auto x = Vec::loadu(grid_ptr_x + i, len); + auto y = Vec::loadu(grid_ptr_y + i, len); + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, out_base_offset + i, len); + } + }; + + if (at::geometry_is_contiguous({out_H, out_W}, {grid_sH, grid_sW})) { + // If [H, W] is contiguous, apply line_fn once. + line_fn(grid_ptr, grid_ptr + grid_sCoor, 0, out_H * out_W); + } else { + // If only [W] is contiguous, apply line_fn once for each h slice. + auto grid_ptr_NH = grid_ptr; + for (int64_t h = 0; h < out_H; h++) { + line_fn(grid_ptr_NH, grid_ptr_NH + grid_sCoor, h * out_W, out_W); + grid_ptr_NH += grid_sH; + } + } + } else { + // Case 3: + // General case. + // Strategy: Do a for-loop over H, for each W slice, use + // at::vec256::gather to load the x and y vectors. + auto spatial_offset = 0; + auto i_offsets_delta = iVec(grid_sW * step); + + #pragma unroll + for (int64_t h = 0; h < out_H; h++) { + auto grid_ptr_x = grid_ptr + h * grid_sH; + auto grid_ptr_y = grid_ptr_x + grid_sCoor; + auto i_offsets = iVec::arange(0, grid_sW); + #pragma unroll + for (int64_t w = 0; w < out_W; w += step) { + auto len = std::min(step, out_W - w); + if (len < step) { + // prevents illegal memory access, sets the exceeding offsets to zero + i_offsets = iVec::set(iVec(0), i_offsets, len); + } + apply_fn(gather(grid_ptr_x, i_offsets), + gather(grid_ptr_y, i_offsets), + spatial_offset, len); + + i_offsets = i_offsets + i_offsets_delta; + spatial_offset += len; + } + } + } +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~ Grid Sample Kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Use the structs & functions defined above to calculate grid sample forward +// and backward. +// See NOTE [ Grid Sample CPU Kernels ] for details. 
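
// (How the kernels below are wired into at::native, condensed from the other
// files touched by this patch: GridSamplerKernel.h declares the dispatch
// stubs, this file registers the CPU implementations, and GridSampler.cpp
// defines and calls the stubs. For the backward kernel, for example:
//
//   // GridSamplerKernel.h
//   DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);
//
//   // GridSampler.cpp
//   DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel);
//   return grid_sampler_2d_backward_cpu_kernel(kCPU, grad_output, input, grid,
//                                              interpolation_mode, padding_mode);
//
//   // this file, at the bottom
//   REGISTER_DISPATCH(grid_sampler_2d_backward_cpu_kernel,
//                     &grid_sampler_2d_backward_cpu_kernel_impl);
//
// The forward kernel is hooked up analogously.)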
+ +Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto output = at::empty({N, input.size(1), H, W}, input.options()); + auto spatial_size = H * W; + auto grain_size = spatial_size == 0 ? (N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto out_slice = out_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.forward(out_slice, inp_slice, spatial_offset, \ + grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_cpu_kernel_impl", [&] { + auto out_acc = output.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return output; +} + +std::tuple +grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + // grad_output should be contiguous most of time. Ensuring that it is + // contiguous can greatly simplify this code. + auto grad_output = grad_output_.contiguous(); + + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + auto N = input.size(0); + auto spatial_size = grid.size(1) * grid.size(2); + auto grain_size = spatial_size == 0 ? 
(N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 10 /* 2d * 5 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto gInp_slice = gInp_acc[n]; \ + auto gGrid_slice = gGrid_acc[n]; \ + auto gOut_slice = gOut_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.backward(gInp_slice, gGrid_slice, gOut_slice, inp_slice, \ + spatial_offset, grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] { + auto gInp_acc = grad_input.accessor(); + auto gGrid_acc = grad_grid.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + auto gOut_acc = grad_output.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return std::make_tuple(grad_input, grad_grid); +} + +} + +REGISTER_DISPATCH(grid_sampler_2d_cpu_kernel, &grid_sampler_2d_cpu_kernel_impl); +REGISTER_DISPATCH(grid_sampler_2d_backward_cpu_kernel, &grid_sampler_2d_backward_cpu_kernel_impl); + + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 00000000000000..36ba3a91cc9bb8 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/native/DispatchStub.h" +#include "ATen/cpu/vml.h" + +#include + +namespace at { namespace native { + +using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t); +using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); + +}} // namespace at::native diff --git a/test/test_nn.py b/test/test_nn.py index 809b820c42202e..ac2496620c3977 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -5097,34 +5097,55 @@ def test_grid_sample_error_checking(self): def test_grid_sample(self): def test(N, C, H, W, mode, padding_mode): def test_shape(N, C, IH, IW, H, W, mode, padding_mode): - input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() - grid_cpu = torch.randn(H, N, W, 2).transpose(0, 1).requires_grad_() - out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) - self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) + for grid_dim_contig_order in [(0, 1, 2, 3), (0, 3, 1, 2), (3, 0, 1, 2), (0, 2, 1, 3)]: + # grid_dim_contig_order specifies the dimension order that can + # make grid to be contiguous. + # i.e., grid.permute(grid_dim_contig_order) is contiguous. 
+ # e.g., with grid_dim_contig_order=[0, 3, 1, 2], grid should be + # initialized with contiguous tensor of shape [N, 2, H, W] + # and permuted to [N, H, W, 2] afterwards. + grid_shape = [N, H, W, 2] + grid_init_shape = [grid_shape[d] for d in grid_dim_contig_order] + grid_fwd_permute = [None, None, None, None] + for i, d in enumerate(grid_dim_contig_order): + grid_fwd_permute[d] = i + + def get_grid(device='cpu', data=None): + if data is not None: + assert list(data.shape) == grid_shape + data = data.permute(grid_dim_contig_order).to(device) + else: + data = torch.randn(grid_init_shape, device=device) + grid = data.permute(grid_fwd_permute) + assert grid.permute(grid_dim_contig_order).is_contiguous() + return grid + + input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() + grid_cpu = get_grid().requires_grad_() + out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) + self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) - gradients = torch.randn_like(out_cpu) - out_cpu.backward(gradients) + gradients = torch.randn_like(out_cpu) + out_cpu.backward(gradients) - if TEST_CUDA: - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) - self.assertEqual(out_cpu, out_cuda) + if TEST_CUDA: + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + grid_cuda = get_grid('cuda', grid_cpu.detach()).requires_grad_() + out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) + self.assertEqual(out_cpu, out_cuda) - out_cuda.backward(gradients.cuda()) - self.assertEqual(input_cpu.grad, input_cuda.grad) - self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) + out_cuda.backward(gradients.cuda()) + self.assertEqual(input_cpu.grad, input_cuda.grad) + self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) - # check that zero-dimensional input strides don't error out - base_input = torch.randn(N, C, 1, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() - grid_cpu = torch.randn(N, H, W, 2, requires_grad=True) - out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) + # check that zero-dimensional input strides don't error out + base_input = torch.randn(N, C, 1, IW) + input_cpu = base_input.expand_as(input_cuda).requires_grad_() + out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() - grid_cuda = grid_cpu.detach().cuda().requires_grad_() - out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) - self.assertEqual(out_cpu, out_cuda) + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode) + self.assertEqual(out_cpu, out_cuda) # test same size output test_shape(N, C, H, W, H, W, mode, padding_mode) @@ -5182,15 +5203,11 @@ def test_shape(N, C, IH, IW, H, W, mode, padding_mode): for mode in ('bilinear', 'nearest'): for padding_mode in ('zeros', 'border', 'reflection'): - # test known input on CPU input = torch.arange(1., 11).view(1, 1, 2, 5) grid = torch.tensor( - [[-0.9, -4.1, 0, 0.2, 1], - [-1, -0.333, 0, 0.5, 1], - [-1, -0.5, 0, 0.3333, 1], - [-1, -0.2, 0, 1.5, 0.5]]).view(1, 2, 5, 2) - output = 
F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode) + [[[-0.9, -4.1], [0, 0.2000], [1, -1], [-0.333, 1e-10], [0.5, 1.0]], + [[-1.0, -0.5], [0, 0.3333], [1, -1], [-0.200, 1e-10], [1.5, 0.5]]]).view(1, 2, 5, 2) if mode == 'bilinear': if padding_mode == 'zeros': groundtruth = torch.tensor( @@ -5223,7 +5240,10 @@ def test_shape(N, C, IH, IW, H, W, mode, padding_mode): assert False, "missing groundtruth test for padding mode '{}'".format(padding_mode) else: assert False, "missing groundtruth test for interpolation mode '{}'".format(mode) - self.assertEqual(output, groundtruth) + output = F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode) + self.assertEqual(output, groundtruth, + "groundtruth comparison failed for mode={}, " + "padding_mode={}".format(mode, padding_mode)) # do gradcheck N = random.randint(2, 8) From f5bc2aef0790c82f002be79db718203284de531e Mon Sep 17 00:00:00 2001 From: JerryShih Date: Mon, 17 Sep 2018 07:44:08 -0700 Subject: [PATCH 181/237] Update OpenMP cmake setting for xcode 9 compiler(AppleClang 9.0) (#11563) Summary: Fix the link OpenMP link error for AppleClang 9.0 compiler. Built with the following command: python setup.py build develop The error message: ``` Undefined symbols for architecture x86_64: "___kmpc_critical", referenced from: _THFloatTensor_addmm in THTensorMath.cpp.o _THDoubleTensor_addmm in THTensorMath.cpp.o _THByteTensor_addmm in THTensorMath.cpp.o _THCharTensor_addmm in THTensorMath.cpp.o _THShortTensor_addmm in THTensorMath.cpp.o _THIntTensor_addmm in THTensorMath.cpp.o _THLongTensor_addmm in THTensorMath.cpp.o ... "___kmpc_end_critical", referenced from: _THFloatTensor_addmm in THTensorMath.cpp.o _THDoubleTensor_addmm in THTensorMath.cpp.o _THByteTensor_addmm in THTensorMath.cpp.o _THCharTensor_addmm in THTensorMath.cpp.o _THShortTensor_addmm in THTensorMath.cpp.o _THIntTensor_addmm in THTensorMath.cpp.o _THLongTensor_addmm in THTensorMath.cpp.o ... "___kmpc_end_reduce_nowait", referenced from: _.omp_outlined..270 in THTensorMoreMath.cpp.o _.omp_outlined..271 in THTensorMoreMath.cpp.o _.omp_outlined..273 in THTensorMoreMath.cpp.o _.omp_outlined..275 in THTensorMoreMath.cpp.o _.omp_outlined..43 in THTensorEvenMoreMath.cpp.o _.omp_outlined..44 in THTensorEvenMoreMath.cpp.o _.omp_outlined..46 in THTensorEvenMoreMath.cpp.o ... "___kmpc_end_serialized_parallel", referenced from: at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::log_softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::native::log_softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::TensorIterator::for_each(std::__1::function const&) in TensorIterator.cpp.o ... "___kmpc_for_static_fini", referenced from: _.omp_outlined..9 in Embedding.cpp.o _.omp_outlined. in EmbeddingBag.cpp.o _.omp_outlined. in GridSampler.cpp.o _.omp_outlined..42 in GridSampler.cpp.o _.omp_outlined..44 in GridSampler.cpp.o _.omp_outlined..45 in GridSampler.cpp.o _.omp_outlined..47 in GridSampler.cpp.o ... "___kmpc_for_static_init_4", referenced from: _.omp_outlined. 
in init.cpp.o _.omp_outlined..35 in init.cpp.o _.omp_outlined..36 in init.cpp.o _.omp_outlined..37 in init.cpp.o _.omp_outlined..49 in init.cpp.o _.omp_outlined..52 in init.cpp.o _.omp_outlined..220 in init.cpp.o ... "___kmpc_for_static_init_8", referenced from: _.omp_outlined..9 in Embedding.cpp.o _.omp_outlined. in EmbeddingBag.cpp.o _.omp_outlined. in GridSampler.cpp.o _.omp_outlined..42 in GridSampler.cpp.o _.omp_outlined..44 in GridSampler.cpp.o _.omp_outlined..45 in GridSampler.cpp.o _.omp_outlined..47 in GridSampler.cpp.o ... "___kmpc_for_static_init_8u", referenced from: _.omp_outlined..203 in init.cpp.o _.omp_outlined..207 in init.cpp.o _.omp_outlined..209 in init.cpp.o _.omp_outlined..210 in init.cpp.o "___kmpc_fork_call", referenced from: at::native::embedding_dense_backward_cpu(at::Tensor const&, at::Tensor const&, long long, long long, bool) in Embedding.cpp.o at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::grid_sampler_2d_cpu(at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o at::native::grid_sampler_3d_cpu(at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o at::native::grid_sampler_2d_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o at::native::grid_sampler_3d_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, long long) in GridSampler.cpp.o ... "___kmpc_global_thread_num", referenced from: at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::log_softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::native::log_softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::TensorIterator::for_each(std::__1::function const&) in TensorIterator.cpp.o ... 
"___kmpc_push_num_threads", referenced from: void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::OuterStride<-1> > const>, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::OuterStride<-1> > const>, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Transpose const, 0, 
Eigen::OuterStride<-1> > const>, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Transpose const, 0, Eigen::OuterStride<-1> > const>, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o ... "___kmpc_reduce_nowait", referenced from: _.omp_outlined..270 in THTensorMoreMath.cpp.o _.omp_outlined..271 in THTensorMoreMath.cpp.o _.omp_outlined..273 in THTensorMoreMath.cpp.o _.omp_outlined..275 in THTensorMoreMath.cpp.o _.omp_outlined..43 in THTensorEvenMoreMath.cpp.o _.omp_outlined..44 in THTensorEvenMoreMath.cpp.o _.omp_outlined..46 in THTensorEvenMoreMath.cpp.o ... "___kmpc_serialized_parallel", referenced from: at::native::embedding_renorm_cpu_(at::Tensor&, at::Tensor const&, double, double) in Embedding.cpp.o at::native::_embedding_bag_dense_backward_cpu(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, long long, bool, long long) in EmbeddingBag.cpp.o at::native::softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::log_softmax_cpu(at::Tensor const&, long long) in SoftMax.cpp.o at::native::softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::native::log_softmax_backward_cpu(at::Tensor const&, at::Tensor const&, long long, at::Tensor const&) in SoftMax.cpp.o at::TensorIterator::for_each(std::__1::function const&) in TensorIterator.cpp.o ... "_omp_get_max_threads", referenced from: _THGetNumThreads in THGeneral.cpp.o caffe2::Caffe2SetOpenMPThreads(int*, char***) in init_omp.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::Stride<0, 0> >, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Transpose const, 0, 
Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Transpose const, 0, Eigen::Stride<0, 0> > const>, Eigen::Map, 0, Eigen::Stride<0, 0> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o void Eigen::internal::parallelize_gemm, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> >, long>(Eigen::internal::gemm_functor, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map const, 0, Eigen::OuterStride<-1> >, Eigen::Map, 0, Eigen::OuterStride<-1> >, Eigen::internal::gemm_blocking_space<0, float, float, -1, -1, -1, 1, false> > const&, long, long, long, bool) in math_cpu.cc.o ... "_omp_get_num_procs", referenced from: _THGetNumCores in THGeneral.cpp.o "_omp_get_num_threads", referenced from: _.omp_outlined. in Embedding.cpp.o _.omp_outlined. in SoftMax.cpp.o _.omp_outlined..35 in SoftMax.cpp.o _.omp_outlined..37 in SoftMax.cpp.o _.omp_outlined..38 in SoftMax.cpp.o _.omp_outlined..46 in SoftMax.cpp.o _.omp_outlined..47 in SoftMax.cpp.o ... "_omp_get_thread_num", referenced from: _.omp_outlined. in Embedding.cpp.o _.omp_outlined. in SoftMax.cpp.o _.omp_outlined..35 in SoftMax.cpp.o _.omp_outlined..37 in SoftMax.cpp.o _.omp_outlined..38 in SoftMax.cpp.o _.omp_outlined..46 in SoftMax.cpp.o _.omp_outlined..47 in SoftMax.cpp.o ... "_omp_in_parallel", referenced from: _THFloatTensor_copy in THTensorCopy.cpp.o _THDoubleTensor_copy in THTensorCopy.cpp.o _THByteTensor_copy in THTensorCopy.cpp.o _THCharTensor_copy in THTensorCopy.cpp.o _THShortTensor_copy in THTensorCopy.cpp.o _THIntTensor_copy in THTensorCopy.cpp.o _THLongTensor_copy in THTensorCopy.cpp.o ... 
"_omp_set_num_threads", referenced from: _THSetNumThreads in THGeneral.cpp.o caffe2::Caffe2SetOpenMPThreads(int*, char***) in init_omp.cc.o ld: symbol(s) not found for architecture x86_64 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11563 Differential Revision: D9831398 Pulled By: ezyang fbshipit-source-id: db119d3f9c26a71180335ad955f2f62c5369f9ed --- caffe2/CMakeLists.txt | 8 +++++ cmake/Dependencies.cmake | 65 +++++++++++++++++----------------------- torch/CMakeLists.txt | 13 ++++++-- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4e9b3eef8937f4..361ce228f3227a 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -206,6 +206,14 @@ if (${CAFFE2_LINK_LOCAL_PROTOBUF}) else() target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) endif() +#cmake only check for separate OpenMP library on AppleClang 7+ +#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 +if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") + if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") + target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY}) + endif() +endif() target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index dbec0d07ebf6e3..8fdfbd8ca8eb4a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -456,12 +456,37 @@ endif() # ---[ OpenMP if(USE_OPENMP) - find_package(OpenMP) + set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") + if(APPLE AND CMAKE_COMPILER_IS_GNUCC) + exec_program(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") + if(DARWIN_VERSION GREATER 9) + set(APPLE_OPENMP_SUCKS 1) + endif(DARWIN_VERSION GREATER 9) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION) + if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) + message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") + message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") + add_compile_options(-Wno-unknown-pragmas) + set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) + endif() + endif() + + if(WITH_OPENMP AND NOT CHECKED_OPENMP) + find_package(OpenMP) + set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") + + # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) + # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake + set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") + endif() + if(OPENMP_FOUND) message(STATUS "Adding " ${OpenMP_CXX_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") else() message(WARNING "Not compiling with OpenMP. 
Suppress this warning with -DUSE_OPENMP=OFF") caffe2_update_option(USE_OPENMP OFF) @@ -956,42 +981,6 @@ if (NOT BUILD_ATEN_MOBILE) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG}) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE}) ENDIF() - - # OpenMP support? - SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") - IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) - EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") - IF (DARWIN_VERSION GREATER 9) - SET(APPLE_OPENMP_SUCKS 1) - ENDIF (DARWIN_VERSION GREATER 9) - EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) - MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") - MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") - add_compile_options(-Wno-unknown-pragmas) - SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) - ENDIF() - ENDIF() - - IF (WITH_OPENMP AND NOT CHECKED_OPENMP) - FIND_PACKAGE(OpenMP) - SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") - - # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) - # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake - SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") - ENDIF() - - IF (OPENMP_FOUND) - MESSAGE(STATUS "Compiling with OpenMP support") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - ENDIF() - - SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) FIND_PACKAGE(MAGMA) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7679f3d6467ba0..706ff0c2d7091b 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -3,7 +3,7 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) return() endif() else() - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + cmake_minimum_required(VERSION 3.5 FATAL_ERROR) project(torch CXX C) find_package(Caffe2 REQUIRED) option(USE_CUDA "Use CUDA" ON) @@ -322,8 +322,15 @@ if(OPENMP_FOUND) if (VERBOSE) message(STATUS "Compiling with OpenMP") endif() - target_compile_options(torch INTERFACE -fopenmp) - target_link_libraries(torch -fopenmp) + target_compile_options(torch INTERFACE ${OpenMP_CXX_FLAGS}) + #cmake only check for separate OpenMP library on AppleClang 7+ + #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 + if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") + if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") + target_link_libraries(torch ${OpenMP_libomp_LIBRARY}) + endif() + endif() endif() if (NOT NO_API AND NOT USE_ROCM) From d63bb72d89723d0d913ba38b7a98e288236391b0 Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 17 Sep 2018 08:12:40 -0700 Subject: [PATCH 182/237] Remove symbol export annotations in THC/generic/*.cu (#11367) Summary: We use these annotations during function declarations, not definitions. See the description of compiler error [C2491](https://msdn.microsoft.com/en-us/library/62688esh.aspx) for more details. 
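
As a concrete illustration of the pattern this change enforces, here is a minimal sketch using a hypothetical MY_API macro standing in for THC's export macro (not code from the repository): the import/export annotation belongs on the declaration in the header, and when such a macro expands to __declspec(dllimport) on a function definition, MSVC raises C2491. That is why the definitions in these .cu files drop the annotation while the declarations keep it.

```
#include <cstdint>

// Hypothetical export macro, analogous in shape to THC_API.
#ifdef _WIN32
#  ifdef BUILDING_MYLIB
#    define MY_API __declspec(dllexport)
#  else
#    define MY_API __declspec(dllimport)  // on a definition this triggers C2491
#  endif
#else
#  define MY_API
#endif

// Header: annotate the declaration.
MY_API void my_fill(float* data, int64_t n, float value);

// Source file (e.g., a .cu/.cpp): define without the annotation.
void my_fill(float* data, int64_t n, float value) {
  for (int64_t i = 0; i < n; ++i) {
    data[i] = value;
  }
}
```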
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11367 Reviewed By: ezyang Differential Revision: D9697923 Pulled By: orionr fbshipit-source-id: 1e539c02957851386f887e6d0510ce83117a1695 --- aten/src/THC/THCTensorSort.cuh | 2 + aten/src/THC/generic/THCStorage.cu | 2 +- aten/src/THC/generic/THCTensor.cu | 2 +- aten/src/THC/generic/THCTensorCopy.cu | 13 ++- aten/src/THC/generic/THCTensorMasked.cu | 30 +++---- aten/src/THC/generic/THCTensorMath.cu | 12 +-- aten/src/THC/generic/THCTensorMathBlas.cu | 28 +++--- aten/src/THC/generic/THCTensorMathCompare.cu | 24 ++--- aten/src/THC/generic/THCTensorMathCompareT.cu | 36 +++----- aten/src/THC/generic/THCTensorMathMagma.cu | 28 +++--- aten/src/THC/generic/THCTensorMathPairwise.cu | 41 +++------ .../src/THC/generic/THCTensorMathPointwise.cu | 60 +++++-------- aten/src/THC/generic/THCTensorMathReduce.cu | 87 ++++++++----------- aten/src/THC/generic/THCTensorMode.cu | 42 ++++----- aten/src/THC/generic/THCTensorRandom.cu | 42 ++++----- aten/src/THC/generic/THCTensorSort.cu | 18 ++-- aten/src/THC/generic/THCTensorSort.h | 7 ++ aten/src/THC/generic/THCTensorTopK.cu | 10 +-- 18 files changed, 205 insertions(+), 279 deletions(-) diff --git a/aten/src/THC/THCTensorSort.cuh b/aten/src/THC/THCTensorSort.cuh index 9b75a7355a00f4..b8feedcc1f21c9 100644 --- a/aten/src/THC/THCTensorSort.cuh +++ b/aten/src/THC/THCTensorSort.cuh @@ -1,6 +1,8 @@ #ifndef THC_TENSORSORT_CUH #define THC_TENSORSORT_CUH +#include "THCTensorMath.h" +#include "THCGeneral.h" #include "THCReduceApplyUtils.cuh" #include "THCSortUtils.cuh" #include "THCTensorCopy.h" diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 88ed2e5541820e..cd12d2bcae0fb6 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -18,7 +18,7 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) THCStorage_resize(state, self, size); } -THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { +int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { return THCStorage_getDevice(state, storage); } diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu index 98478341575c75..0c694f5e4e25c8 100644 --- a/aten/src/THC/generic/THCTensor.cu +++ b/aten/src/THC/generic/THCTensor.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensor.cu" #else -THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { +int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { return THCTensor_getDevice(state, tensor); } diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 25ded22747441f..2c05c74f247a48 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorCopy.cu" #else -THC_API void -THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { if (dst == src) return; THC_copyTensor(state, dst, src); } @@ -51,16 +50,14 @@ void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, T ReadOnly); } -THC_API void -THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { THCTensor_copyIgnoringOverlaps(state, dst, src); } #define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA, SCALARC) \ - THC_API 
void \ - THCTensor_(copyCuda##TYPEC)(THCState *state, \ - THCTensor *self, \ - THCuda##TYPECUDA##Tensor *src) { \ + void THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + THCuda##TYPECUDA##Tensor *src) { \ THC_copyTensor(state, self, src); \ } diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu index f7e3e3f32a9a18..684ce31b141f79 100644 --- a/aten/src/THC/generic/THCTensorMasked.cu +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -3,9 +3,8 @@ #else -THC_API void -THCTensor_(maskedFill)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) +void THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask)); THArgCheck(THCTensor_(nElement)(state, tensor) == @@ -20,9 +19,8 @@ THCTensor_(maskedFill)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedFillByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, scalar_t value) +void THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); @@ -31,9 +29,8 @@ THCTensor_(maskedFillByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedCopy)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +void THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); @@ -98,9 +95,8 @@ THCTensor_(maskedCopy)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedCopyByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, THCTensor *src) { +void THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); THCudaByteTensor_copyByte(state, maskCuda, mask); @@ -108,9 +104,8 @@ THCTensor_(maskedCopyByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedSelect)(THCState* state, - THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { +void THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); THArgCheck(THCudaByteTensor_nElement(state, mask) == THCTensor_(nElement)(state, src), @@ -171,9 +166,8 @@ THCTensor_(maskedSelect)(THCState* state, } // FIXME: remove now that we have THCudaByteTensor? 
-THC_API void -THCTensor_(maskedSelectByte)(THCState* state, - THCTensor *tensor, THCTensor *src, THByteTensor *mask) +void THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 9ffe626dd8425f..c4f7afb6a227b9 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMath.cu" #else -THC_API void -THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) +void THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -15,8 +14,7 @@ THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zero)(THCState *state, THCTensor *self_) +void THCTensor_(zero)(THCState *state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); if (THCTensor_(isContiguous)(state, self_)) { @@ -35,16 +33,14 @@ THCTensor_(zero)(THCState *state, THCTensor *self_) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); THCTensor_(zero)(state, r_); } -THC_API void -THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 45364c7574cad4..d83b8ff929b4f6 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -5,8 +5,7 @@ #define ERROR_ONLY_FP_TYPES(func) \ THError("%s for CUDA tensors only supports floating-point types. 
Try converting the tensors with .float()", func); -THC_API accreal -THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -43,8 +42,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) #endif } -THC_API void -THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) +void THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); @@ -151,8 +149,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) +void THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); @@ -249,8 +246,7 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, sc #endif } -THC_API void -THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) +void THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) @@ -413,9 +409,8 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 2, 4, "expected 2D tensor"); @@ -478,9 +473,8 @@ __global__ void createBatchGemmBuffer3(const scalar_t** buffer1, const scalar_t } } -THC_API void -THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 3, 4, "expected 3D tensor"); @@ -745,7 +739,7 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor #endif } -THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) +void THCTensor_(btrifact)(THCState 
*state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); @@ -852,8 +846,8 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens } -THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, - THCTensor *atf, THCudaIntTensor *pivots) +void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, + THCTensor *atf, THCudaIntTensor *pivots) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b)); diff --git a/aten/src/THC/generic/THCTensorMathCompare.cu b/aten/src/THC/generic/THCTensorMathCompare.cu index 0a0041ab9e4784..3c8e8ce0c325a7 100644 --- a/aten/src/THC/generic/THCTensorMathCompare.cu +++ b/aten/src/THC/generic/THCTensorMathCompare.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" #else -THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -10,7 +10,7 @@ THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -18,7 +18,7 @@ THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -26,7 +26,7 @@ THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -34,7 +34,7 @@ THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -42,7 +42,7 @@ THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -50,7 +50,7 @@ THC_API void THCTensor_(neValue)(THCState *state, 
THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -58,7 +58,7 @@ THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -66,7 +66,7 @@ THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -74,7 +74,7 @@ THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -82,7 +82,7 @@ THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -90,7 +90,7 @@ THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, diff --git a/aten/src/THC/generic/THCTensorMathCompareT.cu b/aten/src/THC/generic/THCTensorMathCompareT.cu index 6397a0b7caaa96..1bd4b9909fc26d 100644 --- a/aten/src/THC/generic/THCTensorMathCompareT.cu +++ b/aten/src/THC/generic/THCTensorMathCompareT.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu" #else -THC_API void -THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -11,8 +10,7 @@ THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ 
-20,8 +18,7 @@ THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -29,8 +26,7 @@ THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -38,8 +34,7 @@ THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -47,8 +42,7 @@ THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -56,8 +50,7 @@ THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -65,8 +58,7 @@ THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -74,8 +66,7 @@ THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -83,8 +74,7 @@ THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -92,8 +82,7 @@ 
THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -101,8 +90,7 @@ THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index ecf39d9a1bf0f2..29c7999f74a9b6 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -60,7 +60,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T } -THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -98,8 +98,8 @@ THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, - const char *uplo, const char *trans, const char *diag) +void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -135,7 +135,7 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, #endif } -THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -182,7 +182,7 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) { #ifdef USE_MAGMA int64_t n = THTensor_sizeLegacyNoScalars(a, 0); @@ -247,7 +247,7 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) { #ifdef USE_MAGMA THArgCheck(a_->dim() == 2, 3, "A should be 2 dimensional"); @@ -321,7 +321,7 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor 
*a, const char *jobu) +void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) { #ifdef USE_MAGMA THCTensor *ra_ = THCTensor_(new)(state); @@ -332,7 +332,7 @@ THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -396,7 +396,7 @@ THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) { THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -524,7 +524,7 @@ __global__ void THCTensor_(copyLowerSymmetric)(scalar_t *input, int n, int len) } } -THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -564,7 +564,7 @@ THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); @@ -600,7 +600,7 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -632,7 +632,7 @@ THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, TH #endif } -THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) +void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -669,7 +669,7 @@ THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_ #endif } -THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 06720b358df985..0f2d0067ff5546 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" #else -THC_API void -THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t 
value) +void THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -21,8 +20,7 @@ THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -40,20 +38,17 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { THCTensor_(add)(state, self_, src_, value * alpha); } -THC_API void -THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { THCTensor_(sub)(state, self_, src_, value * alpha); } -THC_API void -THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -71,8 +66,7 @@ THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); @@ -92,8 +86,7 @@ THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, value)); @@ -116,8 +109,7 @@ THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, -value)); @@ -140,8 +132,7 @@ THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -159,8 +150,7 @@ THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void 
THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -235,7 +225,7 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ THCudaCheck(cudaGetLastError()); } -THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (!THCTensor_(isSameSizeAs(state, self_, src_))) { @@ -259,8 +249,7 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_ return min != 0; } -THC_API void -THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitand only supported for integer type tensors"); @@ -281,8 +270,7 @@ THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitor only supported for integer type tensors"); @@ -303,8 +291,7 @@ THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t v #endif } -THC_API void -THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitxor only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index f5e1443b0eb5e7..4ff836fd53dda6 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -108,8 +108,7 @@ void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, scalar THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) +void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); @@ -211,8 +210,7 @@ void THCTensor_(polygamma)(THCState* state, THCTensor* self_, int64_t n, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) +void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b)); THArgCheck(THCTensor_(nElement)(state, a) == @@ -235,8 +233,7 @@ c10::intrusive_ptr retainTensorImpl(THC } } -THC_API void -THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +void THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF @@ -247,8 +244,7 @@ THCTensor_(cadd)(THCState *state, THCTensor *self_, 
THCTensor* src1, scalar_t va at::add_out(out, retainTensorImpl(src1), retainTensorImpl(src2), alpha); } -THC_API void -THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +void THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF @@ -259,15 +255,13 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t va at::sub_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2)), alpha); } -THC_API void -THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -374,15 +368,13 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, scalar_t value, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { auto out = at::Tensor(retainTensorImpl(self_)); at::div_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("clshift not supported for torch.CudaHalfTensor"); @@ -409,8 +401,7 @@ THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("crshift not supported for torch.CudaHalfTensor"); @@ -437,8 +428,7 @@ THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -456,8 +446,7 @@ THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -475,8 +464,7 @@ THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, 
THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -494,8 +482,7 @@ THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTen } } -THC_API void -THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -513,8 +500,7 @@ THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor * } } -THC_API void -THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -530,8 +516,7 @@ THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -547,8 +532,7 @@ THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -572,8 +556,7 @@ THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -596,8 +579,7 @@ THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitand is only supported for integer type tensors"); @@ -624,8 +606,7 @@ THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); @@ -652,8 +633,7 @@ THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor #endif } -THC_API void -THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || 
defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 91319745b6a1de..009ac39f346db9 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu" #else -THC_API void -THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -18,8 +17,7 @@ THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -34,8 +32,7 @@ THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) +void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); const accreal size = scalar_cast(THCTensor_(size)(state, src, dim)); @@ -54,8 +51,7 @@ THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void -THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t maxnorm) +void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t maxnorm) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); THCTensor *self_; @@ -88,8 +84,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t va THCTensor_(free)(state, data); } -THC_API void -THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -116,8 +111,7 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API void -THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -144,15 +138,13 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API accreal -THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) +accreal THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCNumerics::sqrt((THCTensor_(varall)(state, self, biased))); } -THC_API accreal -THCTensor_(varall)(THCState *state, THCTensor *self, int biased) 
+accreal THCTensor_(varall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal mean = THCTensor_(meanall)(state, self); @@ -175,8 +167,7 @@ THCTensor_(varall)(THCState *state, THCTensor *self, int biased) return val; } -THC_API void -THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) +void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -220,8 +211,7 @@ THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _val THCudaCheck(cudaGetLastError()); } -THC_API accreal -THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) +accreal THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); @@ -294,8 +284,7 @@ accreal THCTensor_(dist)(THCState *state, THCTensor *self, #endif -THC_API accreal -THCTensor_(sumall)(THCState *state, THCTensor *self) { +accreal THCTensor_(sumall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -310,8 +299,7 @@ THCTensor_(sumall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(prodall)(THCState *state, THCTensor *self) { +accreal THCTensor_(prodall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -326,15 +314,13 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(meanall)(THCState *state, THCTensor *self) +accreal THCTensor_(meanall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self); } -THC_API scalar_t -THCTensor_(minall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(minall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -348,8 +334,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(maxall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(maxall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -363,8 +348,7 @@ THCTensor_(maxall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(medianall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(medianall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); scalar_t val; @@ -391,13 +375,12 @@ THCTensor_(medianall)(THCState *state, THCTensor *self) { return val; } -THC_API void -THCTensor_(median)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *self, - int dimension, - int keepdim) { +void THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *self, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); int64_t t_size_dim, k; @@ -433,13 +416,12 @@ THCTensor_(median)(THCState *state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(max)(THCState 
*state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair @@ -452,13 +434,12 @@ THCTensor_(max)(THCState *state, MaxValuePair()); } -THC_API void -THCTensor_(min)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 274093ef105ae5..db72921bcfd1b0 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -2,13 +2,13 @@ #define THC_GENERIC_FILE "generic/THCTensorMode.cu" #else -THC_API void THCTensor_(calculateMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position) { +void THCTensor_(calculateMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position) { THAssert(THCTensor_(isContiguous)(state, input)); // Because the input is contiguous, we want to get a reference to the @@ -129,14 +129,14 @@ THC_API void THCTensor_(calculateMode)(THCState *state, } // this probably could be a loop, not a recursive algorithm -THC_API void THCTensor_(dimApplyMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position, - int curDim) { +void THCTensor_(dimApplyMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position, + int curDim) { int64_t ndim = THCTensor_(nDimensionLegacyAll)(state, input); // Because we have transposed the Tensor, the data for the dimension we are mode'ing along @@ -155,12 +155,12 @@ THC_API void THCTensor_(dimApplyMode)(THCState *state, #define MAX_GRID_SIZE 65535 #define MAX_BLOCK_SIZE 1024 -THC_API void THCTensor_(mode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - int dimension, - int keepdim) { +void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim) { THCTensor *transposed, *contiguous, *valuesTransposed; THLongStorage *position; THCudaLongStorage *sortBuffer; diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 620c73e9af01d3..4cbc6dd1a29999 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -6,7 +6,7 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) +void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -21,7 +21,7 @@ THC_API void THCTensor_(uniform)(THCState* 
state, THCTensor *self_, double a, do THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -36,13 +36,13 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { +void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, stddev); THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) +void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, stddevs); THCTensor_(normal)(state, self, 0, 1); @@ -50,7 +50,7 @@ THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double THCTensor_(add)(state, self, self, ScalarConvert::to(mean)); } -THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) +void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, 1); @@ -58,7 +58,7 @@ THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -75,7 +75,7 @@ THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) +void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -91,7 +91,7 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) +void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -128,11 +128,11 @@ void THCTensor_(renormRows)(struct THCState* state, rows, cols); } -THC_API void THCTensor_(multinomial)(struct THCState *state, - THCudaLongTensor *self, - THCTensor *prob_dist, - int n_sample, - int with_replacement) +void THCTensor_(multinomial)(struct THCState *state, + THCudaLongTensor *self, + THCTensor *prob_dist, + int n_sample, + int with_replacement) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -299,7 +299,7 @@ THC_API void THCTensor_(multinomial)(struct THCState *state, } } -THC_API void 
THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THAssert(THCTensor_(isContiguous)(state, _probs)); @@ -354,7 +354,7 @@ THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_prob THCudaLongTensor_free(state, larger_short); } -THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -388,7 +388,7 @@ GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_do GENERATE_KERNEL1(generate_bernoulli, scalar_t, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) #endif -THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) +void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -413,7 +413,7 @@ void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p } #define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ -THC_API void THCTensor_(NAME)(THCState* state, \ +void THCTensor_(NAME)(THCState* state, \ THCTensor *self_, PROB_TYPE *probs_) \ { \ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ @@ -458,7 +458,7 @@ GENERATE_KERNEL2(generate_random, scalar_t, int32_t base, uint32_t range, uint32 static_cast(static_cast(x % range + base))) #endif -THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) +void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -474,7 +474,7 @@ THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) +void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) { THArgCheck(min_val < max_val, 2, "max must be greater than min, but got: min = %lld, max = %lld", min_val, max_val); @@ -502,14 +502,14 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_ THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) +void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) { THCTensor_(clampedRandom)(state, self_, 0LL, max_val); }; #define HLF_MANT_DIG 11 -THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) +void THCTensor_(random)(THCState* state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index 0dca009b80ab72..d60fa0c4da96fb 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -5,10 +5,10 @@ // 
In alignment with default sort on a c++ map, this function // will permute key and value tensors identically, and // in such a way that the 'key' tensor is ordered numerically -THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, - THCTensor* key, - THCudaLongTensor* value, - int dim, bool dir) { +void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* key, + THCudaLongTensor* value, + int dim, bool dir) { THArgCheck(key->sizes().equals(value->sizes()), 2, "Key tensor must have same size as value tensor"); int dims = THCudaLongTensor_nDimensionLegacyNoScalars(state, value); @@ -274,11 +274,11 @@ void THCTensor_(sortViaThrust)(THCState* state, THCudaLongTensor_freeCopyTo(state, trContigIndices, indices); } -THC_API void THCTensor_(sort)(THCState* state, - THCTensor *sorted, - THCudaLongTensor *indices, - THCTensor *input, - int dim, int order) { +void THCTensor_(sort)(THCState* state, + THCTensor *sorted, + THCudaLongTensor *indices, + THCTensor *input, + int dim, int order) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input)); THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); int64_t dims = THCTensor_(nDimensionLegacyNoScalars)(state, sorted); diff --git a/aten/src/THC/generic/THCTensorSort.h b/aten/src/THC/generic/THCTensorSort.h index 009d825a223975..eba93fb75773c3 100644 --- a/aten/src/THC/generic/THCTensorSort.h +++ b/aten/src/THC/generic/THCTensorSort.h @@ -4,10 +4,17 @@ /* Performs an in-place sort of (keys, values). Only works for slice sizes <= 2048 at the moment (slice size == size of keys/values dim `dim`) */ +#ifdef __cplusplus +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* keys, + THCudaLongTensor* values, + int dim, bool dir); +#else THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, THCTensor* keys, THCudaLongTensor* values, int dim, int order); +#endif /* Performs an out-of-place sort of `input`, returning the per-slice indices in `indices` and the sorted values in `sorted` */ diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index 71ee008659b12a..a195dfbe5ca7a8 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -2,11 +2,11 @@ #define THC_GENERIC_FILE "generic/THCTensorTopK.cu" #else -THC_API void THCTensor_(topk)(THCState* state, - THCTensor *topK, - THCudaLongTensor *indices, - THCTensor *input_, - int64_t k, int dim, int dir, int sorted) { +void THCTensor_(topk)(THCState* state, + THCTensor *topK, + THCudaLongTensor *indices, + THCTensor *input_, + int64_t k, int dim, int dir, int sorted) { THAssert(topK != NULL && indices != NULL && input_ != NULL); THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); From a8b1755de6c967bd8e9d777b6a0dddf4e7a6eed1 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 08:15:19 -0700 Subject: [PATCH 183/237] Check device argument makes sense for legacy tensor constructors. (#11669) Summary: Fixes: https://github.com/pytorch/pytorch/issues/11427. 
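A minimal sketch (not part of the patch itself) of the behavior this change enforces, mirroring the new tests added to test_torch.py below: passing a device of the wrong type to a legacy constructor now raises instead of being silently ignored.

    import torch

    # CPU legacy constructor given a CUDA device: the new check on legacy
    # tensor construction rejects the mismatched device type with a RuntimeError.
    try:
        torch.FloatTensor(device='cuda')
    except RuntimeError as e:
        print(e)  # device type passed must match the constructor's device type
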
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11669 Differential Revision: D9817881 Pulled By: gchanan fbshipit-source-id: 77dc5b0e6bc9884d2616210b96c07e4734058bb6 --- aten/src/ATen/DeviceGuard.h | 7 ++++ test/test_sparse.py | 29 +++++++++++++++ test/test_torch.py | 32 +++++++++++++++++ torch/csrc/utils/tensor_new.cpp | 63 ++++++++++++++++++++++++++------- 4 files changed, 118 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index b51d80d22d350f..4da336aef5b7cd 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,12 @@ struct DeviceGuard { } } + explicit DeviceGuard(optional device_opt) { + if (device_opt.has_value() && device_opt.value().is_cuda()) { + set_index(device_opt.value().index()); + } + } + /// Calls `set_index` with the given index. explicit DeviceGuard(int32_t index) { set_index(index); diff --git a/test/test_sparse.py b/test/test_sparse.py index 0b3ff39ab48301..82de45d6d17384 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -1176,6 +1176,35 @@ def test_factory_copy(self): self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + @cpu_only # just run once, we test both cpu and cuda + def test_constructor_device_legacy(self): + i = torch.tensor([[0, 1, 1], [2, 0, 2]]) + v = torch.tensor([3., 4., 5.]) + size = torch.Size([2, 3]) + + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(i, v, device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(i, v, size, device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(torch.Size([2, 3, 4]), device='cuda')) + + x = torch.sparse_coo_tensor(i, v, size, device='cpu') + self.assertRaises(RuntimeError, lambda: x.new(device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, size, device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cuda')) + + if torch.cuda.is_available(): + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(i, v, device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(i, v, size, device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(torch.Size([2, 3, 4]), device='cpu')) + + x = torch.sparse_coo_tensor(i, v, size, device='cuda') + self.assertRaises(RuntimeError, lambda: x.new(device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(i, v, size, device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cpu')) + @cpu_only # not really, but we only really want to run this once def test_dtypes(self): all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.float16] diff --git a/test/test_torch.py b/test/test_torch.py index b90e0631d4f470..cf938d4d0f50d7 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2329,6 +2329,38 @@ def test_constructor_dtypes(self): torch.set_default_tensor_type(default_type) + def test_constructor_device_legacy(self): + 
self.assertRaises(RuntimeError, lambda: torch.FloatTensor(device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.FloatTensor(torch.Size([2, 3, 4]), device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.FloatTensor((2.0, 3.0), device='cuda')) + + self.assertRaises(RuntimeError, lambda: torch.Tensor(device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.Tensor(torch.Size([2, 3, 4]), device='cuda')) + self.assertRaises(RuntimeError, lambda: torch.Tensor((2.0, 3.0), device='cuda')) + + x = torch.randn((3,), device='cpu') + self.assertRaises(RuntimeError, lambda: x.new(device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cuda')) + self.assertRaises(RuntimeError, lambda: x.new((2.0, 3.0), device='cuda')) + + if torch.cuda.is_available(): + self.assertRaises(RuntimeError, lambda: torch.cuda.FloatTensor(device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.FloatTensor(torch.Size([2, 3, 4]), device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.cuda.FloatTensor((2.0, 3.0), device='cpu')) + + default_type = torch.Tensor().type() + torch.set_default_tensor_type(torch.cuda.FloatTensor) + self.assertRaises(RuntimeError, lambda: torch.Tensor(device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.Tensor(torch.Size([2, 3, 4]), device='cpu')) + self.assertRaises(RuntimeError, lambda: torch.Tensor((2.0, 3.0), device='cpu')) + torch.set_default_tensor_type(torch.cuda.FloatTensor) + torch.set_default_tensor_type(default_type) + + x = torch.randn((3,), device='cuda') + self.assertRaises(RuntimeError, lambda: x.new(device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cpu')) + self.assertRaises(RuntimeError, lambda: x.new((2.0, 3.0), device='cpu')) + def test_type(self): x = torch.randn(3, 3).double() self.assertEqual(x.type('torch.FloatTensor').dtype, torch.float32) diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 3c3ab4384dc85f..ebed9c07fad7a7 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -250,6 +250,15 @@ Tensor legacy_new_from_sequence(const Type & type, at::optional device, return legacy_new_from_data(type, device, data); } +void check_legacy_ctor_device(const Type& type, at::optional device) { + if (device.has_value()) { + AT_CHECK(type.device_type() == device.value().type(), + "legacy constructor for device type: ", type.device_type(), + " was passed device type: ", device.value().type(), + ", but device type must be: ", type.device_type()); + } +} + Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { static PythonArgParser parser({ "new(*, Device? 
device=None)", @@ -261,22 +270,30 @@ Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwa ParsedArgs<4> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); return at::empty({0}, type.options(r.device(0).index())); } else if (r.idx == 1) { auto cdata = reinterpret_cast(r.toInt64(0)); return type.unsafeTensorFromTH(cdata, true); } else if (r.idx == 2) { - at::DeviceGuard device_guard(r.device(2)); + auto deviceOptional = r.deviceOptional(2); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); } else if (r.idx == 3) { - at::DeviceGuard device_guard(r.device(3)); + auto deviceOptional = r.deviceOptional(3); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } @@ -294,27 +311,35 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar ParsedArgs<5> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - at::DeviceGuard device_guard(r.device(0)); + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.tensor(); } else if (r.idx == 1) { - auto cdata = reinterpret_cast(r.device(0).index()); + auto cdata = reinterpret_cast(r.toInt64(0)); return type.unsafeTensorFromTH(cdata, true); } else if (r.idx == 2) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). - at::DeviceGuard device_guard(r.device(2)); + auto deviceOptional = r.deviceOptional(2); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); } else if (r.idx == 3) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). 
- at::DeviceGuard device_guard(r.device(3)); + auto deviceOptional = r.deviceOptional(3); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } @@ -346,7 +371,9 @@ Tensor legacy_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - at::DeviceGuard device_guard(r.device(0)); + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.tensor(); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); @@ -357,14 +384,18 @@ Tensor legacy_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { return new_with_tensor(type, r.tensor(0)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } else if (r.idx == 5) { - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } throw std::runtime_error("new(): invalid arguments"); } @@ -386,7 +417,9 @@ Tensor legacy_tensor_new(const Type& type, PyObject* args, PyObject* kwargs) { ParsedArgs<3> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - at::DeviceGuard device_guard(r.device(0)); + auto deviceOptional = r.deviceOptional(0); + check_legacy_ctor_device(type, deviceOptional); + at::DeviceGuard device_guard(deviceOptional); return type.tensor(); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); @@ -397,13 +430,17 @@ Tensor legacy_tensor_new(const Type& type, PyObject* args, PyObject* kwargs) { return new_with_tensor(type, r.tensor(0)); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); if (!THPSize_Check(arg) && PyTuple_GET_SIZE(args) >= 1 && arg == PyTuple_GET_ITEM(args, 0)) { // new(sequence) binds to this signature but should be treated differently // unless the sequences is a torch.Size - return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); + return legacy_new_from_sequence(type, deviceOptional, r.pyobject(0)); } return new_with_sizes(type, r.device(1).index(), r.intlist(0)); } 
else if (r.idx == 5) { + auto deviceOptional = r.deviceOptional(1); + check_legacy_ctor_device(type, deviceOptional); return legacy_new_from_sequence(type, r.deviceOptional(1), r.pyobject(0)); } throw std::runtime_error("new(): invalid arguments"); From 5bfd8f583c7edb9804a9988de53a93970f2ef055 Mon Sep 17 00:00:00 2001 From: Jesse Hellemn Date: Mon, 17 Sep 2018 08:42:03 -0700 Subject: [PATCH 184/237] Moving copy of Caffe2 protos back to build_pytorch_libs.sh (#11726) Summary: This way it shows up in all current and future setup.py commands, as otherwise we'd have to override every once to have them all call copy_protos. This is needed because the nightly packages still do not include caffe2_pb2, because setup.py bdist does not go through setup.py install or setup.py develop Pull Request resolved: https://github.com/pytorch/pytorch/pull/11726 Reviewed By: orionr Differential Revision: D9844075 Pulled By: pjh5 fbshipit-source-id: 57b469e48010aacd0c08c214ba8a7e5d757feefa --- setup.py | 7 ++++++- tools/build_pytorch_libs.sh | 32 +++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 533b6dee4fcc53..ccfced5f35bc83 100644 --- a/setup.py +++ b/setup.py @@ -413,6 +413,7 @@ def build_libs(libs): # protobuf python compiler) from the build folder to the root folder # cp root/build/caffe2/proto/proto.py root/caffe2/proto/proto.py def copy_protos(): + print('setup.py::copy_protos()') for src in glob.glob( os.path.join(caffe2_build_dir, 'caffe2', 'proto', '*.py')): dst = os.path.join( @@ -423,6 +424,7 @@ def copy_protos(): # Build all dependent libraries class build_deps(PytorchCommand): def run(self): + print('setup.py::build_deps::run()') # Check if you remembered to check out submodules def check_file(f): if not os.path.exists(f): @@ -507,6 +509,7 @@ def run(self): class build_module(PytorchCommand): def run(self): + print('setup.py::build_module::run()') self.run_command('build_py') self.run_command('build_ext') @@ -514,6 +517,7 @@ def run(self): class build_py(setuptools.command.build_py.build_py): def run(self): + print('setup.py::build_py::run()') self.run_command('create_version_file') setuptools.command.build_py.build_py.run(self) @@ -521,6 +525,7 @@ def run(self): class develop(setuptools.command.develop.develop): def run(self): + print('setup.py::develop::run()') self.run_command('create_version_file') setuptools.command.develop.develop.run(self) self.create_compile_commands() @@ -727,9 +732,9 @@ def run(self): class install(setuptools.command.install.install): def run(self): + print('setup.py::run()') if not self.skip_build: self.run_command('build_deps') - copy_protos() setuptools.command.install.install.run(self) diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 79dfc948ea07b2..37d816775f75b2 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -323,15 +323,26 @@ function build_caffe2() { ${CMAKE_INSTALL} -j"$MAX_JOBS" + # Install Python proto files + if [[ "$BUILD_PYTHON" == 'ON' ]]; then + echo "Copying Caffe2 proto files from $(pwd)/caffe2/proto to $(cd .. 
&& pwd)/caffe2/proto" + echo "All the files in caffe2/proto are $(find caffe2/proto)" + for proto_file in $(pwd)/caffe2/proto/*.py; do + cp $proto_file "$(pwd)/../caffe2/proto/" + done + fi + + # Fix rpaths of shared libraries if [[ $(uname) == 'Darwin' ]]; then - # root/torch/lib/tmp_install/lib - pushd "$INSTALL_DIR/lib" - for lib in *.dylib; do - echo "Updating install_name for $lib" - install_name_tool -id @rpath/$lib $lib - done - popd + # root/torch/lib/tmp_install/lib + echo "Updating all install_names in $INSTALL_DIR/lib" + pushd "$INSTALL_DIR/lib" + for lib in *.dylib; do + echo "Updating install_name for $(pwd)/$lib" + install_name_tool -id @rpath/$lib $lib + done + popd fi } @@ -373,14 +384,21 @@ pushd $TORCH_LIB_DIR # If all the builds succeed we copy the libraries, headers, # binaries to torch/lib +echo "tools/build_pytorch_libs.sh succeeded at $(date)" +echo "removing $INSTALL_DIR/lib/cmake and $INSTALL_DIR/lib/python" rm -rf "$INSTALL_DIR/lib/cmake" rm -rf "$INSTALL_DIR/lib/python" + +echo "Copying $INSTALL_DIR/lib to $(pwd)" $SYNC_COMMAND -r "$INSTALL_DIR/lib"/* . if [ -d "$INSTALL_DIR/lib64/" ]; then $SYNC_COMMAND -r "$INSTALL_DIR/lib64"/* . fi +echo "Copying $(cd ../.. && pwd)/aten/src/generic/THNN.h to $(pwd)" $SYNC_COMMAND ../../aten/src/THNN/generic/THNN.h . $SYNC_COMMAND ../../aten/src/THCUNN/generic/THCUNN.h . + +echo "Copying $INSTALL_DIR/include to $(pwd)" $SYNC_COMMAND -r "$INSTALL_DIR/include" . if [ -d "$INSTALL_DIR/bin/" ]; then $SYNC_COMMAND -r "$INSTALL_DIR/bin/"/* . From 0d345cfa181970d255ac9e14825b7ebf6cf2ce13 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 08:42:49 -0700 Subject: [PATCH 185/237] Remove Type method defaults in ATen. (#11675) Summary: This will allow us to break the dependency cycle between Tensor and Type, because currently Type has defaulted Tensor (reference) arguments. 
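An illustrative sketch (not part of this patch) of why the defaulted Tensor reference arguments create that dependency: a default such as `= {}` on a `const Tensor &` parameter has to construct a Tensor at the point of declaration, so Type.h needs the complete Tensor definition; with the default removed, a forward declaration suffices. The names below mirror Type.h, but the snippet is a standalone toy, not the real header.

    #include <cstdint>

    struct Tensor;  // forward declaration is all Type needs once defaults are gone

    struct Type {
      // With a default, e.g.
      //   virtual Tensor bincount(const Tensor& self,
      //                           const Tensor& weights = {},   // must build a Tensor here
      //                           int64_t minlength = 0) const = 0;
      // the `{}` is checked at the declaration, so Tensor must be a complete type
      // wherever Type.h is parsed -- that is the Tensor <-> Type cycle.
      //
      // Without the default, the declaration compiles against the forward
      // declaration above:
      virtual Tensor bincount(const Tensor& self,
                              const Tensor& weights,
                              int64_t minlength) const = 0;

      virtual ~Type() = default;
    };

The defaults themselves are expected to remain on the user-facing declarations; only the virtual Type interface drops them, which is what the function_wrapper.py change below (removing `type_method_formals_with_defaults`) suggests.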
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11675 Reviewed By: ezyang Differential Revision: D9819720 Pulled By: gchanan fbshipit-source-id: a9577ac34a358120075129ab0654e7862d1dace6 --- aten/src/ATen/core/Type.h | 244 +++++++++++++++--------------- aten/src/ATen/function_wrapper.py | 15 +- 2 files changed, 128 insertions(+), 131 deletions(-) diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index dabd19c7837223..dea7844e93debb 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -167,7 +167,7 @@ struct AT_API Type { virtual int64_t storage_offset(const Tensor & self) const = 0; virtual Tensor & resize_(Tensor & self, IntList size) const = 0; virtual Tensor & set_(Tensor & self, Storage source) const = 0; - virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride={}) const = 0; + virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride) const = 0; virtual Tensor & set_(Tensor & self, const Tensor & source) const = 0; virtual Tensor & set_(Tensor & self) const = 0; virtual bool is_contiguous(const Tensor & self) const = 0; @@ -185,7 +185,7 @@ struct AT_API Type { virtual Tensor view(const Tensor & self, IntList size) const = 0; virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const = 0; virtual Tensor take(const Tensor & self, const Tensor & index) const = 0; - virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate=false) const = 0; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) const = 0; virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const = 0; @@ -269,8 +269,8 @@ struct AT_API Type { virtual Tensor max(const Tensor & self, const Tensor & other) const = 0; virtual Tensor max(const Tensor & self) const = 0; virtual Tensor median(const Tensor & self) const = 0; - virtual std::tuple sort(const Tensor & self, int64_t dim=-1, bool descending=false) const = 0; - virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const = 0; + virtual std::tuple sort(const Tensor & self, int64_t dim, bool descending) const = 0; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) const = 0; virtual Tensor all(const Tensor & self) const = 0; virtual Tensor any(const Tensor & self) const = 0; virtual Tensor lgamma(const Tensor & self) const = 0; @@ -285,8 +285,8 @@ struct AT_API Type { virtual Tensor frac(const Tensor & self) const = 0; virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; - virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; - virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p=2) const = 0; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; + virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; virtual Tensor reciprocal(const Tensor & self) const = 0; virtual Tensor & reciprocal_(Tensor & self) const = 
0; virtual Tensor neg(const Tensor & self) const = 0; @@ -305,7 +305,7 @@ struct AT_API Type { virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; - virtual Tensor histc(const Tensor & self, int64_t bins=100, Scalar min=0, Scalar max=0) const = 0; + virtual Tensor histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) const = 0; virtual Tensor sign(const Tensor & self) const = 0; virtual Tensor & sign_(Tensor & self) const = 0; virtual Tensor trace(const Tensor & self) const = 0; @@ -321,71 +321,71 @@ struct AT_API Type { virtual Tensor & remainder_(Tensor & self, Scalar other) const = 0; virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const = 0; virtual Tensor & remainder_(Tensor & self, const Tensor & other) const = 0; - virtual Tensor tril(const Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor & tril_(Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor triu(const Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor & triu_(Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim=-1) const = 0; - virtual Tensor diag(const Tensor & self, int64_t diagonal=0) const = 0; - virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; - virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const = 0; + virtual Tensor tril(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & tril_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor triu(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & triu_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) const = 0; + virtual Tensor diag(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, 
const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; virtual std::tuple gels(const Tensor & self, const Tensor & A) const = 0; - virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const = 0; - virtual std::tuple symeig(const Tensor & self, bool eigenvectors=false, bool upper=true) const = 0; - virtual std::tuple eig(const Tensor & self, bool eigenvectors=false) const = 0; - virtual std::tuple svd(const Tensor & self, bool some=true) const = 0; - virtual Tensor potrf(const Tensor & self, bool upper=true) const = 0; - virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper=true) const = 0; - virtual Tensor potri(const Tensor & self, bool upper=true) const = 0; - virtual std::tuple pstrf(const Tensor & self, bool upper=true, Scalar tol=-1) const = 0; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) const = 0; + virtual std::tuple symeig(const Tensor & self, bool eigenvectors, bool upper) const = 0; + virtual std::tuple eig(const Tensor & self, bool eigenvectors) const = 0; + virtual std::tuple svd(const Tensor & self, bool some) const = 0; + virtual Tensor potrf(const Tensor & self, bool upper) const = 0; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) const = 0; + virtual Tensor potri(const Tensor & self, bool upper) const = 0; + virtual std::tuple pstrf(const Tensor & self, bool upper, Scalar tol) const = 0; virtual std::tuple qr(const Tensor & self) const = 0; virtual std::tuple geqrf(const Tensor & self) const = 0; virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0; - virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const = 0; - virtual std::tuple btrifact(const Tensor & self, bool pivot=true) const = 0; - virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot=true) const = 0; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) const = 0; + virtual std::tuple btrifact(const Tensor & self, bool pivot) const = 0; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot) const = 0; virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const = 0; - virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator=nullptr) const = 0; - virtual Tensor & random_(Tensor & self, int64_t to, Generator * 
generator=nullptr) const = 0; - virtual Tensor & random_(Tensor & self, Generator * generator=nullptr) const = 0; - virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const = 0; - virtual Tensor & uniform_(Tensor & self, double from=0, double to=1, Generator * generator=nullptr) const = 0; - virtual Tensor & normal_(Tensor & self, double mean=0, double std=1, Generator * generator=nullptr) const = 0; - virtual Tensor & cauchy_(Tensor & self, double median=0, double sigma=1, Generator * generator=nullptr) const = 0; - virtual Tensor & log_normal_(Tensor & self, double mean=1, double std=2, Generator * generator=nullptr) const = 0; - virtual Tensor & exponential_(Tensor & self, double lambd=1, Generator * generator=nullptr) const = 0; - virtual Tensor & geometric_(Tensor & self, double p, Generator * generator=nullptr) const = 0; - virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride={}) const = 0; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, Generator * generator) const = 0; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) const = 0; + virtual Tensor & uniform_(Tensor & self, double from, double to, Generator * generator) const = 0; + virtual Tensor & normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & cauchy_(Tensor & self, double median, double sigma, Generator * generator) const = 0; + virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 0; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride) const = 0; virtual Tensor tensor(IntList size, IntList stride) const = 0; virtual Tensor abs(const Tensor & self) const = 0; virtual Tensor & abs_(Tensor & self) const = 0; virtual Tensor acos(const Tensor & self) const = 0; virtual Tensor & acos_(Tensor & self) const = 0; - virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - virtual bool allclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; - virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - AT_DEPRECATED(virtual 
Tensor arange(Scalar start, Scalar end, Scalar step=1) const = 0); + virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; + virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim) const = 0; + AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step) const = 0); AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); - virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor argmax(const Tensor & self) const = 0; - virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor argmin(const Tensor & self) const = 0; virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0; virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0; @@ -395,19 +395,19 @@ struct AT_API Type { virtual Tensor & asin_(Tensor & self) const = 0; virtual Tensor atan(const Tensor & self) const = 0; virtual Tensor & atan_(Tensor & self) const = 0; - virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; - virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0; virtual Tensor bernoulli(const Tensor & self) const = 0; - virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator=nullptr) const = 0; - virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator=nullptr) const = 0; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator) const = 0; virtual Tensor & bernoulli_(Tensor & self) const = 0; - 
virtual Tensor bincount(const Tensor & self, const Tensor & weights={}, int64_t minlength=0) const = 0; + virtual Tensor bincount(const Tensor & self, const Tensor & weights, int64_t minlength) const = 0; virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; virtual Tensor ceil(const Tensor & self) const = 0; virtual Tensor & ceil_(Tensor & self) const = 0; - virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim=0) const = 0; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim) const = 0; virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const = 0; virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const = 0; virtual Tensor clamp_max(const Tensor & self, Scalar max) const = 0; @@ -423,8 +423,8 @@ struct AT_API Type { virtual Tensor cumprod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; virtual Tensor cumprod(const Tensor & self, int64_t dim) const = 0; virtual Tensor det(const Tensor & self) const = 0; - virtual Tensor diagflat(const Tensor & self, int64_t offset=0) const = 0; - virtual Tensor diagonal(const Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const = 0; + virtual Tensor diagflat(const Tensor & self, int64_t offset) const = 0; + virtual Tensor diagonal(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2) const = 0; virtual Tensor div(const Tensor & self, const Tensor & other) const = 0; virtual Tensor & div_(Tensor & self, const Tensor & other) const = 0; virtual Tensor div(const Tensor & self, Scalar other) const = 0; @@ -439,10 +439,10 @@ struct AT_API Type { virtual Tensor & exp_(Tensor & self) const = 0; virtual Tensor expm1(const Tensor & self) const = 0; virtual Tensor & expm1_(Tensor & self) const = 0; - virtual Tensor expand(const Tensor & self, IntList size, bool implicit=false) const = 0; + virtual Tensor expand(const Tensor & self, IntList size, bool implicit) const = 0; virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; - AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m=-1) const = 0); - virtual Tensor flatten(const Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) const = 0; + AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m) const = 0); + virtual Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim) const = 0; virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; virtual Tensor floor(const Tensor & self) const = 0; @@ -450,16 +450,16 @@ struct AT_API Type { AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; - virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; - virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized=false) const = 0; - virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true) const = 0; - virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const = 0; + virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool 
onesided) const = 0; + virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const = 0; virtual Tensor index(const Tensor & self, TensorList indices) const = 0; virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0; virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0; virtual Tensor inverse(const Tensor & self) const = 0; - virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const = 0; + virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; virtual bool is_cuda(const Tensor & self) const = 0; virtual bool is_distributed(const Tensor & self) const = 0; virtual bool is_floating_point(const Tensor & self) const = 0; @@ -468,8 +468,8 @@ struct AT_API Type { virtual bool is_same_size(const Tensor & self, const Tensor & other) const = 0; virtual bool is_signed(const Tensor & self) const = 0; virtual bool is_sparse(const Tensor & self) const = 0; - virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) const = 0; - AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps=100) const = 0); + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) const = 0; + AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps) const = 0); virtual Tensor log(const Tensor & self) const = 0; virtual Tensor & log_(Tensor & self) const = 0; virtual Tensor log10(const Tensor & self) const = 0; @@ -479,23 +479,23 @@ struct AT_API Type { virtual Tensor log2(const Tensor & self) const = 0; virtual Tensor & log2_(Tensor & self) const = 0; virtual Tensor logdet(const Tensor & self) const = 0; - AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps=100) const = 0); + AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps) const = 0); virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; - virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; virtual Tensor matrix_power(const Tensor & self, int64_t n) const = 0; - virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mean(const Tensor & self, ScalarType dtype) const = 0; virtual Tensor mean(const Tensor & self) const = 0; virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; - virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mean(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; - virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; - virtual std::tuple min(const Tensor & self, 
int64_t dim, bool keepdim=false) const = 0; - virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mm(const Tensor & self, const Tensor & mat2) const = 0; - virtual std::tuple mode(const Tensor & self, int64_t dim=-1, bool keepdim=false) const = 0; + virtual std::tuple mode(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor mul(const Tensor & self, const Tensor & other) const = 0; virtual Tensor & mul_(Tensor & self, const Tensor & other) const = 0; virtual Tensor mul(const Tensor & self, Scalar other) const = 0; @@ -507,13 +507,13 @@ struct AT_API Type { AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); virtual Tensor permute(const Tensor & self, IntList dims) const = 0; virtual Tensor pin_memory(const Tensor & self) const = 0; - virtual Tensor pinverse(const Tensor & self, double rcond=1e-15) const = 0; - AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator=nullptr) const = 0); - AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step=1) const = 0); + virtual Tensor pinverse(const Tensor & self, double rcond) const = 0; + AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step) const = 0); virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0; virtual Tensor reshape(const Tensor & self, IntList shape) const = 0; virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0; @@ -521,7 +521,7 @@ struct AT_API Type { virtual Tensor & round_(Tensor & self) const = 0; virtual Tensor relu(const Tensor & self) const = 0; virtual Tensor & relu_(Tensor & self) const = 0; - virtual Tensor hardshrink(const Tensor & self, Scalar lambd=0.5) const = 0; + virtual Tensor hardshrink(const Tensor & self, Scalar lambd) const = 0; virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0; virtual Tensor rsqrt(const Tensor & self) const = 0; virtual Tensor & rsqrt_(Tensor & self) const = 0; @@ -535,32 +535,32 @@ struct AT_API Type { virtual Tensor detach(const Tensor & self) const = 0; virtual Tensor & detach_(Tensor & self) const = 0; virtual int64_t size(const Tensor & self, int64_t dim) const = 0; - virtual Tensor slice(const Tensor & self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const = 0; + virtual Tensor slice(const 
Tensor & self, int64_t dim, int64_t start, int64_t end, int64_t step) const = 0; virtual std::tuple slogdet(const Tensor & self) const = 0; virtual Tensor smm(const Tensor & self, const Tensor & mat2) const = 0; virtual Tensor softmax(const Tensor & self, int64_t dim) const = 0; - virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim=0) const = 0; - virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim=0) const = 0; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim) const = 0; + virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim) const = 0; virtual Tensor squeeze(const Tensor & self) const = 0; virtual Tensor squeeze(const Tensor & self, int64_t dim) const = 0; virtual Tensor & squeeze_(Tensor & self) const = 0; virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0; - virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const = 0; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const = 0; virtual int64_t stride(const Tensor & self, int64_t dim) const = 0; virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0; virtual Tensor sum(const Tensor & self) const = 0; virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0; - virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim=false) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim) const = 0; virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0; virtual Tensor sqrt(const Tensor & self) const = 0; virtual Tensor & sqrt_(Tensor & self) const = 0; - virtual Tensor std(const Tensor & self, bool unbiased=true) const = 0; - virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor std(const Tensor & self, bool unbiased) const = 0; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0; virtual Tensor prod(const Tensor & self) const = 0; virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; - virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim) const = 0; virtual Tensor prod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; virtual Tensor t(const Tensor & self) const = 0; virtual Tensor & t_(Tensor & self) const = 0; @@ -571,29 +571,29 @@ struct AT_API Type { virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const = 0; virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const = 0; virtual Tensor flip(const Tensor & self, IntList dims) const = 0; - virtual Tensor rot90(const Tensor & self, int64_t k=1, IntList dims={0,1}) const = 0; + virtual Tensor rot90(const Tensor & self, int64_t k, IntList dims) const = 0; virtual Tensor trunc(const Tensor & self) const = 
0; virtual Tensor & trunc_(Tensor & self) const = 0; virtual Tensor type_as(const Tensor & self, const Tensor & other) const = 0; virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const = 0; virtual Tensor & unsqueeze_(Tensor & self, int64_t dim) const = 0; - virtual Tensor var(const Tensor & self, bool unbiased=true) const = 0; - virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased=true, bool keepdim=false) const = 0; + virtual Tensor var(const Tensor & self, bool unbiased) const = 0; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0; virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0; AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0); - virtual Tensor norm(const Tensor & self, Scalar p=2) const = 0; - virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim=false) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0; virtual Tensor clone(const Tensor & self) const = 0; virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0; virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0; virtual Tensor & zero_(Tensor & self) const = 0; - virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha=1) const = 0; - virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha=1) const = 0; - virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; - virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const = 0; + virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; virtual Tensor tensor() const = 0; virtual Tensor tensor(IntList size) const = 0; virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; @@ -615,12 +615,12 @@ struct AT_API Type { virtual Tensor _indices(const Tensor & self) const = 0; virtual Tensor _values(const Tensor & self) const = 0; virtual int64_t numel(const Tensor & self) const = 0; - virtual std::vector unbind(const Tensor & self, int64_t dim=0) const = 0; + virtual std::vector unbind(const Tensor & self, int64_t dim) const = 0; virtual int64_t get_device(const Tensor & self) const = 0; - virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking=false) const = 0; - virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking=false) const = 0; - virtual Tensor to(const Tensor & self, Device device, bool non_blocking=false) const = 0; - virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking=false) const = 0; + 
virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, Device device, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking) const = 0; virtual Scalar _local_scalar(const Tensor & self) const = 0; protected: TensorTypeId type_id_; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index e88dd8867a0e96..4ba677a473ef7a 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -39,7 +39,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # declaration under Type.h (right now, we call this template # BROADCAST but it also handles default arguments) TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) # 2. broadcasting functions are implemented in Type.cpp TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ @@ -60,18 +60,18 @@ def TypedDict(name, attrs, total=True): # type: ignore # for 'native' declarations (so the native dispatch is hardcoded into # the template here.) PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ -virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const = 0; """) DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ AT_DEPRECATED(virtual ${return_type} \ -${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0); +${method_prefix_derived}${api_name}(${type_method_formals}) const = 0); """) PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${api_name}(${type_method_formals}) const = 0; """) TYPE_METHOD_DECLARATION_ABSTRACT = CodeTemplate("""\ -${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_ABSTRACT = CodeTemplate("""\ ${return_type} TypeDefault::${method_prefix_derived}${api_name}(${type_method_formals}) const { @@ -79,7 +79,7 @@ def TypedDict(name, attrs, total=True): # type: ignore } """) TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ ${return_type} TypeDefault::${api_name}(${type_method_formals}) const { @@ -515,7 +515,6 @@ def __getitem__(self, x): 'type_definition_body': List[str], 'type_method_actuals': List[str], 'type_method_definition_dispatch': str, - 'type_method_formals_with_defaults': List[str], 'type_method_formals': List[str], 'variants': str, 'when_spares_dispatch': str, @@ -817,7 +816,6 @@ def process_option(option, output_options): # There are no cases where these differ, but they do in native_functions option['type_method_formals'] = option['formals'] - option['type_method_formals_with_defaults'] = option['formals_with_defaults'] option['type_method_actuals'] = option['actuals'] option['const_mark'] = '' if 
option['inplace'] else ' const' @@ -1051,7 +1049,6 @@ def find_formal(formal_name, formals): dispatch_type['is_type_dispatched'] = True option['type_method_formals'] = [format_formal(f) for f in formals if f != dispatch_type] - option['type_method_formals_with_defaults'] = [formal_with_default(f) for f in formals if f != dispatch_type] option['type_method_actuals'] = [f['name'] for f in formals if f != dispatch_type] option['native_actuals'] = [f['name'] if f != dispatch_type else '*this' for f in formals] From 35518b3dc79e081fca3da29cc0c6ef9451458b38 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 08:49:27 -0700 Subject: [PATCH 186/237] Back out "Back out "Refactor Tensor/TensorImpl constructors."" E2: Confirm problem with old patch (#11744) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11744 Original commit changeset: 093e4c47d557 Restores D9813742 Reviewed By: dzhulgakov Differential Revision: D9847835 fbshipit-source-id: f3f467891e01c923dd9d3352d892cf59e10402f1 --- caffe2/core/tensor.h | 76 ++++++++++++++++++++++++++++----------- caffe2/core/tensor_impl.h | 71 +----------------------------------- 2 files changed, 56 insertions(+), 91 deletions(-) diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index c17667c006f54c..a167658060e7fc 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -50,45 +50,75 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(DeviceType type) - : impl_(c10::make_intrusive(type)) {} - + explicit Tensor(Storage storage) + : impl_(c10::make_intrusive(std::move(storage))) {} + + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? + Resize(dims); + } explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + Resize(dims); + } + /** + * context_for_copy is required to have the same DeviceType as src + */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - context_for_copy, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src, context_for_copy); + } + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ Tensor(const Tensor& src, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src); + } + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : impl_(c10::make_intrusive( - dims, - values, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); + context->CopyItemsFromCPU( + storage().dtype(), size(), values.data(), mutable_data()); + } + /** + * @brief Creates a scalar tensor, and fills its content with the given value. 
+ * The type of tensor will be decided by the context parameter + */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : impl_(c10::make_intrusive( - value, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage().dtype(), size(), &value, mutable_data()); + } Tensor Clone() const { Tensor x(GetDeviceType()); @@ -276,6 +306,10 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } + + const Storage& storage() { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 4cdf612a1fb091..2e01c55318d88f 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -88,77 +88,8 @@ inline int canonical_axis_index_(int axis_index, int ndims) { class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? - explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const std::vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - at::BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const std::vector& dims, - const std::vector& values, - at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. 
- * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, at::BaseContext* context) - : storage_(context->device_type(), TypeMeta::Make()) { - Resize(std::vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; From 2baba7f835ec01e5b0776ef0817064d5d6b32116 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 08:49:29 -0700 Subject: [PATCH 187/237] Add storage_offset to Caffe2 (#11701) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11701 There's one extra multiply from TypeMeta::itemsize() which needs to be characterized. For all existing Caffe2 uses, storage_offset is zero. Reviewed By: li-roy Differential Revision: D9831230 fbshipit-source-id: 353678edf76d2ccc297a73475a34f6ab2a20d1e1 --- caffe2/core/tensor_impl.h | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 2e01c55318d88f..cec550132ab9c6 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -89,7 +89,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)) {} + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) {} TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; @@ -312,11 +312,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (reserved_) { // If tensor is reserved then don't claim its memeory unless capacity() // is smaller than new size - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); } else { - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - numel_ * storage_.itemsize() > + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > FLAGS_caffe2_max_keep_on_shrink_memory; } @@ -371,6 +371,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { inline void FreeMemory() { // We'll detach from the old Storage and create a new one storage_ = at::Storage(storage_.device_type(), storage_.dtype()); + storage_offset_ = 0; } /** @@ -420,6 +421,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * this still keeps the original semantics */ storage_ = src.storage(); + storage_offset_ = src.storage_offset(); } /** @@ -472,10 +474,12 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "To share data with a raw pointer, you need to set shape first."); storage_.UniqueStorageShareExternalPointer( std::move(data_ptr), data_type, capacity); + storage_offset_ = 0; } else { int64_t numel = capacity / data_type.itemsize(); // Create a new Storage storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + storage_offset_ = 0; } } @@ -485,7 +489,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline const void* raw_data() const { 
CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return storage_.data(); + return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); } /** @@ -508,7 +512,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { ", while tensor contains ", storage_.dtype().name(), ". "); - return static_cast(storage_.data()); + return static_cast(storage_.data()) + storage_offset_; } /** @@ -525,13 +529,14 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { inline void* raw_mutable_data(const TypeMeta& meta) { // For 0-size tensors it's fine to return any pointer (including nullptr) if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { - return storage_.data(); + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); } else { CAFFE_ENFORCE_WITH_CALLER( numel_ >= 0, "Tensor is not initialized. You probably need to call Resize() " "before calling mutable_data()"); bool had_special_dtor = storage_.dtype().dtor() != nullptr; + storage_offset_ = 0; if (storage_.unique()) { storage_.set_dtype(meta); } else { @@ -546,6 +551,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if (numel_ == 0 || (meta.ctor() == nullptr && !had_special_dtor && storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated return storage_.data(); } const at::Allocator* allocator = storage_.allocator(); @@ -584,6 +590,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { at::Device(storage_.device_type()))); } storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated return storage_.data(); } } @@ -614,7 +621,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template inline T* mutable_data() { if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()); + return static_cast(storage_.data()) + storage_offset_; } // Check it here statically - otherwise TypeMeta would throw the runtime // error in attempt to invoke TypeMeta::ctor() @@ -652,6 +659,8 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { ; } + // NB: This capacity may also include available space + // in the storage BEFORE the tensor data, if storage_offset != 0 inline size_t capacity_nbytes() const { return storage_.capacity(); } @@ -746,6 +755,10 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return storage_; } + int64_t storage_offset() const { + return storage_offset_; + } + protected: using DimVector = std::vector; DimVector dims_; // sizes_ @@ -756,7 +769,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // then subsequent Resize()s will not free up Storage. bool reserved_ = false; at::Storage storage_; - // int64_t storage_offset_; + int64_t storage_offset_ = 0; private: template < From 6660a128a5d2ccaf84a1552bcbeaf62d9743521c Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 08:49:31 -0700 Subject: [PATCH 188/237] Cache and use TypeMeta in TensorImpl (#11706) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11706 This is necessary to handle use-cases when Storage is not set (because the tensor in question doesn't have a notion of storage.) 
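The storage_offset and cached-dtype changes above boil down to one invariant: the data pointer is always derived as the storage base plus storage_offset_ * itemsize, the offset is reset to zero whenever the tensor frees or reallocates its own buffer, and the element size comes from metadata that stays valid even when no Storage is attached. A minimal, self-contained C++ sketch of that pointer arithmetic (MiniTensor is a hypothetical stand-in, not a type from the patch):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical miniature of (Storage, storage_offset_, cached itemsize).
struct MiniTensor {
  std::vector<char> storage;        // raw backing buffer, possibly shared
  std::size_t itemsize = 0;         // cached from the dtype, usable even if storage is empty
  std::int64_t storage_offset = 0;  // offset in elements; reset to 0 on reallocation

  const void* raw_data() const {
    // Mirrors: storage_.data() + storage_offset_ * storage_.itemsize()
    return static_cast<const void*>(storage.data() + storage_offset * itemsize);
  }
};

int main() {
  MiniTensor t;
  t.itemsize = sizeof(float);
  t.storage.resize(4 * sizeof(float));
  auto* base = reinterpret_cast<float*>(t.storage.data());
  for (int i = 0; i < 4; ++i) base[i] = static_cast<float>(i);

  t.storage_offset = 2;  // behave like a view starting at the third element
  std::cout << *static_cast<const float*>(t.raw_data()) << "\n";  // prints 2
  return 0;
}

The corresponding invariant in the patch is the AT_ASSERT(storage_offset_ == 0) after every reallocation, together with the storage_offset_ = 0 resets in FreeMemory() and ShareExternalPointer().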
Reviewed By: orionr Differential Revision: D9833361 fbshipit-source-id: e90a384019f44f57682b687d129b54e85b6fabb9 --- caffe2/core/tensor_impl.h | 49 +++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index cec550132ab9c6..3cd5bd23d446fb 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -89,7 +89,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { public: TensorImpl() = delete; - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) {} + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { + data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; + } TensorImpl(const TensorImpl&) = default; TensorImpl& operator=(const TensorImpl&) = default; @@ -130,25 +132,27 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { if ((void*)&src == (void*)this) { return; } - if (storage_.dtype() != src.meta()) { + if (data_type_ != src.meta()) { storage_ = at::Storage(GetDeviceType(), src.meta()); + data_type_ = src.meta(); } if (src.size() == -1) { dims_.clear(); numel_ = -1; storage_.reset(); + data_type_ = TypeMeta(); return; } Resize(src.dims()); if (size() > 0) { - if (storage_.dtype().copy()) { + if (data_type_.copy()) { CAFFE_ENFORCE( GetDeviceType() == CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); CAFFE_ENFORCE( src.GetDeviceType() == CPU, "In CopyFrom source and dest tensors must both be CPU for meta copy"); - storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); + data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); } else { // We'll need to use a non-CPU context to perform the copy if // one of the context is not CPU since only non-CPU context @@ -222,11 +226,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto oldSize = numel_; auto oldDims = dims_; Resize(newCapacity); - auto* newData = raw_mutable_data(storage_.dtype()); + auto* newData = raw_mutable_data(data_type_); CAFFE_ENFORCE( context != nullptr, "Context must be provided to Extend the tensor"); context->CopyItemsSameDevice( - storage_.dtype(), oldSize, oldData.get(), newData); + data_type_, oldSize, oldData.get(), newData); reserved_ = true; dims_ = newDims; numel_ = newNumel; @@ -282,7 +286,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto oldDims = dims_; Resize(newCapacity); // Allocate new memory but don't copy over the data - raw_mutable_data(storage_.dtype()); + raw_mutable_data(data_type_); dims_ = oldDims; numel_ = oldSize; reserved_ = true; @@ -370,7 +374,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void FreeMemory() { // We'll detach from the old Storage and create a new one - storage_ = at::Storage(storage_.device_type(), storage_.dtype()); + storage_ = at::Storage(storage_.device_type(), data_type_); storage_offset_ = 0; } @@ -382,7 +386,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { std::string DebugString() const { std::stringstream ss; ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << storage_.dtype().name() << " and dimension ("; + << data_type_.name() << " and dimension ("; for (int d : dims_) { ss << d << ","; } @@ -421,6 +425,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * this still keeps the original semantics */ storage_ = src.storage(); + data_type_ = src.dtype(); 
storage_offset_ = src.storage_offset(); } @@ -474,11 +479,13 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "To share data with a raw pointer, you need to set shape first."); storage_.UniqueStorageShareExternalPointer( std::move(data_ptr), data_type, capacity); + data_type_ = data_type; storage_offset_ = 0; } else { int64_t numel = capacity / data_type.itemsize(); // Create a new Storage storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; storage_offset_ = 0; } } @@ -510,7 +517,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { "Tensor type mismatch, caller expects elements to be ", TypeMeta::TypeName(), ", while tensor contains ", - storage_.dtype().name(), + data_type_.name(), ". "); return static_cast(storage_.data()) + storage_offset_; } @@ -528,22 +535,23 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void* raw_mutable_data(const TypeMeta& meta) { // For 0-size tensors it's fine to return any pointer (including nullptr) - if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); } else { CAFFE_ENFORCE_WITH_CALLER( numel_ >= 0, "Tensor is not initialized. You probably need to call Resize() " "before calling mutable_data()"); - bool had_special_dtor = storage_.dtype().dtor() != nullptr; + bool had_special_dtor = data_type_.dtor() != nullptr; storage_offset_ = 0; if (storage_.unique()) { storage_.set_dtype(meta); } else { - if (storage_.dtype() != meta) { + if (data_type_ != meta) { storage_ = at::Storage(storage_.device_type(), meta); } } + data_type_ = meta; // We can reuse the existing buffer if the current data does not have // a special destructor and the new data doesn't have a special @@ -564,7 +572,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { // making sure that when the data is freed, it calls the right // destruction procedure. auto size = numel_; - auto dtor = storage_.dtype().dtor(); + auto dtor = data_type_.dtor(); void* ptr; at::DeleterFnPtr deleter; auto ptr_and_deleter = GetStaticContext()->New( @@ -579,7 +587,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { deleter(local_ptr); }, at::Device(storage_.device_type()))); - storage_.dtype().ctor()(storage_.data(), numel_); + data_type_.ctor()(storage_.data(), numel_); } else { // For fundamental type, new and delete is easier. auto ptr_and_deleter = @@ -606,10 +614,10 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ inline void* raw_mutable_data() { CAFFE_ENFORCE_WITH_CALLER( - storage_.dtype().id() != TypeIdentifier::uninitialized(), + data_type_.id() != TypeIdentifier::uninitialized(), "Calling raw_mutable_data() without meta, but the current meta is " "of unknown type."); - return raw_mutable_data(storage_.dtype()); + return raw_mutable_data(data_type_); } /** @@ -709,7 +717,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * Returns the TypeMeta object associated with the current data type. 
*/ inline const TypeMeta& meta() const { - return storage_.dtype(); + return data_type_; + } + + inline const TypeMeta& dtype() const { + return data_type_; } /** @@ -770,6 +782,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { bool reserved_ = false; at::Storage storage_; int64_t storage_offset_ = 0; + TypeMeta data_type_; private: template < From f6a6d7fae17a1003e2e40735abe0d94b6a59dbac Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 09:08:34 -0700 Subject: [PATCH 189/237] Switch at::TensorImpl to store TypeMeta rather than ScalarType Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11702 Reviewed By: cpuhrsch Differential Revision: D9831384 fbshipit-source-id: 1b1233a70ed70b47a3dab4a5797b6cfcb7a2c265 --- aten/src/ATen/Context.cpp | 2 +- aten/src/ATen/SparseTensorImpl.cpp | 6 ++--- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/core/Tensor.h | 2 +- aten/src/ATen/core/TensorImpl.cpp | 14 +++++------ aten/src/ATen/core/TensorImpl.h | 24 +++++++------------ aten/src/ATen/core/UndefinedTensorImpl.cpp | 2 +- aten/src/ATen/core/typeid.h | 10 ++++---- aten/src/ATen/function_wrapper.py | 15 ++++++------ aten/src/ATen/native/sparse/SparseTensor.cpp | 2 +- aten/src/ATen/templates/SparseTypeDerived.cpp | 2 +- aten/src/ATen/templates/Tensor.h | 2 +- torch/csrc/autograd/variable.cpp | 4 ++-- torch/csrc/jit/interpreter.cpp | 2 +- 14 files changed, 41 insertions(+), 48 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index cb96814927130c..c73d2efd8ea813 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -116,7 +116,7 @@ TypeExtendedInterface& getType(TensorOptions options) { TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( - backend, impl->scalar_type(), impl->is_variable()); + backend, dataTypeToScalarType(impl->dtype().id()), impl->is_variable()); } TypeExtendedInterface& getType(const Tensor& t) { diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 3f13d59b4467e5..66b71dd7b8a650 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -28,13 +28,13 @@ namespace { // // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. -SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, at::ScalarType scalar_type) - : TensorImpl(type_id, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type) + : TensorImpl(type_id, data_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) , indices_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), ScalarType::Long)->tensor({1, 0})) - , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), scalar_type)->tensor()) {} + , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), dataTypeToScalarType(data_type.id()))->tensor()) {} IntList SparseTensorImpl::sizes() const { return size_; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 77ad805ac99e6f..42b670bea08541 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -36,7 +36,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
- explicit SparseTensorImpl(at::TensorTypeId, at::ScalarType); + explicit SparseTensorImpl(at::TensorTypeId, const caffe2::TypeMeta&); int64_t nnz() const { return values_.size(0); } int64_t sparseDims() const { return sparseDims_; } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 4b1f18655547d3..ca23584f870ce7 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -142,7 +142,7 @@ struct AT_API Tensor { return tensor_impl_->type_id(); } ScalarType scalar_type() const { - return tensor_impl_->scalar_type(); + return dataTypeToScalarType(tensor_impl_->dtype().id()); } const Storage& storage() const { return tensor_impl_->storage(); diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 1bf6802c7d4d8e..5b568482d8dfe2 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -17,19 +17,19 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, scalar_type, is_variable) { +TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable) + : TensorImpl({}, type_id, data_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. - if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined + if (type_id != UndefinedTensorId() && data_type.id() != caffe2::TypeIdentifier::uninitialized() && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalarTypeToTypeMeta(scalar_type), 0, allocator, true); + storage_ = Storage(data_type, 0, allocator, true); } } TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype().id()), is_variable) {} + : TensorImpl(std::move(storage), type_id, storage.dtype(), is_variable) {} -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable) : storage_(std::move(storage)), storage_offset_(0), sizes_{0}, @@ -37,7 +37,7 @@ TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scala is_contiguous_(true), numel_(0), type_id_(type_id), - scalar_type_(scalar_type), + data_type_(data_type), is_variable_(is_variable) {} IntList TensorImpl::sizes() const { diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 831c53e23b6d49..d2f98ff52780f8 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -22,7 +22,7 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); + TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; @@ -32,7 +32,7 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { // could not have been created without initializing the Type first. // TODO: This is not actually true via the Caffe2 codepath! Make // it so. 
- return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); + return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), dataTypeToScalarType(dtype().id()), is_variable()); } TensorTypeId type_id() const { return type_id_; } @@ -103,7 +103,7 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { inline void* data() const { return static_cast( static_cast(storage_.data()) + - at::elementSize(scalar_type_) * storage_offset_); + data_type_.itemsize() * storage_offset_); } template @@ -111,14 +111,8 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { return storage_.unsafe_data() + storage_offset_; } - // TODO: Remove this once we get rid of scalar_type and use dmeta or dtype - // instead. - inline at::ScalarType scalar_type() const { - return scalar_type_; - } - - inline caffe2::TypeMeta dtype() const { - return storage_.dtype(); + const caffe2::TypeMeta& dtype() const { + return data_type_; } virtual int64_t storage_offset() const { @@ -204,13 +198,13 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { is_contiguous_ = compute_contiguous(); } TensorTypeId type_id_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; + // INVARIANT: When storage is non-null, this type meta must + // agree with the type meta in storage + caffe2::TypeMeta data_type_; bool is_variable_ = false; bool is_wrapped_number_ = false; private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable); }; } // namespace at diff --git a/aten/src/ATen/core/UndefinedTensorImpl.cpp b/aten/src/ATen/core/UndefinedTensorImpl.cpp index d270e379a3415e..e26b61a03c87e0 100644 --- a/aten/src/ATen/core/UndefinedTensorImpl.cpp +++ b/aten/src/ATen/core/UndefinedTensorImpl.cpp @@ -5,7 +5,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? 
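The UndefinedTensorImpl constructor just below now follows the same convention as the guard added to the TensorImpl constructor earlier in this patch: an undefined dtype is represented by a default-constructed caffe2::TypeMeta whose id() compares equal to the uninitialized sentinel, instead of by ScalarType::Undefined. A toy C++ sketch of that convention (every name and id value here is made up purely for illustration):

#include <cstddef>
#include <cstdio>

// Toy metadata type: a default-constructed instance plays the role of
// caffe2::TypeMeta() with an uninitialized TypeIdentifier.
struct ToyTypeMeta {
  int id;                 // 0 stands in for "uninitialized"
  std::size_t itemsize;
};

// Mirrors the constructor guard above: only defined, dtype-initialized,
// non-sparse tensors get a Storage allocated for them.
bool should_allocate_storage(bool is_undefined, bool is_sparse, const ToyTypeMeta& dtype) {
  return !is_undefined && !is_sparse && dtype.id != 0;
}

int main() {
  ToyTypeMeta undefined{0, 0};        // like caffe2::TypeMeta()
  ToyTypeMeta flt{1, sizeof(float)};  // id 1 is an arbitrary placeholder
  std::printf("%d %d\n",
              should_allocate_storage(true, false, undefined),  // 0
              should_allocate_storage(false, false, flt));      // 1
  return 0;
}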
UndefinedTensorImpl::UndefinedTensorImpl() -: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { +: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is variable */ false) { } IntList UndefinedTensorImpl::sizes() const { diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index e258d3be6a7fa3..9741de03dbaedb 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -49,9 +49,9 @@ class AT_CORE_API TypeIdentifier final : public at::IdWrapper) CAFFE_DECLARE_KNOWN_TYPE(10, std::complex) -// 10 = undefined type id +// 11 = undefined type id CAFFE_DECLARE_KNOWN_TYPE(12, Tensor) CAFFE_DECLARE_KNOWN_TYPE(13, std::string) diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 4ba677a473ef7a..5df218a89cc06d 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -192,7 +192,7 @@ def TypedDict(name, attrs, total=True): # type: ignore BUFFER_DEFINITION = CodeTemplate("""\ auto ${name}_ = c10::make_intrusive( - ${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false).release(); + ${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), ${THTensor}_new(), false).release(); auto ${name} = Tensor(${name}_, false);""") CONDITIONAL_INITIALIZER = CodeTemplate("""\ @@ -333,17 +333,18 @@ def __init__(self, reason): ALLOC_NOARGS_WRAP = { 'THTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', 'THBoolTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Byte), allocator(), false).release()', 'THIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), allocator(), false).release()', 'THIntegerTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Int), allocator(), false).release()', 'THDenseTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', 'THDenseIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), ' + 'allocator(), false).release()' } ALLOC_WRAP = { diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index b09feb0dc1e7dd..25fd4fc5df4326 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -68,7 +68,7 @@ SparseTensor new_sparse(const SparseType& dtype) { } else { type_id = SparseCPUTensorId(); } - return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType())); + return SparseTensor(c10::make_intrusive(type_id, scalarTypeToTypeMeta(dtype.scalarType()))); } /*** Helper methods ***/ diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index dc419b9ad52c01..61035f2c3d38f7 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ 
b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -33,7 +33,7 @@ ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } caffe2::TypeMeta ${Type}::typeMeta() const { - AT_ERROR("typeMeta not defined for SparseType"); + return caffe2::TypeMeta::Make<${ScalarType}>(); } Backend ${Type}::backend() const { return Backend::${Backend}; diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 50f6bb83bb7edb..02bef7f89a6057 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -142,7 +142,7 @@ struct AT_API Tensor { return tensor_impl_->type_id(); } ScalarType scalar_type() const { - return tensor_impl_->scalar_type(); + return dataTypeToScalarType(tensor_impl_->dtype().id()); } const Storage& storage() const { return tensor_impl_->storage(); diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 3808aef75c4d57..5f7631512c715e 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(data.type().type_id(), data.type().scalarType(), data.type().allocator(), /* is variable */ true), + : TensorImpl(data.type().type_id(), data.type().typeMeta(), data.type().allocator(), /* is variable */ true), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -159,7 +159,7 @@ void Variable::Impl::set_data(Tensor new_data) { } // Updates metadata - scalar_type_ = new_data.type().scalarType(); + data_type_ = new_data.type().typeMeta(); type_id_ = new_data.type().type_id(); is_variable_ = true; data_ = std::move(new_data); diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index d7e8c31f8cb5cf..415820a3086164 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -336,7 +336,7 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(at::UndefinedTensorId(), at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} + : TensorImpl(at::UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is_variable */ false) {} virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { From 07fd4450ab8bcaf4c726e2e4afb738bf2b98365d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 17 Sep 2018 09:34:52 -0700 Subject: [PATCH 190/237] Revert D9831398: [pytorch][PR] Update OpenMP cmake setting for xcode 9 compiler(AppleClang 9.0) Differential Revision: D9831398 Original commit changeset: db119d3f9c26 fbshipit-source-id: 4f183c9c178c159473bdaaa6299d4d5eb8afe549 --- caffe2/CMakeLists.txt | 8 ----- cmake/Dependencies.cmake | 65 +++++++++++++++++++++++----------------- torch/CMakeLists.txt | 13 ++------ 3 files changed, 41 insertions(+), 45 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 361ce228f3227a..4e9b3eef8937f4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -206,14 +206,6 @@ if (${CAFFE2_LINK_LOCAL_PROTOBUF}) else() target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) endif() -#cmake only check for separate OpenMP library on AppleClang 7+ -#https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 -if (WITH_OPENMP AND CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR - CMAKE_CXX_COMPILER_VERSION 
VERSION_GREATER "7.0") - target_link_libraries(caffe2 PRIVATE ${OpenMP_libomp_LIBRARY}) - endif() -endif() target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 8fdfbd8ca8eb4a..dbec0d07ebf6e3 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -456,37 +456,12 @@ endif() # ---[ OpenMP if(USE_OPENMP) - set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") - if(APPLE AND CMAKE_COMPILER_IS_GNUCC) - exec_program(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - message(STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") - if(DARWIN_VERSION GREATER 9) - set(APPLE_OPENMP_SUCKS 1) - endif(DARWIN_VERSION GREATER 9) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) - message(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") - message(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") - add_compile_options(-Wno-unknown-pragmas) - set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) - endif() - endif() - - if(WITH_OPENMP AND NOT CHECKED_OPENMP) - find_package(OpenMP) - set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") - - # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) - # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake - set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") - endif() - + find_package(OpenMP) if(OPENMP_FOUND) message(STATUS "Adding " ${OpenMP_CXX_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") else() message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF") caffe2_update_option(USE_OPENMP OFF) @@ -981,6 +956,42 @@ if (NOT BUILD_ATEN_MOBILE) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG}) STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE}) ENDIF() + + # OpenMP support? + SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") + IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) + EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") + IF (DARWIN_VERSION GREATER 9) + SET(APPLE_OPENMP_SUCKS 1) + ENDIF (DARWIN_VERSION GREATER 9) + EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION) + IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) + MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") + MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") + add_compile_options(-Wno-unknown-pragmas) + SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" 
FORCE) + ENDIF() + ENDIF() + + IF (WITH_OPENMP AND NOT CHECKED_OPENMP) + FIND_PACKAGE(OpenMP) + SET(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP") + + # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached) + # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake + SET(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found") + ENDIF() + + IF (OPENMP_FOUND) + MESSAGE(STATUS "Compiling with OpenMP support") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + ENDIF() + + SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) FIND_PACKAGE(MAGMA) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 706ff0c2d7091b..7679f3d6467ba0 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -3,7 +3,7 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) return() endif() else() - cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + cmake_minimum_required(VERSION 3.0 FATAL_ERROR) project(torch CXX C) find_package(Caffe2 REQUIRED) option(USE_CUDA "Use CUDA" ON) @@ -322,15 +322,8 @@ if(OPENMP_FOUND) if (VERBOSE) message(STATUS "Compiling with OpenMP") endif() - target_compile_options(torch INTERFACE ${OpenMP_CXX_FLAGS}) - #cmake only check for separate OpenMP library on AppleClang 7+ - #https://github.com/Kitware/CMake/blob/42212f7539040139ecec092547b7d58ef12a4d72/Modules/FindOpenMP.cmake#L252 - if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR - CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") - target_link_libraries(torch ${OpenMP_libomp_LIBRARY}) - endif() - endif() + target_compile_options(torch INTERFACE -fopenmp) + target_link_libraries(torch -fopenmp) endif() if (NOT NO_API AND NOT USE_ROCM) From a7e3cd09e023685a96f65d8447490455a16a4ebb Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Mon, 17 Sep 2018 09:44:51 -0700 Subject: [PATCH 191/237] Fix ctc gradient handling (#11753) Summary: Fixes: #11750 Also fix cuda ctc with double to enable gradient check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11753 Differential Revision: D9861318 Pulled By: ezyang fbshipit-source-id: 2e7afea2b60dbbd891bb5d0bda61ee75fe01d933 --- aten/src/ATen/native/LossCTC.cpp | 2 +- aten/src/ATen/native/cuda/LossCTC.cu | 15 ++++++++++---- test/test_autograd.py | 29 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index ccae5fb75f5b01..b7a9c52c64bd4e 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -287,7 +287,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ for (int64_t c = 0; c < num_labels; c++) { scalar_t& res = grad_a[t][c]; scalar_t lp = log_probs_a[t][c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } } // zero the remainder diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 16d7935f3d49fa..dc0b5af8d4e264 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -227,7 +227,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... 
- constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -247,6 +247,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors return std::make_tuple(neg_log_likelihood, log_alpha); } @@ -452,7 +453,7 @@ __global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ grad scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; if (t < input_length) { scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } else { res = 0.; @@ -505,7 +506,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) // As above, there may be better configurations to use. - constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -526,6 +527,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on @@ -550,7 +552,10 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ .sub_(log_probs.narrow(2, BLANK, 1)) .exp_() ); - // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // scale by output gradient (blanks and first summand of non-blanks) + grad *= grad_out.view({1, batch_size, 1}); + + // For the non-blank characters, we use a kernel to compute the subtrahend. // Again we might configure block and grid in a better way. int threads_target = max_threads; while (threads_target / 2 >= max_target_length) { @@ -572,6 +577,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } else { // small problem, use naive algorithm // Still no block/grid configuration guru... 
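To make the gradient fix in the hunks above concrete: the old code only multiplied the subtrahend by the incoming gradient gr, while the chain rule requires scaling the entire d(nll)/d(log_prob) expression. A standalone check with made-up numbers (not part of the patch) shows the two parenthesizations disagree whenever gr != 1:

#include <cmath>
#include <cstdio>

int main() {
  // Made-up values for a single (t, c) cell of one batch element.
  const double lp  = -1.2;  // log_probs[t][c]
  const double res = -0.7;  // accumulated log(alpha * beta) for this cell
  const double nll =  2.3;  // negative log likelihood of this sample
  const double gr  =  0.5;  // incoming grad_out for this sample

  const double before = std::exp(lp) - std::exp(res + nll - lp) * gr;    // old: gr only scaled the subtrahend
  const double after  = (std::exp(lp) - std::exp(res + nll - lp)) * gr;  // new: gr scales the whole derivative

  std::printf("before fix: %f\nafter fix:  %f\n", before, after);
  return 0;
}

The same reasoning explains the new grad *= grad_out.view({1, batch_size, 1}) line in the CUDA path, which applies the output gradient to the blank/first-summand part that is computed without a kernel.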
int threads_input = max_threads; @@ -595,6 +601,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } return grad; } diff --git a/test/test_autograd.py b/test/test_autograd.py index 47ff4c66be0d33..4c4140e81114a7 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1375,6 +1375,35 @@ def test_unused_output(self): expected_grad[:2] = grad_output self.assertEqual(x.grad.data, expected_grad) + def test_ctc_loss(self): + batch_size = 64 + num_labels = 101 + target_length = 15 + gradcheck_input_size = 10 + + # device, input_length + tests = [('cpu', 150)] + if torch.cuda.is_available(): + tests += [('cuda', 50), + ('cuda', 150)] + + for device, input_length in tests: + targets = torch.randint(1, num_labels, (batch_size, target_length), + device=device, dtype=torch.long) + x = torch.randn(gradcheck_input_size, device=device, requires_grad=True) + tile_factors = torch.randn(input_length * batch_size * num_labels // gradcheck_input_size + 1, + device=device) + input_lengths = [input_length for _ in range(batch_size)] + target_lengths = [target_length for _ in range(batch_size)] + + def ctc_after_softmax(x): + x_full = ((x[:, None] * tile_factors[None, :]).view(-1)[:input_length * batch_size * num_labels] + .view(input_length, batch_size, num_labels)) + log_probs = torch.log_softmax(x_full, 2) + return torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) + + gradcheck(ctc_after_softmax, [x]) + def test_gc_in_destructor(self): """ Previously, if a Function destructor triggered a garbage collection, From 7949250295d17bf6ebb953d268438c3a7df3a6c1 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 17 Sep 2018 09:47:28 -0700 Subject: [PATCH 192/237] Fixes for Torch Script C++ API (#11682) Summary: A couple fixes I deem necessary to the TorchScript C++ API after writing the tutorial: 1. When I was creating the custom op API, I created `torch/op.h` as the one-stop header for creating custom ops. I now notice that there is no good header for the TorchScript C++ story altogether, i.e. when you just want to load a script module in C++ without any custom ops necessarily. The `torch/op.h` header suits that purpose just as well of course, but I think we should rename it to `torch/script.h`, which seems like a great name for this feature. 2. The current API for the CMake we provided was that we defined a bunch of variables like `TORCH_LIBRARY_DIRS` and `TORCH_INCLUDES` and then expected users to add those variables to their targets. We also had a CMake function that did that for you automatically. I now realized a much smarter way of doing this is to create an `IMPORTED` target for the libtorch library in CMake, and then add all this stuff to the link interface of that target. Then all downstream users have to do is `target_link_libraries(my_target torch)` and they get all the proper includes, libraries and compiler flags added to their target. This means we can get rid of the CMake function and all that stuff. orionr AFAIK this is a much, much better way of doing all of this, no? 3. Since we distribute libtorch with `D_GLIBCXX_USE_CXX11_ABI=0`, dependent libraries must set this flag too. I now add this to the interface compile options of this imported target. 4. Fixes to JIT docs. 
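The first point above is easiest to appreciate from the consumer side: after the rename, a standalone C++ program that only loads and runs a serialized script module includes nothing but torch/script.h and links against the imported torch target configured by the TorchConfig.cmake change below. The snippet is a sketch rather than a verbatim excerpt; the model path and input shape are placeholders, and the exact handle type returned by torch::jit::load has changed across releases, hence the auto:

#include <torch/script.h>

#include <iostream>
#include <vector>

int main() {
  // "model.pt" is a placeholder for a module saved from Python via
  // torch.jit.save / ScriptModule.save.
  auto module = torch::jit::load("model.pt");

  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::ones({1, 3, 224, 224}));  // placeholder input

  // libtorch of this era hands back a pointer-like handle, so forward() is
  // reached through ->; newer releases return a value type instead.
  auto output = module->forward(inputs).toTensor();
  std::cout << output.sum() << std::endl;
  return 0;
}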
These could likely be 4 different PRs but given the release I wouldn't mind landing them all asap. zdevito dzhulgakov soumith Pull Request resolved: https://github.com/pytorch/pytorch/pull/11682 Differential Revision: D9839431 Pulled By: goldsborough fbshipit-source-id: fdc47b95f83f22d53e1995aa683e09613b4bfe65 --- cmake/TorchConfig.cmake.in | 38 +++++++++++++----------- docs/source/jit.rst | 11 +++---- test/custom_operator/CMakeLists.txt | 15 +++------- test/custom_operator/op.cpp | 2 +- test/custom_operator/op.h | 2 +- test/custom_operator/test_custom_ops.cpp | 2 +- torch/CMakeLists.txt | 24 ++++++++++++++- torch/abi-check.cpp | 9 ++++++ torch/{op.h => script.h} | 0 torch/utils/cpp_extension.py | 2 +- 10 files changed, 66 insertions(+), 39 deletions(-) create mode 100644 torch/abi-check.cpp rename torch/{op.h => script.h} (100%) diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index f177a9c2c3504d..066a7e63f9c57a 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -7,15 +7,12 @@ # # TORCH_FOUND -- True if the system has the Torch library # TORCH_INCLUDE_DIRS -- The include directories for torch -# TORCH_LIBRARIES -- Libraries to link to +# TORCH_LIBRARIES -- Libraries to link against +# TORCH_CXX_FLAGS -- Additional (required) compiler flags # # and the following imported targets: # -# Torch -# -# and the following functions: -# -# torch_add_custom_op_library( ) +# torch if ($ENV{TORCH_INSTALL_PREFIX}) set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) @@ -26,13 +23,19 @@ else() endif() # Include directories. -set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") +if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include") + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") +else() + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/include") +endif() # Library dependencies. find_package(Caffe2 REQUIRED) + find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +add_library(torch SHARED IMPORTED) +set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS}) -set(TORCH_LIBRARIES ${TORCH_LIBRARY} ${Caffe2_MAIN_LIBS}) if (@USE_CUDA@) if(MSVC) set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") @@ -59,13 +62,12 @@ if (@USE_CUDA@) list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() -# Creates a shared library with the correct include directories -# and linker flags set to include Torch header files and link with Torch -# libraries. Also sets the C++ standard version to C++11. All options -# can be override by specifying further options on the `` CMake target. -function(torch_add_custom_op_library name source_files) - add_library(${name} SHARED ${source_files}) - target_include_directories(${name} PUBLIC "${TORCH_INCLUDE_DIRS}") - target_link_libraries(${name} "${TORCH_LIBRARIES}") - set_property(TARGET ${name} PROPERTY CXX_STANDARD 11) -endfunction(torch_add_custom_op_library) +# When we build libtorch with the old GCC ABI, dependent libraries must too. +set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") + +set_target_properties(torch PROPERTIES + IMPORTED_LOCATION ${TORCH_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TORCH_INCLUDE_DIRS} + INTERFACE_COMPILE_OPTIONS ${TORCH_CXX_FLAGS} + CXX_STANDARD 11 +) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 005c09fa693624..f7a4411956a2c8 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -7,13 +7,13 @@ Torch Script .. currentmodule:: torch.jit Torch Script is a way to create serializable and optimizable models from PyTorch code. 
-Anything code written in Torch Script can be saved from your Python -process and loaded/run a process where there is no python dependency. +Any code written in Torch Script can be saved from your Python +process and loaded in a process where there is no Python dependency. We provide tools to incrementally transition a model from being a pure Python program -to a Torch Script program that can be run independently from python, for instance, in a standalone C++ process. +to a Torch Script program that can be run independently from Python, for instance, in a standalone C++ program. This makes it possible to train models in PyTorch using familiar tools and then export -the model to a production environment where it is not a good idea to run models as python programs +the model to a production environment where it is not a good idea to run models as Python programs for performance and multi-threading reasons. Creating Torch Script Code @@ -47,7 +47,7 @@ Mixing Tracing and Scripting ---------------------------- In many cases either tracing or script is an easier approach for converting a model. -We allow you to compose tracing and scripting to suite the particular requirements +We allow you to compose tracing and scripting to suit the particular requirements of a part of a model. Scripted functions can call traced ones. This is particularly useful when you need @@ -79,6 +79,7 @@ Example: :: import torch + @torch.jit.script def foo(x, y): if x.max() > y.max(): diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index 14f605d4f5b3f7..f692bdfae123b9 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -1,19 +1,12 @@ # Basic CMake setup -cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(custom_ops) find_package(Torch REQUIRED) -# This convenience function will create a shared library target, configure -# the right include directories and link against the right libraries. 
It is -# exactly equivalent to the following lines: -# -# add_library(custom_ops SHARED op.cpp) -# target_include_directories(custom_ops PUBLIC "${TORCH_INCLUDE_DIRS}") -# target_link_libraries(custom_ops "${TORCH_LIBRARIES}") -# set_property(TARGET custom_ops PROPERTY CXX_STANDARD 11) -# -torch_add_custom_op_library(custom_ops op.cpp) +add_library(custom_ops SHARED op.cpp) +target_compile_features(custom_ops PUBLIC cxx_range_for) +target_link_libraries(custom_ops ${TORCH_LIBRARIES}) add_executable(test_custom_ops test_custom_ops.cpp) target_link_libraries(test_custom_ops custom_ops) diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index 113476f581c52d..f48189728ea293 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/test/custom_operator/op.h b/test/custom_operator/op.h index b1c4b5b1647f0b..daa38b6965c5d8 100644 --- a/test/custom_operator/op.h +++ b/test/custom_operator/op.h @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index 47e6f3618aad7c..918eadd1e869fd 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -1,4 +1,4 @@ -#include +#include #include "op.h" diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7679f3d6467ba0..68eee299e65334 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -413,7 +413,7 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch FILES_MATCHING PATTERN "*.h") -install(FILES "${TORCH_SRC_DIR}/op.h" +install(FILES "${TORCH_SRC_DIR}/script.h" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) install(TARGETS torch @@ -488,6 +488,28 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) endif() endif() +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + message(STATUS "${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") + execute_process( + COMMAND + "${CMAKE_CXX_COMPILER}" + "${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp" + "-o" + "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) + if (ABI_CHECK_COMPILE_RESULT) + message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") + endif() + execute_process( + COMMAND "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_RESULT + OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) + if (ABI_CHECK_RESULT) + message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") + endif() + message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") +endif() + # CMake config for external projects. 
configure_file( ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in diff --git a/torch/abi-check.cpp b/torch/abi-check.cpp new file mode 100644 index 00000000000000..e506eb7831cad4 --- /dev/null +++ b/torch/abi-check.cpp @@ -0,0 +1,9 @@ +#include + +int main() { +#ifdef _GLIBCXX_USE_CXX11_ABI + std::cout << _GLIBCXX_USE_CXX11_ABI; +#else + std::cout << 0; +#endif +} diff --git a/torch/op.h b/torch/script.h similarity index 100% rename from torch/op.h rename to torch/script.h diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 22940aaa5f7ea5..01961cf994ec73 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -288,7 +288,7 @@ def _add_gnu_abi_flag_if_binary(self, extension): # if the extension is compiled with gcc >= 5.1, # then we have to define _GLIBCXX_USE_CXX11_ABI=0 # so that the std::string in the API is resolved to - # non-C++11 symbols + # non-C++11 symbols. define = '-D_GLIBCXX_USE_CXX11_ABI=0' if is_binary_build(): if isinstance(extension.extra_compile_args, dict): From cdefc277954e4edfcaded72808bd5196e0d118a8 Mon Sep 17 00:00:00 2001 From: Chenguang Xi Date: Mon, 17 Sep 2018 10:14:08 -0700 Subject: [PATCH 193/237] Support lr adaption for SparseAdam and RowWiseSparseAdam (#11162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11162 as title, fix pr test failure Reviewed By: chocjy Differential Revision: D9619308 fbshipit-source-id: 0a2228841ed8fadb15f07e94d3575aa701b10146 --- caffe2/python/operator_test/adam_test.py | 164 +++++++++++- caffe2/python/optimizer.py | 55 ++-- caffe2/sgd/adam_op.cc | 8 +- caffe2/sgd/adam_op.h | 323 ++++++++++++++++------- caffe2/sgd/adam_op_gpu.cu | 146 ++++++++-- 5 files changed, 541 insertions(+), 155 deletions(-) diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 9cf3f7c06b4385..8209b1c0493095 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -34,15 +34,18 @@ def ref_adam(param, mom1, mom2, grad, LR, ITER, @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon): + beta1, beta2, epsilon, output_grad=False): t = ITER + 1 - corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \ + corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ (1 - np.power(beta1, t)) mom1_out = (beta1 * mom1) + (1 - beta1) * grad mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad)) - param_out = param + corrected_local_rate * mom1_out / \ - (np.sqrt(mom2_out) + epsilon) - return (param_out, mom1_out, mom2_out) + grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon) + param_out = param + LR * grad_out + if output_grad: + return param_out, mom1_out, mom2_out, grad_out + else: + return param_out, mom1_out, mom2_out @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), @@ -176,6 +179,76 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=4), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs) + 
def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, + data_strategy, gc, dc): + param, mom1, mom2, grad = inputs + mom2 = np.absolute(mom2) + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Verify that the generated indices are unique + hypothesis.assume( + np.array_equal( + np.unique(indices.flatten()), + np.sort(indices.flatten()))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "SparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + @given(inputs=hu.tensors(n=3), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, @@ -252,6 +325,87 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_row_wise_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=3), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs_cpu_only) + def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, + epsilon, data_strategy, gc, dc): + param, mom1, grad = inputs + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create a 1D row-wise average 2nd moment tensor. + mom2 = data_strategy.draw( + hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], + elements=hu.elements_of_type(dtype=np.float32)) + ) + mom2 = np.absolute(mom2) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment + # tensor that is strictly 1-dimensional and equal in length to the + # first dimension of the parameters, so indices must also be + # 1-dimensional. 
+ indices = indices.flatten() + + hypothesis.note('indices.shape: %s' % str(indices.shape)) + + # Verify that the generated indices are unique + hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "RowWiseSparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_row_wise_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_row_wise_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + if __name__ == "__main__": import unittest diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index a48502c68a3cd6..482d16a0dfa6a6 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -930,19 +930,6 @@ def _run(self, net, param_init_net, param_info): **(self.init_kwargs) ) - if self.use_lr_adaption: - effective_grad = param_init_net.ConstantFill( - [param], - param + "_effgrad", - value=0.0 - ) - self._aux_params.local.append(effective_grad) - net.LearningRateAdaption( - [lr, grad, effective_grad], - [lr], - lr_alpha=self.lr_alpha, - normalized_lr_adaption=self.normalized_lr_adaption) - m1 = param_init_net.ConstantFill( [param], param + "_first_moment", @@ -973,35 +960,45 @@ def _run(self, net, param_init_net, param_info): 'If SparseAdam with rowWise=True, gradient must be '\ 'a gradientslice. PLease ensure that rowWise is not enabled '\ 'for the dense Adam optimizer, as it is not supported.' 
+ + output_blobs = [param, m1, m2] + if self.use_lr_adaption: + effective_grad = str(param) + '_effective_grad' + output_blobs.append(effective_grad) + if isinstance(grad, core.GradientSlice): grad = self.dedup(net, self.sparse_dedup_aggregator, grad) if self.rowWise: op = 'RowWiseSparseAdam' else: op = 'SparseAdam' + net.__getattr__(op)( [param, m1, m2, grad.indices, grad.values, lr, iteration], - [param, m1, m2], + output_blobs, beta1=self.beta1, beta2=self.beta2, - epsilon=self.epsilon - ) + epsilon=self.epsilon) + if self.use_lr_adaption: + net.LearningRateAdaption( + [lr, grad.values, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) else: + net.Adam( + [param, m1, m2, grad, lr, iteration], + output_blobs, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon) if self.use_lr_adaption: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2, effective_grad], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) - else: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) + net.LearningRateAdaption( + [lr, grad, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) def scale_learning_rate(self, scale): self.alpha *= scale diff --git a/caffe2/sgd/adam_op.cc b/caffe2/sgd/adam_op.cc index 25414622bad754..623e93a07e3251 100644 --- a/caffe2/sgd/adam_op.cc +++ b/caffe2/sgd/adam_op.cc @@ -34,7 +34,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") - .Output(3, "output_grad", "Effective grad") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -42,7 +42,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output REGISTER_CPU_OPERATOR(SparseAdam, SparseAdamOp); OPERATOR_SCHEMA(SparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -62,6 +62,7 @@ OPERATOR_SCHEMA(SparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -71,7 +72,7 @@ REGISTER_CPU_OPERATOR( RowWiseSparseAdamOp); OPERATOR_SCHEMA(RowWiseSparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -95,6 +96,7 @@ OPERATOR_SCHEMA(RowWiseSparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index 3cd301a8223009..699ba7aa5d23b1 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -195,58 +195,118 @@ class SparseAdamOp final : public Operator { auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for 
(auto i = 0; i < n; ++i) { - auto idx = indices[i]; - - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); - - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; + +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif + + adam_compute( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + beta1_, + beta2_, + epsilon_, + correction, + lr, + &context_); + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - adam_compute( - block_size, - paramIn + offsetIdx, - gradIn + offsetI, - moment1In + offsetIdx, - moment2In + offsetIdx, - paramOut + offsetIdx, - moment1Out + offsetIdx, - moment2Out + offsetIdx, - beta1_, - beta2_, - epsilon_, - correction, - lr, - &context_); + adam_compute_output_grad( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + gradOut + offsetI, + beta1_, + beta2_, + epsilon_, + correction, + lr, + &context_); + } } } return true; @@ -257,7 +317,7 @@ class SparseAdamOp final : public Operator { T 
beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; template @@ -305,61 +365,126 @@ class RowWiseSparseAdamOp final : public Operator { auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + } + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + 
CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - const float* w = paramIn + offsetIdx; - const float* g = gradIn + offsetI; - const float* m1 = moment1In + offsetIdx; - const float* m2 = moment2In + idx; - float* nw = paramOut + offsetIdx; - float* nm1 = moment1Out + offsetIdx; - float* nm2 = moment2Out + idx; - - float m2_sum = 0.; - for (auto j = 0; j < block_size; ++j) { - float gj = g[j]; - m2_sum += gj * gj; - } - float vi = nm2[0] = - m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); - for (auto j = 0; j < block_size; ++j) { - float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); - nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + float* ng = gradOut + offsetI; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_); + nw[j] = w[j] + lr[0] * ngi; + } } } } @@ -371,7 +496,7 @@ class RowWiseSparseAdamOp final : public Operator { T beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; } // namespace caffe2 diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index 41f70ca51d577b..2e142af682f795 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -1,6 +1,6 @@ -#include "caffe2/sgd/adam_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" +#include "caffe2/sgd/adam_op.h" namespace caffe2 { @@ -95,6 +95,55 @@ void adam_compute( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); } +__global__ void AdamComputeOutputGrad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr) { + CUDA_1D_KERNEL_LOOP(i, N) { + float gi = g[i]; + float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); + float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); + float ngi = ng[i] = correction * mi / (sqrtf(vi) + eps_hat); + nw[i] = w[i] + lr[0] * ngi; + } +} + +template <> +void adam_compute_output_grad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr, + CUDAContext* context) { + AdamComputeOutputGrad<<< + CAFFE_GET_BLOCKS(N), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>( + N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); +} + template __global__ void SparseAdamKernel( const size_t N, @@ -123,9 +172,44 @@ __global__ void SparseAdamKernel( } } +template +__global__ void SparseAdamOutputGradKernel( + const size_t N, + const size_t grad_slice_sz, + const float beta1, + const float beta2, + const float epsilon, 
+ float* param, + float* mom1, + float* mom2, + float* output_grad, + const SIndex* indices, + const float* grad, + const float correction, + const float* lr, + const float iter) { + CUDA_1D_KERNEL_LOOP(i, N) { + const size_t gradIdx = i; + const SIndex index = indices[i / grad_slice_sz]; + const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz); + + float m1n = mom1[paramIdx] = + mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1); + float m2n = mom2[paramIdx] = + mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2); + float gradOut = output_grad[gradIdx] = + correction * m1n / (sqrt(m2n) + epsilon); + param[paramIdx] += lr[0] * gradOut; + } +} + template <> template bool SparseAdamOp::DoRunWithType() { + Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); + Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); + Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2)); + auto N = Input(GRAD).size(); auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim()); const auto iter = @@ -133,24 +217,48 @@ bool SparseAdamOp::DoRunWithType() { const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) / (1.0f - std::pow(beta1_, iter + 1)); - SparseAdamKernel - <<>>( - N, - grad_slice_sz, - beta1_, - beta2_, - epsilon_, - Output(OUTPUT_PARAM)->template mutable_data(), - Output(OUTPUT_MOMENT_1)->template mutable_data(), - Output(OUTPUT_MOMENT_2)->template mutable_data(), - Input(INDICES).template data(), - Input(GRAD).template data(), - correction, - Input(LR).template data(), - iter); + if (OutputSize() == 3) { + SparseAdamKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + SparseAdamOutputGradKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Output(OUTPUT_GRAD)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } + return true; } From e125e61824a5287d602fd6317d6e13581b7682b3 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 11:02:48 -0700 Subject: [PATCH 194/237] Fix flake8 Summary: Fix flake8 Reviewed By: ezyang Differential Revision: D9873872 fbshipit-source-id: 26e81238f22caaeccd2c8b4f39cedb6cfb5520dd --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index ccfced5f35bc83..4364b19ffdc52d 100644 --- a/setup.py +++ b/setup.py @@ -426,6 +426,7 @@ class build_deps(PytorchCommand): def run(self): print('setup.py::build_deps::run()') # Check if you remembered to check out submodules + def check_file(f): if not os.path.exists(f): print("Could not find {}".format(f)) From 39520ffec15ab7e97691fed048de1832e83785e8 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 11:03:56 -0700 Subject: [PATCH 195/237] remove Type/Tensor/TensorMethods include order dependencies. (#11720) Summary: Previously, it was a necessity to include TensorMethods.h after Tensor.h in order to get the tensor method definitions. 
We abstracted this away from users by making sure ATen.h did this correctly; but we don't have any equivalent for ATen/core. In order to solve this dependency issue, we now forward declare Tensor in the Type declaration, which breaks the dependency cycle. Type.h now includes Tensor.h (for backwards compatibility) and Tensor.h now includes TensorMethods.h, so there is no longer include dependency restrictions. We could get rid of TensorMethods.h completely now, but that would involve coordinating a code generation change. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11720 Reviewed By: ezyang Differential Revision: D9841488 Pulled By: gchanan fbshipit-source-id: 1668199095e096c1790e646b5dc9f61ec1b33c0a --- aten/src/ATen/core/Tensor.h | 2 ++ aten/src/ATen/core/Type.h | 4 +++- aten/src/ATen/templates/Tensor.h | 2 ++ aten/src/ATen/templates/Type.h | 4 +++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index ca23584f870ce7..1b2c0f0e288264 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -684,3 +684,5 @@ struct AT_API WeakTensor { c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index dea7844e93debb..1366f899c30b84 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -8,7 +8,6 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/core/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" @@ -34,6 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; +struct Tensor; static inline void noop_deleter(void*) {} @@ -630,3 +630,5 @@ struct AT_API Type { }; } // namespace at + +#include "ATen/core/Tensor.h" diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 02bef7f89a6057..85e7c84961d6ee 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -298,3 +298,5 @@ struct AT_API WeakTensor { c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 8dda48749b0c97..0e00a5d3499fcd 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -8,7 +8,6 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/core/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" @@ -34,6 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; +struct Tensor; static inline void noop_deleter(void*) {} @@ -144,3 +144,5 @@ struct AT_API Type { }; } // namespace at + +#include "ATen/core/Tensor.h" From 47d65ed34fa6bec4c3038edf1ee5119347607664 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Mon, 17 Sep 2018 12:05:13 -0700 Subject: [PATCH 196/237] Fix issue 10492 (#11634) Summary: - pass infos vector by reference - checkErrors takes infos vector by reference - modified gesv tests to not cause infs or nans sporadically - also clean up error messages Reviewed By: ezyang Differential Revision: D9818550 Pulled By: soumith fbshipit-source-id: 00215205ff88767d6a5e921322394c5fd915d6d8 --- aten/src/ATen/native/Gesv.cpp | 5 +- aten/src/ATen/native/Gesv.h | 15 +++--- 
aten/src/ATen/native/cuda/Gesv.cu | 2 +- test/common.py | 65 ++++++++++++++++++++++++++ test/test_autograd.py | 76 ++++++------------------------- test/test_torch.py | 23 +++++----- 6 files changed, 101 insertions(+), 85 deletions(-) diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp index b45e2a4f98860e..dcb8a0964d2f90 100644 --- a/aten/src/ATen/native/Gesv.cpp +++ b/aten/src/ATen/native/Gesv.cpp @@ -44,7 +44,7 @@ template<> void lapackGesv( #endif template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_LAPACK AT_ERROR("gesv: LAPACK library not found in compilation"); #endif @@ -117,8 +117,7 @@ std::tuple gesv_out( Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) { if (self.dim() > 2 || A.dim() > 2) { AT_ERROR("torch.gesv() with the `out` keyword does not support batching. " - "b.dim() (%lld) and A.dim() (%lld) must both be 2.", - (long long)self.dim(), (long long)A.dim()); + "b.dim() (", self.dim(), ") and A.dim() (", A.dim(), ") must both be 2."); } return at::_gesv_single_out(solution, lu, self, A); } diff --git a/aten/src/ATen/native/Gesv.h b/aten/src/ATen/native/Gesv.h index 2d265520f3d21c..a3ba6ec1a8f127 100644 --- a/aten/src/ATen/native/Gesv.h +++ b/aten/src/ATen/native/Gesv.h @@ -5,26 +5,23 @@ namespace at { namespace native { static inline void checkInputs(const Tensor& self, const Tensor& A) { if (A.size(-1) != A.size(-2)) { AT_ERROR("A must be batches of square matrices, " - "but they are %lld by %lld matrices", + "but they are ", A.size(-1), " by ", A.size(-2), " matrices", (long long)A.size(-1), (long long)A.size(-2)); } if (A.size(-1) != self.size(-2)) { AT_ERROR("Incompatible matrix sizes for matmul: each A " - "matrix is %llu by %lld but each b matrix is %lld by %lld.", - (long long)A.size(-1), (long long)A.size(-1), - (long long)self.size(-2), (long long)self.size(-1)); + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); } } -static inline void checkErrors(std::vector infos) { +static inline void checkErrors(std::vector& infos) { for (size_t i = 0; i < infos.size(); i++) { auto info = infos[i]; if (info < 0) { - AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value", - (long long)i, -info); + AT_ERROR("gesv: For batch ", i, ": Argument ", -info, " has illegal value."); } else if (info > 0) { - AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.", - (long long)i, info, info); + AT_ERROR("gesv: For batch ", i, ": U(", info, ",", info, ") is zero, singular U."); } } } diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index bc37e83990e192..80c7aaeb74f6a8 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -84,7 +84,7 @@ static inline Storage pin_memory(int64_t size, Tensor dummy) { name = static_cast(storage_##name.data()); template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("gesv: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); diff --git a/test/common.py b/test/common.py index c1cd7999d82901..c415504e94656d 100644 --- a/test/common.py +++ b/test/common.py @@ -593,3 +593,68 @@ def find_free_port(): sockname = sock.getsockname() sock.close() return sockname[1] + + +# Methods for matrix generation +# Used in test_autograd.py and test_torch.py +def prod_single_zero(dim_size): + result = torch.randn(dim_size, dim_size) + result[0, 1] = 0 + return result + + +def random_square_matrix_of_rank(l, rank): + assert rank <= l + A = torch.randn(l, l) + u, s, v = A.svd() + for i in range(l): + if i >= rank: + s[i] = 0 + elif s[i] == 0: + s[i] = 1 + return u.mm(torch.diag(s)).mm(v.transpose(0, 1)) + + +def random_symmetric_matrix(l): + A = torch.randn(l, l) + for i in range(l): + for j in range(i): + A[i, j] = A[j, i] + return A + + +def random_symmetric_psd_matrix(l): + A = torch.randn(l, l) + return A.mm(A.transpose(0, 1)) + + +def random_symmetric_pd_matrix(l, eps=1e-5): + A = torch.randn(l, l) + return A.mm(A.transpose(0, 1)) + torch.eye(l) * eps + + +def make_nonzero_det(A, sign=None, min_singular_value=0.1): + u, s, v = A.svd() + s[s < min_singular_value] = min_singular_value + A = u.mm(torch.diag(s)).mm(v.t()) + det = A.det().item() + if sign is not None: + if (det < 0) ^ (sign < 0): + A[0, :].neg_() + return A + + +def random_fullrank_matrix_distinct_singular_value(l, *batches): + if len(batches) == 0: + A = torch.randn(l, l) + u, _, v = A.svd() + s = torch.arange(1., l + 1).mul_(1.0 / (l + 1)) + return u.mm(torch.diag(s)).mm(v.t()) + else: + all_matrices = [] + for _ in range(0, torch.prod(torch.as_tensor(batches)).item()): + A = torch.randn(l, l) + u, _, v = A.svd() + s = torch.arange(1., l + 1).mul_(1.0 / (l + 1)) + all_matrices.append(u.mm(torch.diag(s)).mm(v.t())) + return torch.stack(all_matrices).reshape(*(batches + (l, l))) diff --git a/test/test_autograd.py b/test/test_autograd.py index 4c4140e81114a7..296eead471e558 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -14,8 +14,12 @@ from torch.autograd.gradcheck import gradgradcheck, gradcheck from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile -from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \ - suppress_warnings, skipIfRocm +from common import (TEST_MKL, TestCase, run_tests, skipIfNoLapack, + suppress_warnings, skipIfRocm, + prod_single_zero, random_square_matrix_of_rank, + random_symmetric_matrix, random_symmetric_psd_matrix, + random_symmetric_pd_matrix, make_nonzero_det, + random_fullrank_matrix_distinct_singular_value) from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction from torch.testing import make_non_contiguous, randn_like @@ -2595,60 +2599,6 @@ def prod_zeros(dim_size, dim_select): return result -def prod_single_zero(dim_size): - result = torch.randn(dim_size, dim_size) - result[0, 1] = 0 - return result - - -def random_square_matrix_of_rank(l, rank): - assert rank <= l - A = torch.randn(l, l) - u, s, v = A.svd() - for i in range(l): - if i >= rank: - s[i] = 0 - elif s[i] == 0: - s[i] = 1 - return u.mm(torch.diag(s)).mm(v.transpose(0, 1)) - - -def random_symmetric_matrix(l): - A = torch.randn(l, l) - for i in range(l): - for j in range(i): - A[i, j] = A[j, i] - return A - - -def random_symmetric_psd_matrix(l): - A = torch.randn(l, l) - return A.mm(A.transpose(0, 1)) - - -def random_symmetric_pd_matrix(l, eps=1e-5): - A = torch.randn(l, l) - return A.mm(A.transpose(0, 1)) + 
torch.eye(l) * eps - - -def make_nonzero_det(A, sign=None, min_singular_value=0.1): - u, s, v = A.svd() - s[s < min_singular_value] = min_singular_value - A = u.mm(torch.diag(s)).mm(v.t()) - det = A.det().item() - if sign is not None: - if (det < 0) ^ (sign < 0): - A[0, :].neg_() - return A - - -def random_fullrank_matrix_distinct_singular_value(l): - A = torch.randn(l, l) - u, _, v = A.svd() - s = torch.arange(1., l + 1).mul_(1.0 / (l + 1)) - return u.mm(torch.diag(s)).mm(v.t()) - - def uniform_scalar(offset=0, requires_grad=False): v = torch.rand(()) + offset v.requires_grad = requires_grad @@ -3180,11 +3130,15 @@ class dont_convert(tuple): 'tall_all', NO_ARGS, [skipIfNoLapack], lambda usv: (usv[0][:, :(S - 2)], usv[1], usv[2])), ('svd', lambda: random_fullrank_matrix_distinct_singular_value(M), NO_ARGS, 'large', NO_ARGS, [skipIfNoLapack]), - ('gesv', (S, S), ((S, S),), '', NO_ARGS, [skipIfNoLapack]), - ('gesv', (S, S, S), ((S, S, S),), 'batched', NO_ARGS, [skipIfNoLapack, skipIfRocm]), - ('gesv', (2, 3, S, S), ((2, 3, S, S),), 'batched_dims', NO_ARGS, [skipIfNoLapack, skipIfRocm]), - ('gesv', (2, 2, S, S), ((1, S, S),), 'batched_broadcast_A', NO_ARGS, [skipIfNoLapack, skipIfRocm]), - ('gesv', (1, S, S), ((2, 2, S, S),), 'batched_broadcast_b', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (S, S), (random_fullrank_matrix_distinct_singular_value(S),), '', NO_ARGS, [skipIfNoLapack]), + ('gesv', (S, S, S), (random_fullrank_matrix_distinct_singular_value(S, S),), + 'batched', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (2, 3, S, S), (random_fullrank_matrix_distinct_singular_value(S, 2, 3),), + 'batched_dims', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (2, 2, S, S), (random_fullrank_matrix_distinct_singular_value(S, 1),), + 'batched_broadcast_A', NO_ARGS, [skipIfNoLapack, skipIfRocm]), + ('gesv', (1, S, S), (random_fullrank_matrix_distinct_singular_value(S, 2, 2),), + 'batched_broadcast_b', NO_ARGS, [skipIfNoLapack, skipIfRocm]), ('fill_', (S, S, S), (1,), 'number'), ('fill_', (), (1,), 'number_scalar'), # FIXME: we should compute the derivative w.r.t torch.tensor(1) diff --git a/test/test_torch.py b/test/test_torch.py index cf938d4d0f50d7..fbabc12d32e051 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3997,8 +3997,9 @@ def test_gesv(self): @staticmethod def _test_gesv_batched(self, cast): + from common import random_fullrank_matrix_distinct_singular_value as fullrank # test against gesv: one batch - A = cast(torch.randn(1, 5, 5)) + A = cast(fullrank(5, 1)) b = cast(torch.randn(1, 5, 10)) x_exp, LU_exp = torch.gesv(b.squeeze(0), A.squeeze(0)) x, LU = torch.gesv(b, A) @@ -4006,7 +4007,7 @@ def _test_gesv_batched(self, cast): self.assertEqual(LU, LU_exp.unsqueeze(0)) # test against gesv in a loop: four batches - A = cast(torch.randn(4, 5, 5)) + A = cast(fullrank(5, 4)) b = cast(torch.randn(4, 5, 10)) x_exp_list = list() @@ -4023,7 +4024,7 @@ def _test_gesv_batched(self, cast): self.assertEqual(LU, LU_exp) # basic correctness test - A = cast(torch.randn(3, 5, 5)) + A = cast(fullrank(5, 3)) b = cast(torch.randn(3, 5, 10)) x, LU = torch.gesv(b, A) self.assertEqual(torch.matmul(A, x), b) @@ -4033,7 +4034,7 @@ def _test_gesv_batched(self, cast): return import numpy from numpy.linalg import solve - A = cast(torch.randn(2, 2, 2)).permute(1, 0, 2) + A = cast(fullrank(2, 2)).permute(1, 0, 2) b = cast(torch.randn(2, 2, 2)).permute(2, 1, 0) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) @@ -4048,18 +4049,18 @@ def 
_test_gesv_batched_dims(self, cast): if not TEST_NUMPY: return - import numpy from numpy.linalg import solve + from common import random_fullrank_matrix_distinct_singular_value as fullrank # test against numpy.linalg.solve - A = cast(torch.randn(2, 1, 3, 4, 4)) + A = cast(fullrank(4, 2, 1, 3)) b = cast(torch.randn(2, 1, 3, 4, 6)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) self.assertEqual(x.data, cast(x_exp)) # test column major format - A = cast(torch.randn(2, 1, 3, 4, 4)).transpose(-2, -1) + A = cast(fullrank(4, 2, 1, 3)).transpose(-2, -1) b = cast(torch.randn(2, 1, 3, 6, 4)).transpose(-2, -1) assert not A.is_contiguous() assert not b.is_contiguous() @@ -4068,21 +4069,21 @@ def _test_gesv_batched_dims(self, cast): self.assertEqual(x.data, cast(x_exp)) # broadcasting b - A = cast(torch.randn(2, 1, 3, 4, 4)) + A = cast(fullrank(4, 2, 1, 3)) b = cast(torch.randn(4, 6)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) self.assertEqual(x.data, cast(x_exp)) # broadcasting A - A = cast(torch.randn(4, 4)) + A = cast(fullrank(4)) b = cast(torch.randn(2, 1, 3, 4, 2)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) self.assertEqual(x.data, cast(x_exp)) # broadcasting both A & b - A = cast(torch.randn(1, 3, 1, 4, 4)) + A = cast(fullrank(4, 1, 3, 1)) b = cast(torch.randn(2, 1, 3, 4, 5)) x, _ = torch.gesv(b, A) x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) @@ -4663,7 +4664,7 @@ def run_test(M, sign=1): # Single matrix, but full rank # This is for negative powers - from test_autograd import random_fullrank_matrix_distinct_singular_value + from common import random_fullrank_matrix_distinct_singular_value M = conv_fn(random_fullrank_matrix_distinct_singular_value(5)) run_test(M) run_test(M, sign=-1) From 73738ec570fa907045e89dd573d2fd945aa2585c Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Mon, 17 Sep 2018 12:09:16 -0700 Subject: [PATCH 197/237] bump version to 1.0 (#11717) Summary: I'm just doing the honors and bumping the version to 1.0.0. 1.0 preview and RC releases will have the 1.0.0.dev{date} tag Pull Request resolved: https://github.com/pytorch/pytorch/pull/11717 Reviewed By: SsnL Differential Revision: D9840857 Pulled By: soumith fbshipit-source-id: 4c9c2e01dccb3c521dab26c49e1569d970a87ace --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4364b19ffdc52d..b61251c46c34fa 100644 --- a/setup.py +++ b/setup.py @@ -283,7 +283,7 @@ def patched_link(self, *args, **kwargs): # Version, create_version_file, and package_name ################################################################################ package_name = os.getenv('TORCH_PACKAGE_NAME', 'torch') -version = '0.5.0a0' +version = '1.0.0a0' if os.getenv('PYTORCH_BUILD_VERSION'): assert os.getenv('PYTORCH_BUILD_NUMBER') is not None build_number = int(os.getenv('PYTORCH_BUILD_NUMBER')) From 336323f53c21fb68c884fd071825ac31914300a0 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Mon, 17 Sep 2018 13:28:27 -0700 Subject: [PATCH 198/237] return aten::gt to the list of fusable operations, add expected graphs (#11150) Summary: Fixes one of #11118 issues. 
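With aten::gt back in the fusable set, a comparison used to build a mask no
longer splits the fusion group. As a minimal sketch (mirroring the pattern of
the gt/lt and eq/ne tests touched by this patch, not a new API), something like

    import torch

    def f(x, y):
        mask = (x > 0).type_as(x)
        return x * mask + y

    x = torch.randn(4, 4, device='cuda')
    y = torch.randn(4, 4, device='cuda')
    traced = torch.jit.trace(f, (x, y))
    traced(x, y)
    # traced.graph_for(x, y) should now show a single prim::FusionGroup

should compile down to one fusion group on CUDA, which the new assertAllFused
helper verifies by checking that the optimized graph contains exactly one
prim::FusionGroup (plus constants).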
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11150 Differential Revision: D9861372 Pulled By: apaszke fbshipit-source-id: 98b196b89e991d3936360b30568360367fd32e8b --- test/test_jit.py | 23 +++++++++++++++++++++ torch/csrc/jit/fusers/cuda/fused_kernel.cpp | 8 ++++++- torch/csrc/jit/passes/graph_fuser.cpp | 3 +++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/test/test_jit.py b/test/test_jit.py index b715045b30a27c..6612a8fbebf6fb 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -365,6 +365,10 @@ def allSum(vs): return ge + def assertAllFused(self, graph): + self.assertTrue(all(node.kind() in {'prim::Constant', 'prim::FusionGroup'} for node in graph.nodes())) + self.assertTrue([node.kind() for node in graph.nodes()].count('prim::FusionGroup') == 1) + def assertExportImport(self, trace, inputs): graph = trace if isinstance(trace, torch._C.Graph) else trace.graph() m = torch.jit.ScriptModule() @@ -766,6 +770,7 @@ def test_comparison_gt_lt_cuda(self): y = torch.randn(4, 4, dtype=torch.float, device='cuda') ge = self.checkTrace(self.fn_test_comparison_gt_lt, (x, y)) + self.assertAllFused(ge.graph_for(x, y)) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @@ -782,6 +787,24 @@ def f(x, y): y = torch.randn(4, 4, dtype=torch.float, device='cuda') ge = self.checkTrace(f, (x, y)) + self.assertAllFused(ge.graph_for(x, y)) + + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm + def test_comparison_eq_ne(self): + def f(x, y): + mask = (x == 0).type_as(x) + z = x * mask + y + mask = (x != 0).type_as(x) + z = z * mask + y + return z + + x = torch.randn(4, 4, dtype=torch.float, device='cuda') + y = torch.randn(4, 4, dtype=torch.float, device='cuda') + + ge = self.checkTrace(f, (x, y)) + self.assertAllFused(ge.graph_for(x, y)) @staticmethod def fn_test_relu(x, y): diff --git a/torch/csrc/jit/fusers/cuda/fused_kernel.cpp b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp index 90624307123b56..4067e6cc7bba7e 100644 --- a/torch/csrc/jit/fusers/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/cuda/fused_kernel.cpp @@ -65,7 +65,13 @@ CUDAFusedKernel::CUDAFusedKernel( TORCH_NVRTC_CHECK(nvrtcGetPTXSize(program, &ptx_size)); ptx.resize(ptx_size); TORCH_NVRTC_CHECK(nvrtcGetPTX(program, ptx.data())); - + CUcontext pctx = 0; + TORCH_CU_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + std::unique_lock cudaFreeMutexLock( + *(THCCachingAllocator_getCudaFreeMutex())); + cudaFree(0); + } TORCH_CU_CHECK(cuModuleLoadData(&module, ptx.data())); TORCH_CU_CHECK(cuModuleGetFunction(&function, module, name.c_str())); diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index e8123690789b24..4d69ed57e63786 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -228,6 +228,9 @@ struct GraphFuser { node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || node->matches("aten::le(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || node->matches("aten::le(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || + node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::gt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || + node->matches("aten::gt(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || 
node->matches("aten::ge(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || From 29610621ec511cd543bc809b40620852b46c56ff Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Mon, 17 Sep 2018 13:40:52 -0700 Subject: [PATCH 199/237] 64B align for avx512 (#11748) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11748 For avx512, we need to align at a multiple of 64B not 32B Regardless of avx512, it's in general a good idea to be cache line aligned. Reviewed By: ilia-cher Differential Revision: D9845056 fbshipit-source-id: b1d3ed67749c0c1a64acd5cc230a1279e8023512 --- caffe2/core/allocator.h | 4 ++-- caffe2/python/hypothesis_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 1e9be9f50b39d8..96bc720ccd59d1 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -11,8 +11,8 @@ CAFFE2_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill); namespace caffe2 { -// Use 32-byte alignment should be enough for computation up to AVX512. -constexpr size_t gCaffe2Alignment = 32; +// Use 64-byte alignment should be enough for computation up to AVX512. +constexpr size_t gCaffe2Alignment = 64; using MemoryDeleter = void (*)(void*); diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 64174f6e71c676..5d41fdfb18262b 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -2229,7 +2229,7 @@ def ref_nhwc(x, scale, bias): in_place=st.booleans(), **hu.gcs) def test_unsafe_coalesce(self, sizes, in_place, gc, dc): - gAlignment = 32 + gAlignment = 64 Xs = [np.random.randn(size) .astype(np.random.choice([np.float32, np.float64, np.uint8])) for size in sizes] From 7671f4ab1c921888554bef92d1cee55785bcf232 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Mon, 17 Sep 2018 14:03:14 -0700 Subject: [PATCH 200/237] Add `math` to scope when using inf in tests (#11302) Summary: This fixes #8515 which was mostly issues in the test themselves. As long as `math` is imported in the scope in which the script runs it resolves to a `prim::Constant` with value `inf` correctly. This PR adds this to the `test_jit.py` tests involving `inf` and adds a test to demonstrate `inf` in a non-generated test. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11302 Differential Revision: D9684336 Pulled By: driazati fbshipit-source-id: 73df2848dfdb45ab50690a7c88df8fda269a64eb --- test/test_jit.py | 23 +++++++++++++++++++---- torch/csrc/jit/export.cpp | 2 ++ torch/csrc/jit/import.cpp | 2 ++ torch/csrc/jit/interned_strings.h | 1 + torch/csrc/jit/ir.h | 6 ++++++ torch/csrc/jit/register_prim_ops.cpp | 17 +++++++++++++++++ torch/csrc/jit/script/compiler.cpp | 2 ++ 7 files changed, 49 insertions(+), 4 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 6612a8fbebf6fb..d91b5b4653d57c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10,6 +10,7 @@ from torch.autograd.function import traceable from torch.testing import assert_allclose from torch.onnx import OperatorExportTypes +from torch._six import inf, PY2 from common import TestCase, run_tests, IS_WINDOWS, TEST_WITH_UBSAN, skipIfRocm, suppress_warnings from textwrap import dedent import os @@ -56,7 +57,6 @@ RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1 -PY2 = sys.version_info[0] == 2 PY35 = sys.version_info >= (3, 5) WINDOWS = sys.platform == 'win32' @@ -2621,6 +2621,13 @@ def foo(a): s = Variable(torch.rand(2)) self.assertEqual(s + s + s, foo(s)) + def test_inf(self): + @torch.jit.script + def foo(a): + return a < float('inf') + s = torch.rand(1) + self.assertTrue(foo(s)) + def test_add(self): def func(a, b): c = a + b @@ -7549,8 +7556,6 @@ def forward(self, x, y): 'test_var_dim_1d', 'test_var_dim_1d_neg0', 'test_var_dim_neg0', - 'test_norm_inf', - 'test_renorm_norm_inf', 'test_matrix_power_n=-1', # involves inverse 'test_matrix_power_n=-3', # involves inverse # skipped nn functional tests @@ -7638,6 +7643,12 @@ def the_method({}): ''' +def get_constant(x): + if x == inf or x == -inf: + return 'float(\'inf\')' if PY2 else 'math.inf' + return x + + # create a script function from (name, func_type, output_process_fn), # returns a function takes in (args, kwargs) and runs the compiled function and # then applies the post process fn to the outputs @@ -7653,7 +7664,7 @@ def script_fn(*args, **kwargs): actuals.append(name) tensors.append(arg) else: - actuals.append(str(arg)) + actuals.append(str(get_constant(arg))) kwargs_str = '' for k, v in kwargs.items(): kwargs_str += ', ' + k + '=' + str(v) @@ -7667,6 +7678,10 @@ def script_fn(*args, **kwargs): raise 'Unsupported function type' script = script_template.format(', '.join(formals), call) + + # for math.inf + import math + CU = torch.jit.CompilationUnit(script) self.assertExportImport(CU.the_method.graph, tensors) output = output_process_fn(CU.the_method(*tensors)) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 0740d2e88a2c8a..3a0500b765dead 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -559,6 +559,8 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("NoneType"); } else if (kind == TypeKind::GeneratorType) { type_proto->set_denotation("GeneratorType"); + } else if (kind == TypeKind::StringType) { + type_proto->set_denotation("StringType"); } else { throw std::runtime_error("unexpected type kind"); } diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 8e6a07da0a00ca..84ffdd1c265466 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -260,6 +260,8 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return NoneType::get(); } else if (kind == "GeneratorType") { return GeneratorType::get(); + }else if (kind == "StringType") { 
+ return StringType::get(); } else { throw std::runtime_error("unexpected string for type kind"); } diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 60e086db187472..e1d76dde56c59d 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -51,6 +51,7 @@ namespace torch { namespace jit { _(prim, ImplicitTensorToNum) \ _(prim, IntToFloat) \ _(prim, FloatToInt) \ + _(prim, StringToFloat) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 58e702b771acaf..fd730adb69570a 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -1066,6 +1066,12 @@ friend struct Block; result->output()->setType(IntType::get()); return result; } + Node* createStringToFloat(Value* value) { + JIT_ASSERT(*value->type() == *StringType::get()); + auto* result = create(prim::StringToFloat, {value}); + result->output()->setType(FloatType::get()); + return result; + } Node* createPythonOp( THPObjectPtr&& pyobj, const std::string& cconv, diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 42184ac318ad58..1f8618121f1e28 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -13,10 +13,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -134,6 +136,21 @@ RegisterOperators reg({ return 0; }; }), + Operator( + prim::StringToFloat, + [](Node* node) -> Operation { + return [](Stack& stack) { + auto s = pop(stack).toString(); + if (s->string() != "inf") { + AT_ERROR( + "Only 'inf' can be cast to a float, but got '", + s->string(), + "'"); + } + push(stack, std::numeric_limits::infinity()); + return 0; + }; + }), Operator( prim::Undefined, [](Node* node) { diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index fc7bf068f20998..b66b96dd5eb6fb 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -76,6 +76,8 @@ static Value* typeCast(const SourceRange& loc, Value* value, TypePtr dst) { n = graph.createFloatToInt(value); } else if(dst->isSubtypeOf(FloatType::get()) && orig->isSubtypeOf(IntType::get())) { n = graph.createIntToFloat(value); + } else if(dst->isSubtypeOf(FloatType::get()) && orig->isSubtypeOf(StringType::get())) { + n = graph.createStringToFloat(value); } else { throw ErrorReport(loc) << "Cannot cast type '" << orig->str() << "' to type '" << dst->str() << "'."; From 7df6650e9c23c507137b9af7ffb94053b8a86a2d Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Mon, 17 Sep 2018 14:36:18 -0700 Subject: [PATCH 201/237] Fix empty embedding bag on cuda (#11740) Summary: Fixes https://github.com/pytorch/pytorch/issues/11739 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11740 Differential Revision: D9881392 Pulled By: SsnL fbshipit-source-id: 2964d314f199dd9b4bb69e36592b67efdf5e0760 --- aten/src/ATen/native/cuda/EmbeddingBag.cu | 18 ++++++++++++++---- test/test_nn.py | 21 +++++++++------------ torch/csrc/autograd/engine.cpp | 11 ++++++----- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 853c04deb2215c..afa4c8e1916604 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -73,14 +73,22 @@ __global__ void EmbeddingBag_updateOutputKernel( } } if (mode == MODE_MEAN) { - weightFeatSum = weightFeatSum / 
static_cast(bag_size_); - bag_size[bag] = bag_size_; + if (end == begin) { + bag_size[bag] = 0; + } else { + weightFeatSum = weightFeatSum / static_cast(bag_size_); + bag_size[bag] = bag_size_; + } } if (mode == MODE_MEAN || mode == MODE_SUM) { output[bag * featureSize + featureDim] = static_cast(weightFeatSum); } else if (mode == MODE_MAX) { + if (end == begin) { + // If bag is empty, set output to 0. + weightFeatMax = 0; + } max_indices[bag * featureSize + featureDim] = maxWord; output[bag * featureSize + featureDim] = weightFeatMax; } @@ -268,8 +276,10 @@ __global__ void EmbeddingBag_accGradParametersKernel_max( int64_t bag = chunk / chunksPerBag; int64_t word_idx = max_indices[bag * stride + featureDim]; - - atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + if (word_idx >= 0) { + // If bag is empty, we have max_indices[idx] set to -1 in forward. + atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + } } } } diff --git a/test/test_nn.py b/test/test_nn.py index ac2496620c3977..29e7c6ba8ebcc9 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1956,8 +1956,7 @@ def _test_EmbeddingBag(self, cuda, mode, sparse, dtype=torch.double): input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=torch.long) # Empty list is only handled in CPU for now - offsets = torch.tensor([0, 3], device=device, dtype=torch.long) if cuda \ - else torch.tensor([0, 0, 3, 3, 6], device=device, dtype=torch.long) + offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=torch.long) grad_output = torch.tensor( [1, 2, @@ -2008,33 +2007,31 @@ def _test_EmbeddingBag(self, cuda, mode, sparse, dtype=torch.double): [3, 4]], device=device, dtype=dtype) output = es(input, offsets) - output.backward(grad_output if cuda else grad_output_with_empty) + output.backward(grad_output_with_empty) es_weight_grad = es.weight.grad.data if sparse: - es_weight_grad = es.weight.grad.data.to_dense() - self.assertEqual( - output.data, - expected_output if cuda else expected_output_with_empty) + es_weight_grad = es.weight.grad.to_dense() + self.assertEqual(output, expected_output_with_empty) self.assertEqual(es_weight_grad, expected_grad_weight, dtype2prec[dtype]) # check same example except as 2D (2 x 3) - input = input.data.view(2, -1) + input = input.view(2, -1) es.zero_grad() output = es(input) output.backward(grad_output) - es_weight_grad = es.weight.grad.data + es_weight_grad = es.weight.grad if sparse: - es_weight_grad = es.weight.grad.data.to_dense() - self.assertEqual(output.data, expected_output) + es_weight_grad = es.weight.grad.to_dense() + self.assertEqual(output, expected_output) self.assertEqual(es_weight_grad, expected_grad_weight, dtype2prec[dtype]) # now compare EmbeddingBag vs Embedding + Sum/Mean, for constant bag length def _test_vs_Embedding(N, D, B, L, max_norm=None): es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse, max_norm=max_norm).to(device, dtype) e = nn.Embedding(N, D, max_norm=max_norm).to(device, dtype) - e.weight.data.copy_(es.weight.data) + e.weight.data.copy_(es.weight) input = torch.randint(N, (B, L), device=device, dtype=torch.long) offsets = torch.arange(0, B, device=device, dtype=torch.long).mul_(L) grad_output = torch.rand(B, D, device=device, dtype=dtype) diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index a8dc01fa5ec3a8..d0ecc017b42b50 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -8,6 +8,7 @@ #include #include +#include 
#include #include @@ -302,7 +303,7 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const std::stringstream ss; ss << "invalid number of gradients - expected "; ss << edges.size() << ", but got " << grads.size(); - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } for (size_t i = 0; i < grads.size(); i++) { const auto& edge = edges[i]; @@ -314,7 +315,7 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const // FIXME: TestJit.test_ge_optimized fails this assertion. // std::stringstream ss; // ss << "undefined gradient at index " << i; - // throw std::runtime_error(format_error(ss.str())); + // AT_ERROR(format_error(ss.str())); continue; } if (!grads[i].sizes().equals(metadata.shape())) { @@ -323,7 +324,7 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const ss << "invalid gradient at index " << i << " - got "; ss << grads[i].sizes() << " but expected shape compatible with "; ss << metadata.shape(); - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } grads[i] = at::sum_to(grads[i], metadata.shape()); } @@ -331,14 +332,14 @@ static void validate_outputs(const edge_list& edges, variable_list& grads, const std::stringstream ss; ss << "invalid gradient at index " << i << " - expected type "; ss << metadata.type() << " but got " << grads[i].type(); - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } const auto output_device = output.is_cuda() ? output.get_device() : -1; if (output_device != metadata.device()) { std::stringstream ss; ss << "invalid gradient at index " << i << " - expected device "; ss << metadata.device() << " but got " << output_device; - throw std::runtime_error(format_error(ss.str())); + AT_ERROR(format_error(ss.str())); } } } From 3ce17bf8f6a2c4239085191ea60d6ee51cd620a5 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Mon, 17 Sep 2018 15:22:13 -0700 Subject: [PATCH 202/237] Generate ATen/core to source if env GEN_TO_SOURCE is set. (#11759) Summary: It is currently tedious to change code generation because it takes two steps: change the code gen, then gen.py fails because of file mismatch. Just add an environment option of generating directly to source. 
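For reference, a hypothetical invocation (only the GEN_TO_SOURCE variable comes from this change; the rest of the gen.py command line is whatever the build already passes and is elided here):

```
# Any non-empty value of GEN_TO_SOURCE makes gen.py write the generated
# ATen/core files directly into the source tree (source_path/core) instead
# of the core_tmp staging directory, so codegen changes can be iterated on
# without the file-mismatch failure described above.
GEN_TO_SOURCE=1 python aten/src/ATen/gen.py <usual gen.py arguments>
```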
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11759 Differential Revision: D9867259 Pulled By: gchanan fbshipit-source-id: 3cf8024d9e302f382cf8b8a44cb843fb086f8597 --- aten/src/ATen/gen.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 66444e1837c540..3112e5ff0424ab 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -41,7 +41,11 @@ parser.add_argument( '-d', '--install_dir', help='output directory', default='ATen') options = parser.parse_args() -core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None +gen_to_source = os.environ.get('GEN_TO_SOURCE') # update source directly as part of gen +if not gen_to_source: + core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None +else: + core_install_dir = os.path.join(options.source_path, 'core') if options.install_dir is not None and not os.path.exists(options.install_dir): os.makedirs(options.install_dir) @@ -50,8 +54,8 @@ class FileManager(object): - def __init__(self, prefix=None): - self.install_dir = '{}/{}'.format(options.install_dir, prefix) if prefix else options.install_dir + def __init__(self, install_dir=None): + self.install_dir = install_dir if install_dir else options.install_dir self.filenames = set() self.outputs_written = False self.undeclared_files = [] @@ -133,7 +137,7 @@ def check_all_files_written(self): context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) -core_file_manager = FileManager('core_tmp') +core_file_manager = FileManager(core_install_dir) file_manager = FileManager() cuda_file_manager = FileManager() From ca5def1b8fc4c5e23bd32dce4cb247df6d4bb7c7 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 17 Sep 2018 16:28:29 -0700 Subject: [PATCH 203/237] Expose annotations (#11649) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11649 Putting annotations in python interface Reviewed By: duc0 Differential Revision: D9784750 fbshipit-source-id: d877c886ac52559ca3f009a1fd848dd1779b7d04 --- .../nomnigraph/Representations/NeuralNet.h | 3 --- caffe2/opt/converter.cc | 15 +++++++++++++++ caffe2/opt/converter.h | 17 +++++++++++++++-- caffe2/python/nomnigraph.py | 1 + caffe2/python/nomnigraph_test.py | 16 ++++++++++++++++ caffe2/python/pybind_state_nomni.cc | 17 ++++++++++++++++- 6 files changed, 63 insertions(+), 6 deletions(-) diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 09273a5fe15981..3abcb2d1057ae9 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -53,9 +53,6 @@ class CAFFE2_API Annotation { return kind_; } - Annotation(const Annotation&) = delete; - Annotation& operator=(Annotation&) = delete; - private: const AnnotationKind kind_; }; diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 80e2308eabf3cd..3d0f2ce2aa643a 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -379,6 +379,21 @@ caffe2::OperatorDef convertToOperatorDef( return op; } +Caffe2Annotation getOrAddCaffe2Annotation( + nom::repr::NNGraph::NodeRef& instrNode) { + auto* nnOp = repr::nn::get(instrNode); + auto* annotation = nnOp->getAnnotation(); + if (!annotation) { + auto new_annot = util::make_unique(); + 
new_annot->setOperatorDef(convertToOperatorDef(instrNode)); + nnOp->setAnnotation(std::move(new_annot)); + annotation = nnOp->getAnnotation(); + } + CAFFE_ENFORCE(isa(annotation)); + auto c2_annotation = dyn_cast(annotation); + return *c2_annotation; +} + caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m) { auto predictNet = caffe2::NetDef(); return convertToCaffe2Proto(m, predictNet); diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index 31281ab90572ec..fdfa7f9ab37583 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -19,8 +19,12 @@ class Caffe2Annotation : public nom::repr::Annotation { : Annotation(AnnotationKind::Caffe2), Device(device) {} virtual ~Caffe2Annotation() {} - void setDevice(std::string device) { Device = device; } - const std::string getDevice() const { return Device; } + void setDevice(std::string device) { + Device = device; + } + const std::string getDevice() const { + return Device; + } void setDeviceType(int device) { DeviceType = device; @@ -33,6 +37,11 @@ class Caffe2Annotation : public nom::repr::Annotation { OpDef = opDef; OpDefExists = true; } + + bool hasOperatorDef() const { + return OpDefExists; + } + const caffe2::OperatorDef& getOperatorDef() const { CAFFE_ENFORCE( OpDefExists, @@ -73,6 +82,10 @@ CAFFE2_API std::unique_ptr convertToNeuralNetOpera CAFFE2_API caffe2::OperatorDef convertToOperatorDef( const nom::repr::NNGraph::NodeRef& instrNode); +// If the annotation doesn't exist, attempt to add it +CAFFE2_API Caffe2Annotation +getOrAddCaffe2Annotation(nom::repr::NNGraph::NodeRef& instrNode); + class CAFFE2_API Converter { public: explicit Converter() = default; diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index 708eae6b2a071c..417ad64ccc2bb2 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -72,3 +72,4 @@ def render(s): NNSubgraph = C.NNSubgraph NNMatchGraph = C.NNMatchGraph Graph = C.Graph +Annotation = C.Annotation diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 7739ac05f2979f..1956d2e9f01b68 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -174,3 +174,19 @@ def test_convertToProto(self): assert a == b for a, b in zip(new_netdef.external_output, net.Proto().external_output): assert a == b + + def test_annotation_basic(self): + annot = ng.Annotation() + annot.setDevice("woot") + assert annot.getDevice() == "woot" + annot.setDeviceType(7) + assert annot.getDeviceType() == 7 + + def test_annotation_from_graph(self): + nn = ng.NNModule() + node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) + annot = node.getAnnotation() + annot.setDeviceType(7) + node.setAnnotation(annot) + new_annot = node.getAnnotation() + assert new_annot.getDeviceType() == 7 diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index a8cbb661d69c21..fa7baf60c92541 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -198,7 +198,14 @@ void addNomnigraphMethods(pybind11::module& m) { CAFFE_ENFORCE(nn::is(n)); return nn::get(n); }, - py::return_value_policy::reference_internal); + py::return_value_policy::reference_internal) + .def( + "getAnnotation", + [](NNGraph::NodeRef n) { return getOrAddCaffe2Annotation(n); }) + .def("setAnnotation", [](NNGraph::NodeRef n, Caffe2Annotation annot) { + auto* nnOp = nn::get(n); + nnOp->setAnnotation(nom::util::make_unique(annot)); + }); py::class_ nnop(m, "NeuralNetOperator"); py::class_ nndata(m, 
"NeuralNetData"); @@ -282,6 +289,14 @@ void addNomnigraphMethods(pybind11::module& m) { } return NNSubgraph(); }); + + // Annotation API + py::class_ annotation(m, "Annotation"); + annotation.def(py::init<>()) + .def("setDevice", &Caffe2Annotation::setDevice) + .def("getDevice", &Caffe2Annotation::getDevice) + .def("setDeviceType", &Caffe2Annotation::setDeviceType) + .def("getDeviceType", &Caffe2Annotation::getDeviceType); } REGISTER_PYBIND_ADDITION(addNomnigraphMethods); From 3819d25418b30955880cbd666e63ae69274cd77d Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 17 Sep 2018 17:25:22 -0700 Subject: [PATCH 204/237] Clean up converter and accept less-valid networks Summary: Cleaning up converter.cc and allowing networks that have "pass through" inputs (that are also outputs but aren't actually consumed by the network) Reviewed By: duc0 Differential Revision: D9759435 fbshipit-source-id: 1ddfcc60a1b865a06682e4022230dfecc4b89ec3 --- caffe2/opt/converter.cc | 36 +++++++++++++++------------ caffe2/opt/converter.h | 2 +- caffe2/python/transformations_test.py | 4 +-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 3d0f2ce2aa643a..f9956060b75cfd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -264,8 +264,7 @@ std::unique_ptr convertToNeuralNetOperator( /// \brief Ingest a caffe2 protobuf model and output an NNModule. /// \param net The caffe2 protobuf NetDef -/// \param blobMap [optional][output] A pointer to a blobMap to be populated with all the output blobs of the NetDef by name->NodeRef -repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut) { +repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict) { repr::NNModule module; repr::NNGraph& dfg = module.dataFlow; repr::NNCFGraph& cfg = module.controlFlow; @@ -285,7 +284,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map> currentBasicBlock = auto bbNode = cfg.createNode(util::make_unique>()); @@ -323,17 +321,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map(input)); + } + } } for (const auto& outputName : net.external_output()) { @@ -345,9 +352,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 1a579b519fe09c..383b8410ea6ae2 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -396,8 +396,8 @@ def test_converterEnforceUnusedInputs(self): net = core.Net("net") net.Relu(["X"], ["Y"]) net.Proto().external_input.extend(["fake"]) - with self.assertRaises(Exception): - transformer.AddNNPACK(net) # just testing the converter + # This should now work + transformer.AddNNPACK(net) # just testing the converter def test_converterEnforceUnusedOutputs(self): net = core.Net("net") From 7d0657f13cc285a3e0dec2a6f35420af77adafa4 Mon Sep 17 00:00:00 2001 From: zrphercule Date: Mon, 17 Sep 2018 17:26:32 -0700 Subject: [PATCH 205/237] Migrate test in cpp/api/ to use gtest (#11556) Summary: The second part of T32009899 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11556 Differential Revision: D9888224 Pulled By: zrphercule fbshipit-source-id: cb0d0ba5d9c7ad601ee3bce0d932ce9cbbc40908 --- 
test/cpp/api/static.cpp | 71 +++++++++++++++++++---------------------- torch/CMakeLists.txt | 24 ++++++++++++-- 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/test/cpp/api/static.cpp b/test/cpp/api/static.cpp index 121478c928ac1b..827ff251a7c483 100644 --- a/test/cpp/api/static.cpp +++ b/test/cpp/api/static.cpp @@ -1,4 +1,5 @@ -#include + +#include "gtest/gtest.h" #include #include @@ -22,43 +23,35 @@ torch::detail::enable_if_module_t f(T&& m) { return true; } -TEST_CASE("static") { - SECTION("all_of") { - REQUIRE(torch::all_of<>::value == true); - REQUIRE(torch::all_of::value == true); - REQUIRE(torch::all_of::value == true); - REQUIRE(torch::all_of::value == false); - REQUIRE(torch::all_of::value == false); - REQUIRE(torch::all_of::value == false); - } - SECTION("any_of") { - REQUIRE(torch::any_of<>::value == false); - REQUIRE(torch::any_of::value == true); - REQUIRE(torch::any_of::value == true); - REQUIRE(torch::any_of::value == false); - REQUIRE(torch::any_of::value == true); - } - SECTION("enable_if_module_t") { - REQUIRE(f(torch::nn::LinearImpl(1, 2)) == true); - REQUIRE(f(5) == false); - } - SECTION("check_not_lvalue_references") { - REQUIRE(torch::detail::check_not_lvalue_references() == true); - REQUIRE( - torch::detail::check_not_lvalue_references() == true); - REQUIRE( - torch::detail::check_not_lvalue_references() == - false); - REQUIRE(torch::detail::check_not_lvalue_references() == true); - REQUIRE( - torch::detail::check_not_lvalue_references() == false); - } - SECTION("apply") { - std::vector v; - torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5); - REQUIRE(v.size() == 5); - for (size_t i = 0; i < v.size(); ++i) { - REQUIRE(v.at(i) == 1 + i); - } +TEST(TestStatic, All_Of){ + EXPECT_TRUE(torch::all_of<>::value); + EXPECT_TRUE(torch::all_of::value); + EXPECT_TRUE((torch::all_of::value)); + EXPECT_FALSE(torch::all_of::value); + EXPECT_FALSE((torch::all_of::value)); + EXPECT_FALSE((torch::all_of::value)); +} +TEST(TestStatic, Any_Of){ + EXPECT_FALSE(torch::any_of<>::value); + EXPECT_TRUE(bool((torch::any_of::value))); + EXPECT_TRUE(bool((torch::any_of::value))); + EXPECT_FALSE(bool((torch::any_of::value))); +} +TEST(TestStatic, Enable_If_Module){ + EXPECT_TRUE(f(torch::nn::LinearImpl(1, 2))); + EXPECT_FALSE(f(5)); + EXPECT_TRUE(torch::detail::check_not_lvalue_references()); + EXPECT_TRUE((torch::detail::check_not_lvalue_references())); + EXPECT_FALSE( + (torch::detail::check_not_lvalue_references())); + EXPECT_TRUE(torch::detail::check_not_lvalue_references()); + EXPECT_FALSE(torch::detail::check_not_lvalue_references()); +} +TEST(TestStatic, Apply){ + std::vector v; + torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5); + EXPECT_EQ(v.size(), 5); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v.at(i), i + 1); } } diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 68eee299e65334..69b4963277a69c 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -436,8 +436,10 @@ if (BUILD_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) - set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") + #Catch test of api. + #TODO: Change all these tests to Google test. 
+ set(TORCH_API_TEST_DIR "${TORCH_ROOT}/test/cpp/api") set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/any.cpp ${TORCH_API_TEST_DIR}/cursor.cpp @@ -450,7 +452,6 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) ${TORCH_API_TEST_DIR}/parallel.cpp ${TORCH_API_TEST_DIR}/rnn.cpp ${TORCH_API_TEST_DIR}/sequential.cpp - ${TORCH_API_TEST_DIR}/static.cpp ${TORCH_API_TEST_DIR}/tensor_cuda.cpp ${TORCH_API_TEST_DIR}/tensor.cpp ${TORCH_API_TEST_DIR}/jit.cpp @@ -471,19 +472,36 @@ if (BUILD_TEST AND NOT NO_API AND NOT USE_ROCM) target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB}) + #Google test of api. + set(TORCH_API_GTEST_DIR "${TORCH_ROOT}/test/cpp/api/") + add_executable(gtest_api + ${TORCH_API_GTEST_DIR}/static.cpp + ) + target_include_directories(gtest_api PRIVATE ${ATen_CPU_INCLUDE}) + target_link_libraries(gtest_api torch gtest_main) + if (USE_CUDA) + target_link_libraries(gtest_api ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) + endif() + + #Adding compile options for both tests. if (NOT MSVC) if (APPLE) target_compile_options(test_api PRIVATE # Clang has an unfixed bug leading to spurious missing braces # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 -Wno-missing-braces) - else() + target_compile_options(gtest_api PRIVATE + -Wno-missing-braces) + else() target_compile_options(test_api PRIVATE # Considered to be flaky. See the discussion at # https://github.com/pytorch/pytorch/pull/9608 -Wno-maybe-uninitialized # gcc gives nonsensical warnings about variadic.h -Wno-unused-but-set-parameter) + target_compile_options(gtest_api PRIVATE + -Wno-maybe-uninitialized + -Wno-unused-but-set-parameter) endif() endif() endif() From 24a8c13f3620b6f2cdef16e8749b7e49b50de3ed Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Mon, 17 Sep 2018 17:29:34 -0700 Subject: [PATCH 206/237] Add barrier to fix distributed test flakiness (#11775) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11775 This should fix #11582. 
Reviewed By: ezyang Differential Revision: D9885546 fbshipit-source-id: 3544f42ebe8b595cdf6941859c67484d3ea9b3f8 --- test/test_distributed.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index f3250644bd621a..d311f74e3fe0f1 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -428,7 +428,6 @@ def test_broadcast(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -663,7 +662,6 @@ def test_all_reduce_sum(self): ) @skip_if_no_cuda_distributed @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_all_reduce_sum_cuda(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -952,7 +950,6 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1213,7 +1210,6 @@ def test_DistributedDataParallelCPU(self): "Only Nccl & Gloo backend support DistributedDataParallel") @skip_if_no_cuda_distributed @skip_if_no_gpu - @unittest.skip("Flaky test, see pytorch#11582") def test_DistributedDataParallel(self): group, group_id, rank = self._init_global_test() rank_to_GPU = self._init_multigpu_helper() @@ -1296,6 +1292,12 @@ def _run(self, rank): sys.exit(SKIP_IF_BACKEND_UNAVAILABLE) # sys.exit(0) raise + + # Execute barrier prior to running test to ensure that every process + # has finished initialization and that the following test + # immediately exiting due to a skip doesn't cause flakiness. + self._barrier() + # self.id() == e.g. '__main__.TestDistributed.test_get_rank' # We're retreiving a corresponding test and executing it. getattr(self, self.id().split(".")[2])() From d4dde0bcaf044802e881d73e5ed03cdb357177b1 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Mon, 17 Sep 2018 18:05:58 -0700 Subject: [PATCH 207/237] Detect number of amd gpus in ROCM CI (#11771) Summary: We now have CI machines with different number of amd gpus. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11771 Differential Revision: D9889837 Pulled By: bddppq fbshipit-source-id: dacf728a282f209e3f2419da186e59528a08ca6a --- .jenkins/caffe2/test.sh | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index ac29672c8e2ced..585f994367f339 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -49,6 +49,20 @@ fi mkdir -p $TEST_DIR/{cpp,python} +if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + + # Pin individual runs to specific gpu so that we can schedule + # multiple jobs on machines that have multi-gpu. + NUM_AMD_GPUS=$(/opt/rocm/bin/rocminfo | grep 'Device Type.*GPU' | wc -l) + if (( $NUM_AMD_GPUS == 0 )); then + echo >&2 "No AMD GPU detected!" 
+ exit 1 + fi + export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % $NUM_AMD_GPUS)) +fi + cd "${WORKSPACE}" # C++ tests @@ -106,9 +120,6 @@ fi rocm_ignore_test=() if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then - export LANG=C.UTF-8 - export LC_ALL=C.UTF-8 - # Currently these tests are failing on ROCM platform: # Unknown reasons, need to debug @@ -123,10 +134,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # Our AMD CI boxes have 4 gpus on each - # Remove this once we have added multi-gpu support - export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests From e8ecbcdf010d1e65384ba2d1f8760cc557c02883 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Mon, 17 Sep 2018 18:11:11 -0700 Subject: [PATCH 208/237] Move IValue to ATen/core (#11610) Summary: unblocks D9202320 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11610 Differential Revision: D9774853 Pulled By: bwasti fbshipit-source-id: 4798223f6de680a7152283e8cad8814da7f90209 --- aten/src/ATen/Formatting.h | 25 +- aten/src/ATen/{ => core}/Formatting.cpp | 4 +- aten/src/ATen/core/Formatting.h | 26 ++ aten/src/ATen/{ => core}/Tensor.cpp | 6 +- .../jit => aten/src/ATen/core}/ivalue.cpp | 9 +- aten/src/ATen/core/ivalue.h | 425 +++++++++++++++++ setup.py | 1 - torch/CMakeLists.txt | 1 - torch/csrc/jit/ivalue.h | 426 +----------------- torch/csrc/jit/source_range.h | 1 + 10 files changed, 464 insertions(+), 460 deletions(-) rename aten/src/ATen/{ => core}/Formatting.cpp (99%) create mode 100644 aten/src/ATen/core/Formatting.h rename aten/src/ATen/{ => core}/Tensor.cpp (77%) rename {torch/csrc/jit => aten/src/ATen/core}/ivalue.cpp (86%) create mode 100644 aten/src/ATen/core/ivalue.h diff --git a/aten/src/ATen/Formatting.h b/aten/src/ATen/Formatting.h index 6a8b502cc978a8..392e2a27b0130c 100644 --- a/aten/src/ATen/Formatting.h +++ b/aten/src/ATen/Formatting.h @@ -1,24 +1 @@ -#pragma once - -#include -#include "ATen/Type.h" -#include "ATen/core/Scalar.h" - -namespace at { - -AT_API std::ostream& operator<<(std::ostream & out, IntList list); -AT_API std::ostream& operator<<(std::ostream & out, Backend b); -AT_API std::ostream& operator<<(std::ostream & out, const Type & t); -AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); -static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { - return print(out,t,80); -} -static inline void print(const Tensor & t, int64_t linesize=80) { - print(std::cout,t,linesize); -} - -static inline std::ostream& operator<<(std::ostream & out, Scalar s) { - return out << (s.isFloatingPoint() ? 
s.toDouble() : s.toLong()); -} - -} +#include diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp similarity index 99% rename from aten/src/ATen/Formatting.cpp rename to aten/src/ATen/core/Formatting.cpp index 390230316bd0dc..f13b0082d90d10 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,6 +1,4 @@ -#include "ATen/Formatting.h" - -#include +#include "ATen/core/Formatting.h" #include #include diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h new file mode 100644 index 00000000000000..c6ac26b8a9e0e3 --- /dev/null +++ b/aten/src/ATen/core/Formatting.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +AT_API std::ostream& operator<<(std::ostream & out, IntList list); +AT_API std::ostream& operator<<(std::ostream & out, Backend b); +AT_API std::ostream& operator<<(std::ostream & out, const Type & t); +AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { + return print(out,t,80); +} +static inline void print(const Tensor & t, int64_t linesize=80) { + print(std::cout,t,linesize); +} + +static inline std::ostream& operator<<(std::ostream & out, Scalar s) { + return out << (s.isFloatingPoint() ? s.toDouble() : s.toLong()); +} + +} diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp similarity index 77% rename from aten/src/ATen/Tensor.cpp rename to aten/src/ATen/core/Tensor.cpp index 860a5d2ab0afe1..924688d40b9551 100644 --- a/aten/src/ATen/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include diff --git a/torch/csrc/jit/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp similarity index 86% rename from torch/csrc/jit/ivalue.cpp rename to aten/src/ATen/core/ivalue.cpp index 315da36deb196f..3d2b56893e7188 100644 --- a/torch/csrc/jit/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,12 +1,15 @@ -#include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/ivalue.h" -#include +#include +#include #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) namespace torch { namespace jit { +AT_API c10::intrusive_ptr ConstantString::create(std::string str_) { + return c10::make_intrusive(std::move(str_)); +} + namespace { template diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h new file mode 100644 index 00000000000000..914598f6ceb426 --- /dev/null +++ b/aten/src/ATen/core/ivalue.h @@ -0,0 +1,425 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace torch { namespace jit { + +template +using Shared = c10::intrusive_ptr; + +// string +struct AT_API ConstantString final : c10::intrusive_ptr_target { + private: + const std::string str_; + public: + ConstantString(std::string str) + : str_(std::move(str)) {} + static c10::intrusive_ptr create(std::string str_); + const std::string & string() const { + return str_; + } + operator const std::string & () const { + return string(); + } + AT_API friend std::ostream& operator<<( + std::ostream& out, + const ConstantString& v); +}; + +// non-mutable list +template +struct AT_CORE_EXPORT ConstantList final : c10::intrusive_ptr_target { + private: + const std::vector elements_; + public: + ConstantList(std::vector elements_) + : elements_(std::move(elements_)) {} + static 
c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); + } + const std::vector& elements() const { + return elements_; + } + operator const std::vector&() const { + return elements(); + } +}; + +struct IValue; +using Tuple = ConstantList; +using IntList = ConstantList; +using TensorList = ConstantList; +using DoubleList = ConstantList; + +// IValue is the generic tagged union used by the interpreter to hold +// all value types. +// It is a 16-byte object with an 8-byte payload and an 8-byte tag. +// The tag is currently 4 bytes to determine the type, and 1 byte +// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs +// retain/release calls. + +#define TORCH_FORALL_TAGS(_) \ + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) + +struct AT_API IValue final { + IValue() + : payload{0} + , tag(Tag::None) + , is_intrusive_ptr(false) {} + IValue(const IValue& rhs) + : payload(rhs.payload), + tag(rhs.tag), + is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + } + } + IValue(IValue&& rhs) noexcept : IValue() { + swap(rhs); + } + ~IValue() { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); + } + } + IValue & operator=(IValue && rhs) & noexcept { + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + return *this; + } + IValue & operator=(IValue const & rhs) & { + IValue(rhs).swap(*this); + return *this; + } + void swap(IValue & rhs) noexcept { + std::swap(payload, rhs.payload); + std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); + std::swap(tag, rhs.tag); + } + // Accessors for subtypes are arranged together below + // While some of these accessors could be generated through templates, + // we prefer to write them manually for clarity + + // Tensor + IValue(at::Tensor t) + : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { + // Note: the undefined tensor is not refcounted, so while it + // is tagged as a tensor, is_intrusive_ptr is set to false. + // This is not an optional optimization: our incref call + // *will not* do the right thing when called on an + // undefined tensor. 
+ payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + } + bool isTensor() const { return Tag::Tensor == tag; } + at::Tensor toTensor() && { + AT_ASSERT(isTensor()); + return at::Tensor(moveToIntrusivePtr()); + } + at::Tensor toTensor() const & { + AT_ASSERT(isTensor()); + return at::Tensor(toIntrusivePtr()); + } + + // Tuple + IValue(c10::intrusive_ptr v); + bool isTuple() const { return Tag::Tuple == tag; } + c10::intrusive_ptr toTuple() && { + AT_ASSERT(isTuple()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTuple() const & { + AT_ASSERT(isTuple()); + return toIntrusivePtr(); + } + + // Double + IValue(double d) + : tag(Tag::Double), is_intrusive_ptr(false) { + payload.as_double = d; + } + bool isDouble() const { return Tag::Double == tag; } + double toDouble() const { + AT_ASSERT(isDouble()); + return payload.as_double; + } + + // Int + IValue(int64_t i) + : tag(Tag::Int), is_intrusive_ptr(false) { + payload.as_int = i; + } + + // allow you to pass literals (3, 4) without ambiguity + IValue(int32_t i) + : IValue(static_cast(i)) {} + IValue(bool b) + : IValue(static_cast(b)) {} + + bool isInt() const { return Tag::Int == tag; } + + int64_t toInt() const { + AT_ASSERT(isInt()); + return payload.as_int; + } + + // IntList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + IValue(at::ArrayRef v) + : IValue(v.vec()) {} + bool isIntList() const { return Tag::IntList == tag; } + c10::intrusive_ptr toIntList() && { + AT_ASSERT(isIntList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toIntList() const & { + AT_ASSERT(isIntList()); + return toIntrusivePtr(); + } + + const std::vector& toIntListRef() const; + const std::vector& toDoubleListRef() const; + const std::vector& toTensorListRef() const; + + // ConstantString + IValue(c10::intrusive_ptr v); + IValue(std::string v); + bool isString() const { return Tag::String == tag; } + c10::intrusive_ptr toString() && { + AT_ASSERT(isString()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toString() const & { + AT_ASSERT(isString()); + return toIntrusivePtr(); + } + + // DoubleList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isDoubleList() const { return Tag::DoubleList == tag; } + c10::intrusive_ptr toDoubleList() && { + AT_ASSERT(isDoubleList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toDoubleList() const & { + AT_ASSERT(isDoubleList()); + return toIntrusivePtr(); + } + + //TensorList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + c10::intrusive_ptr toTensorList() && { + AT_ASSERT(isTensorList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTensorList() const & { + AT_ASSERT(isTensorList()); + return toIntrusivePtr(); + } + + // None + bool isNone() { + return Tag::None == tag; + } + std::string toNone() const { + return "None"; + } + // Scalar, which gets encoded as either an Int or a Double + IValue(at::Scalar s) + : IValue() { + if(s.isFloatingPoint()) { + *this = s.toDouble(); + } else { + *this = s.toLong(); + } + } + bool isScalar() { + return isDouble() || isInt(); + } + at::Scalar toScalar() const { + if(isDouble()) + return toDouble(); + else if(isInt()) + return toInt(); + else + throw std::runtime_error("IValue is not a Scalar"); + } + + // for debugging + std::string tagKind() const { + switch(tag) { + #define DEFINE_CASE(x) case Tag::x: return #x; + TORCH_FORALL_TAGS(DEFINE_CASE) + #undef DEFINE_CASE + } + return "Invalid Tag"; + } + + // generic v.to() 
implementations + // that can be used in special functions like pop/push + // that use template meta-programming. + // prefer the directly named methods when you can, + // since they are simpler to understand + + // Note: if you get linker errors saying one of these is missing, + // change it to ... && = delete; and you will see better error messages for why + // However, we cannot commit this because some compiler versions barf on it. + template + T to() &&; + template + T to() const &; + + AT_API friend std::ostream& operator<<(std::ostream& out, const IValue& v); + + private: + // NOTE: IValue tags are intentionally private. In the future we may encode + // this value different (e.g. using NaN boxing), and this would make it more + // costly to determine the tag for all types vs just determining if something + // is a particular type. Instead we want clients to use the `isX` methods when + // possible. If for perf. reasons you really, absolutely, must have a jump + // table, then we can revisit this. + enum class Tag : uint32_t { +#define DEFINE_TAG(x) x, + TORCH_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + }; + + template> + c10::intrusive_ptr moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + clearToNone(); + return t; + } + template> + c10::intrusive_ptr toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + auto p = r; + r.release(); + return p; + } + void clearToNone() { + payload.as_int = 0; + tag = Tag::None; + is_intrusive_ptr = false; + } + union { + int64_t as_int; + double as_double; + c10::intrusive_ptr_target* as_intrusive_ptr; + } payload; + Tag tag; + bool is_intrusive_ptr; +}; + +#undef TORCH_FORALL_TAGS + + +#define DEFINE_TO(type, method_name) \ +template<> \ +inline type IValue::to() && { \ + return std::move(*this).method_name(); \ +} \ +template<> \ +inline type IValue::to() const & { \ + return this->method_name(); \ +} +DEFINE_TO(at::Tensor, toTensor) +DEFINE_TO(c10::intrusive_ptr, toTuple) +DEFINE_TO(double, toDouble) +DEFINE_TO(int64_t, toInt) +DEFINE_TO(c10::intrusive_ptr, toDoubleList) +DEFINE_TO(c10::intrusive_ptr, toIntList) +DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toString) +DEFINE_TO(at::Scalar, toScalar) +DEFINE_TO(bool, toInt) +DEFINE_TO(std::vector, toIntListRef) +DEFINE_TO(std::vector, toDoubleListRef) +DEFINE_TO(std::vector, toTensorListRef) + +#undef DEFINE_TO + +#define DEFINE_TO_WITH_BODY(type, body) \ +template<> \ +inline type IValue::to() && { \ + body(std::move(*this)); \ +} \ +template<> \ +inline type IValue::to() const & { \ + body((*this)); \ +} + +#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); +#define LAYOUT_BODY(this) return static_cast(this.toInt()); +#define DEVICE_BODY(this) \ + /* NB: const_list might be a move of the vector, so we need to */ \ + /* assign it to prevent its deallocation. 
*/ \ + auto&& const_list = this.toIntList(); \ + const auto& elems = const_list->elements(); \ + AT_ASSERT(elems.size() == 2); \ + return at::Device(static_cast(elems[0]), elems[1]); + +DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) +DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) +DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) + +#undef DEFINE_TO_WITH_BODY +#undef SCALAR_TYPE_BODY +#undef LAYOUT_BODY +#undef DEVICE_BODY + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Tuple), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::IntList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(IntList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::String), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::string v) +: IValue(ConstantString::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::DoubleList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(DoubleList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::TensorList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + +inline const std::vector& IValue::toIntListRef() const { + return toIntList()->elements(); +} + +inline const std::vector& IValue::toDoubleListRef() const { + return toDoubleList()->elements(); +} + +inline const std::vector& IValue::toTensorListRef() const { + return toTensorList()->elements(); +} + + +}} diff --git a/setup.py b/setup.py index b61251c46c34fa..37eaf51f0d3908 100644 --- a/setup.py +++ b/setup.py @@ -924,7 +924,6 @@ def run(self): "torch/csrc/byte_order.cpp", "torch/csrc/jit/batched/BatchTensor.cpp", "torch/csrc/jit/init.cpp", - "torch/csrc/jit/ivalue.cpp", "torch/csrc/jit/passes/onnx.cpp", "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", "torch/csrc/jit/passes/onnx/prepare_division_for_onnx.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 69b4963277a69c..2c1ff700dd2858 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -153,7 +153,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp ${TORCH_SRC_DIR}/csrc/jit/constants.cpp ${TORCH_SRC_DIR}/csrc/jit/ir.cpp - ${TORCH_SRC_DIR}/csrc/jit/ivalue.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 16af0c9b292d5f..fb248e509853c8 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -1,425 +1 @@ -#pragma once - -#include "torch/csrc/jit/assertions.h" -#include "torch/csrc/WindowsTorchApiMacro.h" - -#include - -#include - -namespace torch { namespace jit { - -template -using Shared = c10::intrusive_ptr; - -// string -struct TORCH_API ConstantString final : c10::intrusive_ptr_target { - private: - const std::string str_; - public: - ConstantString(std::string str) - : str_(std::move(str)) {} - static c10::intrusive_ptr create(std::string str_) { - return c10::make_intrusive(std::move(str_)); - } - const std::string & string() const { - return str_; - } - operator const std::string & () const { - return string(); - } - TORCH_API friend std::ostream& operator<<(std::ostream& out, const 
ConstantString & v); -}; - - -// non-mutable list -template -struct TORCH_API ConstantList final : c10::intrusive_ptr_target { - private: - const std::vector elements_; - public: - ConstantList(std::vector elements_) - : elements_(std::move(elements_)) {} - static c10::intrusive_ptr> create(std::vector elements_) { - return c10::make_intrusive>(std::move(elements_)); - } - const std::vector& elements() const { - return elements_; - } - operator const std::vector&() const { - return elements(); - } -}; - -struct IValue; -using Tuple = ConstantList; -using IntList = ConstantList; -using TensorList = ConstantList; -using DoubleList = ConstantList; - -// IValue is the generic tagged union used by the interpreter to hold -// all value types. -// It is a 16-byte object with an 8-byte payload and an 8-byte tag. -// The tag is currently 4 bytes to determine the type, and 1 byte -// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs -// retain/release calls. - -#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) - -struct TORCH_API IValue final { - IValue() - : payload{0} - , tag(Tag::None) - , is_intrusive_ptr(false) {} - IValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), - is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); - } - } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); - } - ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } - } - IValue & operator=(IValue && rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None - return *this; - } - IValue & operator=(IValue const & rhs) & { - IValue(rhs).swap(*this); - return *this; - } - void swap(IValue & rhs) noexcept { - std::swap(payload, rhs.payload); - std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); - std::swap(tag, rhs.tag); - } - // Accessors for subtypes are arranged together below - // While some of these accessors could be generated through templates, - // we prefer to write them manually for clarity - - // Tensor - IValue(at::Tensor t) - : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. 
- payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); - } - bool isTensor() const { return Tag::Tensor == tag; } - at::Tensor toTensor() && { - JIT_ASSERT(isTensor()); - return at::Tensor(moveToIntrusivePtr()); - } - at::Tensor toTensor() const & { - JIT_ASSERT(isTensor()); - return at::Tensor(toIntrusivePtr()); - } - - // Tuple - IValue(c10::intrusive_ptr v); - bool isTuple() const { return Tag::Tuple == tag; } - c10::intrusive_ptr toTuple() && { - JIT_ASSERT(isTuple()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toTuple() const & { - JIT_ASSERT(isTuple()); - return toIntrusivePtr(); - } - - // Double - IValue(double d) - : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; - } - bool isDouble() const { return Tag::Double == tag; } - double toDouble() const { - JIT_ASSERT(isDouble()); - return payload.as_double; - } - - // Int - IValue(int64_t i) - : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; - } - - // allow you to pass literals (3, 4) without ambiguity - IValue(int32_t i) - : IValue(static_cast(i)) {} - IValue(bool b) - : IValue(static_cast(b)) {} - - bool isInt() const { return Tag::Int == tag; } - - int64_t toInt() const { - JIT_ASSERT(isInt()); - return payload.as_int; - } - - // IntList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - IValue(at::ArrayRef v) - : IValue(v.vec()) {} - bool isIntList() const { return Tag::IntList == tag; } - c10::intrusive_ptr toIntList() && { - JIT_ASSERT(isIntList()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toIntList() const & { - JIT_ASSERT(isIntList()); - return toIntrusivePtr(); - } - - const std::vector& toIntListRef() const; - const std::vector& toDoubleListRef() const; - const std::vector& toTensorListRef() const; - - // ConstantString - IValue(c10::intrusive_ptr v); - IValue(std::string v); - bool isString() const { return Tag::String == tag; } - c10::intrusive_ptr toString() && { - JIT_ASSERT(isString()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toString() const & { - JIT_ASSERT(isString()); - return toIntrusivePtr(); - } - - // DoubleList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isDoubleList() const { return Tag::DoubleList == tag; } - c10::intrusive_ptr toDoubleList() && { - JIT_ASSERT(isDoubleList()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toDoubleList() const & { - JIT_ASSERT(isDoubleList()); - return toIntrusivePtr(); - } - - //TensorList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isTensorList() const { return Tag::TensorList == tag; } - c10::intrusive_ptr toTensorList() && { - JIT_ASSERT(isTensorList()); - return moveToIntrusivePtr(); - } - c10::intrusive_ptr toTensorList() const & { - JIT_ASSERT(isTensorList()); - return toIntrusivePtr(); - } - - // None - bool isNone() { - return Tag::None == tag; - } - std::string toNone() const { - return "None"; - } - // Scalar, which gets encoded as either an Int or a Double - IValue(at::Scalar s) - : IValue() { - if(s.isFloatingPoint()) { - *this = s.toDouble(); - } else { - *this = s.toLong(); - } - } - bool isScalar() { - return isDouble() || isInt(); - } - at::Scalar toScalar() const { - if(isDouble()) - return toDouble(); - else if(isInt()) - return toInt(); - else - throw std::runtime_error("IValue is not a Scalar"); - } - - // for debugging - std::string tagKind() const { - switch(tag) { - #define DEFINE_CASE(x) case Tag::x: return #x; - TORCH_FORALL_TAGS(DEFINE_CASE) - #undef DEFINE_CASE - } - return "Invalid Tag"; - } - - // generic 
v.to() implementations - // that can be used in special functions like pop/push - // that use template meta-programming. - // prefer the directly named methods when you can, - // since they are simpler to understand - - // Note: if you get linker errors saying one of these is missing, - // change it to ... && = delete; and you will see better error messages for why - // However, we cannot commit this because some compiler versions barf on it. - template - T to() &&; - template - T to() const &; - - TORCH_API friend std::ostream& operator<<(std::ostream & out, const IValue & v); - -private: - // NOTE: IValue tags are intentionally private. In the future we may encode - // this value different (e.g. using NaN boxing), and this would make it more - // costly to determine the tag for all types vs just determining if something - // is a particular type. Instead we want clients to use the `isX` methods when - // possible. If for perf. reasons you really, absolutely, must have a jump - // table, then we can revisit this. - enum class Tag : uint32_t { -#define DEFINE_TAG(x) x, - TORCH_FORALL_TAGS(DEFINE_TAG) -#undef DEFINE_TAG - }; - - template> - c10::intrusive_ptr moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - clearToNone(); - return t; - } - template> - c10::intrusive_ptr toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - auto p = r; - r.release(); - return p; - } - void clearToNone() { - payload.as_int = 0; - tag = Tag::None; - is_intrusive_ptr = false; - } - union { - int64_t as_int; - double as_double; - c10::intrusive_ptr_target* as_intrusive_ptr; - } payload; - Tag tag; - bool is_intrusive_ptr; -}; - -#undef TORCH_FORALL_TAGS - - -#define DEFINE_TO(type, method_name) \ -template<> \ -inline type IValue::to() && { \ - return std::move(*this).method_name(); \ -} \ -template<> \ -inline type IValue::to() const & { \ - return this->method_name(); \ -} -DEFINE_TO(at::Tensor, toTensor) -DEFINE_TO(c10::intrusive_ptr, toTuple) -DEFINE_TO(double, toDouble) -DEFINE_TO(int64_t, toInt) -DEFINE_TO(c10::intrusive_ptr, toDoubleList) -DEFINE_TO(c10::intrusive_ptr, toIntList) -DEFINE_TO(c10::intrusive_ptr, toTensorList) -DEFINE_TO(c10::intrusive_ptr, toString) -DEFINE_TO(at::Scalar, toScalar) -DEFINE_TO(bool, toInt) -DEFINE_TO(std::vector, toIntListRef) -DEFINE_TO(std::vector, toDoubleListRef) -DEFINE_TO(std::vector, toTensorListRef) - -#undef DEFINE_TO - -#define DEFINE_TO_WITH_BODY(type, body) \ -template<> \ -inline type IValue::to() && { \ - body(std::move(*this)); \ -} \ -template<> \ -inline type IValue::to() const & { \ - body((*this)); \ -} - -#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); -#define LAYOUT_BODY(this) return static_cast(this.toInt()); -#define DEVICE_BODY(this) \ - /* NB: const_list might be a move of the vector, so we need to */ \ - /* assign it to prevent its deallocation. 
*/ \ - auto && const_list = this.toIntList(); \ - const auto & elems = const_list->elements(); \ - JIT_ASSERT(elems.size() == 2); \ - return at::Device(static_cast(elems[0]), elems[1]); - -DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) -DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) -DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) - -#undef DEFINE_TO_WITH_BODY -#undef SCALAR_TYPE_BODY -#undef LAYOUT_BODY -#undef DEVICE_BODY - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::IntList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(IntList::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::string v) -: IValue(ConstantString::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::DoubleList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(DoubleList::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::TensorList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(TensorList::create(std::move(v))) {} - -inline const std::vector& IValue::toIntListRef() const { - return toIntList()->elements(); -} - -inline const std::vector& IValue::toDoubleListRef() const { - return toDoubleList()->elements(); -} - -inline const std::vector& IValue::toTensorListRef() const { - return toTensorList()->elements(); -} - - -}} +#include diff --git a/torch/csrc/jit/source_range.h b/torch/csrc/jit/source_range.h index 0139c2527513a3..227315e643fe4b 100644 --- a/torch/csrc/jit/source_range.h +++ b/torch/csrc/jit/source_range.h @@ -1,5 +1,6 @@ #pragma once #include "torch/csrc/jit/source_location.h" +#include "torch/csrc/jit/assertions.h" namespace torch { From 7f0dd2487d70015218657e61c68750b8169bf9e8 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 17 Sep 2018 18:11:19 -0700 Subject: [PATCH 209/237] Move AT_HOST_DEVICE macro to Macros.h (#10945) Summary: ``` I'm using AT_HOST_DEVICE outside of Half.h in an upcoming PR. Since this changes code without making any semantic changes, I wanted to make this change in a separate PR. 
``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/10945 Differential Revision: D9539821 Pulled By: colesbury fbshipit-source-id: 0daae40ea78b077a543f7bfeec06b225634540de --- aten/src/ATen/core/Half-inl.h | 82 ++++++++++++++--------------- aten/src/ATen/core/Half.h | 22 +++----- aten/src/ATen/core/Macros.h | 11 ++++ aten/src/ATen/core/TensorAccessor.h | 41 +++++---------- 4 files changed, 71 insertions(+), 85 deletions(-) diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index 42633508e4b714..75ff2a2fe6937f 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -16,7 +16,7 @@ namespace at { /// Constructors -inline AT_HOSTDEVICE Half::Half(float value) { +inline AT_HOST_DEVICE Half::Half(float value) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) x = __half_as_short(__float2half(value)); #else @@ -26,7 +26,7 @@ inline AT_HOSTDEVICE Half::Half(float value) { /// Implicit conversions -inline AT_HOSTDEVICE Half::operator float() const { +inline AT_HOST_DEVICE Half::operator float() const { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(*reinterpret_cast(&x)); #else @@ -35,10 +35,10 @@ inline AT_HOSTDEVICE Half::operator float() const { } #ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { +inline AT_HOST_DEVICE Half::Half(const __half& value) { x = *reinterpret_cast(&value); } -inline AT_HOSTDEVICE Half::operator __half() const { +inline AT_HOST_DEVICE Half::operator __half() const { return *reinterpret_cast(&x); } #endif @@ -53,140 +53,140 @@ inline __device__ Half __ldg(const Half* ptr) { /// Arithmetic -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator+(const Half& a, const Half& b) { return static_cast(a) + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator-(const Half& a, const Half& b) { return static_cast(a) - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator*(const Half& a, const Half& b) { return static_cast(a) * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator/(const Half& a, const Half& b) { return static_cast(a) / static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a) { +inline AT_HOST_DEVICE Half operator-(const Half& a) { return -static_cast(a); } -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { a = a + b; return a; } -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { a = a - b; return a; } -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { a = a * b; return a; } -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { a = a / b; return a; } /// Arithmetic with floats -inline AT_HOSTDEVICE float operator+(Half a, float b) { +inline AT_HOST_DEVICE float operator+(Half a, float b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE float operator-(Half a, float b) { +inline AT_HOST_DEVICE float operator-(Half a, float b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE float operator*(Half a, float b) { +inline AT_HOST_DEVICE float 
operator*(Half a, float b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE float operator/(Half a, float b) { +inline AT_HOST_DEVICE float operator/(Half a, float b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE float operator+(float a, Half b) { +inline AT_HOST_DEVICE float operator+(float a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE float operator-(float a, Half b) { +inline AT_HOST_DEVICE float operator-(float a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE float operator*(float a, Half b) { +inline AT_HOST_DEVICE float operator*(float a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE float operator/(float a, Half b) { +inline AT_HOST_DEVICE float operator/(float a, Half b) { return a / static_cast(b); } -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator+=(float& a, const Half& b) { return a += static_cast(b); } -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator-=(float& a, const Half& b) { return a -= static_cast(b); } -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator*=(float& a, const Half& b) { return a *= static_cast(b); } -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator/=(float& a, const Half& b) { return a /= static_cast(b); } /// Arithmetic with doubles -inline AT_HOSTDEVICE double operator+(Half a, double b) { +inline AT_HOST_DEVICE double operator+(Half a, double b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE double operator-(Half a, double b) { +inline AT_HOST_DEVICE double operator-(Half a, double b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE double operator*(Half a, double b) { +inline AT_HOST_DEVICE double operator*(Half a, double b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE double operator/(Half a, double b) { +inline AT_HOST_DEVICE double operator/(Half a, double b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE double operator+(double a, Half b) { +inline AT_HOST_DEVICE double operator+(double a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE double operator-(double a, Half b) { +inline AT_HOST_DEVICE double operator-(double a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE double operator*(double a, Half b) { +inline AT_HOST_DEVICE double operator*(double a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE double operator/(double a, Half b) { +inline AT_HOST_DEVICE double operator/(double a, Half b) { return a / static_cast(b); } /// Arithmetic with ints -inline AT_HOSTDEVICE Half operator+(Half a, int b) { +inline AT_HOST_DEVICE Half operator+(Half a, int b) { return a + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(Half a, int b) { +inline AT_HOST_DEVICE Half operator-(Half a, int b) { return a - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(Half a, int b) { +inline AT_HOST_DEVICE Half operator*(Half a, int b) { return a * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(Half a, int b) { +inline AT_HOST_DEVICE Half operator/(Half a, int b) { return a / static_cast(b); } -inline AT_HOSTDEVICE Half operator+(int a, Half b) { +inline AT_HOST_DEVICE Half operator+(int a, Half b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE Half operator-(int a, Half b) { +inline AT_HOST_DEVICE Half operator-(int a, Half b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE Half operator*(int a, Half b) 
{ +inline AT_HOST_DEVICE Half operator*(int a, Half b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE Half operator/(int a, Half b) { +inline AT_HOST_DEVICE Half operator/(int a, Half b) { return static_cast(a) / b; } diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index e2cbd1174fc91c..47a8e8e52d2adb 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -30,14 +30,6 @@ #include #endif -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#else -#define AT_HOSTDEVICE -#endif -#endif - namespace at { namespace detail { @@ -55,18 +47,18 @@ struct alignas(2) Half { // HIP wants __host__ __device__ tag, CUDA does not #ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; + AT_HOST_DEVICE Half() = default; #else Half() = default; #endif - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; + constexpr AT_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; + inline AT_HOST_DEVICE Half(float value); + inline AT_HOST_DEVICE operator float() const; #ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; + inline AT_HOST_DEVICE Half(const __half& value); + inline AT_HOST_DEVICE operator __half() const; #endif }; @@ -191,5 +183,3 @@ AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at #include "ATen/core/Half-inl.h" - -#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 67efa523ac2bba..244124475bc08f 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -39,6 +39,17 @@ #define AT_CORE_API AT_CORE_IMPORT #endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define AT_HOST_DEVICE __host__ __device__ +#define AT_DEVICE __device__ +#define AT_HOST __host__ +#else +#define AT_HOST_DEVICE +#define AT_HOST +#define AT_DEVICE +#endif + // Disable the copy and assignment operator for a class. Note that this will // disable the usage of the class in std containers. #define AT_DISABLE_COPY_AND_ASSIGN(classname) \ diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index f686f0797b079d..d8a851d998332a 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -2,6 +2,7 @@ #include #include +#include namespace at { @@ -20,18 +21,6 @@ struct RestrictPtrTraits { }; #endif -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#define AT_HOST __host__ -#define AT_DEVICE __device__ -#else -#define AT_HOSTDEVICE -#define AT_HOST -#define AT_DEVICE -#endif -#endif - // TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. // For CUDA tensors it is used in device code (only). This means that we restrict ourselves // to functions and types available there (e.g. IntList isn't). 
@@ -42,7 +31,7 @@ class TensorAccessorBase { public: typedef typename PtrTraits::PtrType PtrType; - AT_HOSTDEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + AT_HOST_DEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} AT_HOST IntList sizes() const { return IntList(sizes_,N); @@ -50,10 +39,10 @@ class TensorAccessorBase { AT_HOST IntList strides() const { return IntList(strides_,N); } - AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } - AT_HOSTDEVICE T *data() { return data_; } - AT_HOSTDEVICE const T *data() const { return data_; } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOST_DEVICE T *data() { return data_; } + AT_HOST_DEVICE const T *data() const { return data_; } protected: PtrType data_; const int64_t* sizes_; @@ -69,14 +58,14 @@ class TensorAccessor : public TensorAccessorBase { public: typedef typename PtrTraits::PtrType PtrType; - AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - AT_HOSTDEVICE TensorAccessor operator[](int64_t i) { + AT_HOST_DEVICE TensorAccessor operator[](int64_t i) { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } - AT_HOSTDEVICE const TensorAccessor operator[](int64_t i) const { + AT_HOST_DEVICE const TensorAccessor operator[](int64_t i) const { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } }; @@ -86,9 +75,9 @@ class TensorAccessor : public TensorAccessorBase { public: typedef typename PtrTraits::PtrType PtrType; - AT_HOSTDEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - AT_HOSTDEVICE T & operator[](int64_t i) { + AT_HOST_DEVICE T & operator[](int64_t i) { return this->data_[this->strides_[0]*i]; } }; @@ -112,8 +101,8 @@ class PackedTensorAccessorBase { std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); std::copy(strides_, strides_ + N, std::begin(this->strides_)); } - AT_HOSTDEVICE int64_t stride(int64_t i) const { return strides_[i]; } - AT_HOSTDEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } protected: PtrType data_; int64_t sizes_[N]; @@ -157,7 +146,3 @@ class PackedTensorAccessor : public PackedTensorAccessorBase Date: Mon, 17 Sep 2018 21:21:16 -0700 Subject: [PATCH 210/237] SNNTest with Data Preproc Service (#11707) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11707 Trigger SNN offline training test with data preproc service. 
Reviewed By: xsh6528 Differential Revision: D9826978 fbshipit-source-id: f98405ca1e61a7662bf0d9313aaba42436025a83 --- caffe2/python/pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index ee38fe52df8c4e..ade4e6ac9248e8 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -324,7 +324,8 @@ def _pipe_step( elif hasattr(input, 'reader'): reader = input.reader() else: - raise ValueError('in must be a reader, queue or stream.') + raise ValueError( + 'Input must be a reader, queue or stream. Got {}'.format(type(input))) if processor is not None: reader = ProcessingReader(reader, processor) From a7cbcb1bb9f1b8244f2441a0fcc8d491677c988b Mon Sep 17 00:00:00 2001 From: Mingzhe Li Date: Mon, 17 Sep 2018 21:39:10 -0700 Subject: [PATCH 211/237] Enable build_python on windows (#11385) Summary: The PR aims to resolve issues related to BUILD_PYTHON and BUILD_TEST after FULL_CAFFE2 is removed on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11385 Reviewed By: orionr Differential Revision: D9884906 Pulled By: mingzhe09088 fbshipit-source-id: fc114c0cbff6223f1ec261161e4caecc1fef5dd6 --- caffe2/CMakeLists.txt | 14 ++ caffe2/core/common_gpu.h | 2 +- .../include/nomnigraph/Graph/Graph.h | 8 +- .../nomnigraph/Representations/NeuralNet.h | 2 +- .../Transformations/SubgraphMatcher.h | 2 +- caffe2/core/registry.h | 4 +- caffe2/utils/math_gpu.cu | 184 +++++++++--------- cmake/Dependencies.cmake | 15 +- setup.py | 6 - tools/build_pytorch_libs.bat | 2 +- torch/CMakeLists.txt | 1 + 11 files changed, 130 insertions(+), 110 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4e9b3eef8937f4..c90179cea1ddcd 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,6 +291,11 @@ if (MSVC AND NOT BUILD_SHARED_LIBS) # as the latter is not respected by nvcc target_compile_definitions(caffe2 PUBLIC "AT_CORE_STATIC_WINDOWS=1") endif() +if (MSVC AND BUILD_SHARED_LIBS) + # ONNX is linked statically and needs to be exported from this library + # to be used externally. Make sure that references match the export. 
+ target_compile_options(caffe2 PRIVATE "-DONNX_BUILD_MAIN_LIB") +endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) @@ -492,6 +497,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state PRIVATE $) target_include_directories(caffe2_pybind11_state PRIVATE ${Caffe2_CPU_INCLUDE}) @@ -517,6 +525,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_gpu PRIVATE $) target_include_directories(caffe2_pybind11_state_gpu PRIVATE ${Caffe2_CPU_INCLUDE}) target_link_libraries( @@ -542,6 +553,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_hip PRIVATE $) target_include_directories(caffe2_pybind11_state_hip PRIVATE ${Caffe2_CPU_INCLUDE}) target_link_libraries( diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 4658cd0d756099..b73a6aefa406a0 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -116,7 +116,7 @@ inline int CudaVersion() { return CUDA_VERSION; } /** * Returns the number of devices. */ -int NumCudaDevices(); +CAFFE2_CUDA_API int NumCudaDevices(); /** * Check if the current running session has a cuda gpu present. diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index c6b10f0912eca8..568d46a61ff561 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -39,7 +39,7 @@ class Node; // \brief Edge within a Graph. template -class CAFFE2_API Edge : public StorageType { +class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) @@ -73,7 +73,7 @@ class CAFFE2_API Edge : public StorageType { // \brief Node within a Graph. template -class CAFFE2_API Node : public StorageType, public Notifier> { +class Node : public StorageType, public Notifier> { public: using NodeRef = typename Graph::NodeRef; using EdgeRef = typename Graph::EdgeRef; @@ -152,7 +152,7 @@ class CAFFE2_API Node : public StorageType, public Notifier> { /// for example. /// template -class CAFFE2_API Subgraph { +class Subgraph { public: Subgraph() { DEBUG_PRINT("Creating instance of Subgraph: %p\n", this); @@ -219,7 +219,7 @@ class CAFFE2_API Subgraph { /// Everything is owned by the graph to simplify storage concerns. 
/// template -class CAFFE2_API Graph { +class Graph { public: using SubgraphType = Subgraph; using NodeRef = Node*; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 3abcb2d1057ae9..2a03e428619b30 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -424,7 +424,7 @@ CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m); template struct CAFFE2_EXPORT NodeHelper {}; -struct CAFFE2_API NNNodeMatchCriteria { +struct NNNodeMatchCriteria { std::function predicate; std::string debugString; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index 66494dba3275ce..a303324fbb5701 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -29,7 +29,7 @@ namespace matcher { */ template -class CAFFE2_API MatchNode { +class MatchNode { public: static const int kStarCount = -1; diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index f353e8d6601fab..7db975077ea8b9 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -35,7 +35,7 @@ namespace caffe2 { * objects. */ template -class CAFFE2_API Registry { +class Registry { public: typedef std::function Creator; @@ -104,7 +104,7 @@ class CAFFE2_API Registry { }; template -class CAFFE2_API Registerer { +class Registerer { public: Registerer( const SrcType& key, diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 920bffc0ae3bfb..2bb11cd22ad70e 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -130,7 +130,7 @@ __global__ void BroadcastBinaryOpCUDAKernel( } template -void BinaryOpWith2DBroadcasting( +CAFFE2_CUDA_EXPORT void BinaryOpWith2DBroadcasting( const int rows, const int cols, const bool rowwise_broadcast, @@ -177,7 +177,7 @@ void BinaryOpWith2DBroadcasting( } template -void BroadcastBinaryOpImpl( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOpImpl( const int* A_dims, const int* B_dims, const int* C_dims, @@ -212,7 +212,7 @@ void BroadcastBinaryOpImpl( } template -void BroadcastBinaryOp( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOp( const int A_ndim, const int* A_dims, const int B_ndim, @@ -294,7 +294,7 @@ void BroadcastBinaryOp( } \ } \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, const T* x, T* y, CUDAContext* context) { \ Func##CUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -362,7 +362,7 @@ DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv, utils::Inv) #define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ template <> \ - void SinCos( \ + CAFFE2_CUDA_EXPORT void SinCos( \ const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ SinCosCUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -376,7 +376,7 @@ CAFFE2_SPECIALIZED_CUDA_SINCOS(double) #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, \ const TIn* A, \ const TIn* B, \ @@ -444,7 +444,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( #define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -463,7 +463,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( 
context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -482,7 +482,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -501,7 +501,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -573,7 +573,7 @@ DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int A_ndim, \ const int* A_dims, \ const int B_ndim, \ @@ -638,7 +638,7 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ template <> \ - void Funcname( \ + CAFFE2_CUDA_EXPORT void Funcname( \ const int N, \ const T* src, \ T* dst, \ @@ -669,7 +669,7 @@ DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -710,7 +710,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -781,7 +781,7 @@ void Gemm( } template <> -void BiasCHW( +CAFFE2_CUDA_EXPORT void BiasCHW( const float* bias, const float* bias_multiplier, const int bias_channels, @@ -803,7 +803,7 @@ void BiasCHW( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -869,7 +869,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -930,7 +930,7 @@ void GemmStridedBatched( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1059,7 +1059,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1168,7 +1168,7 @@ void GemmStridedBatched( // No change, but required. 
Defer to default CUDA engine template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1186,7 +1186,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1245,7 +1245,7 @@ void Gemm( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1282,7 +1282,7 @@ void GemmStridedBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1321,7 +1321,7 @@ void GemmStridedBatched( #endif // CUDA_VERSION >= 9000 template <> -void GemmEx( +CAFFE2_CUDA_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1362,7 +1362,7 @@ void GemmEx( } template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1415,7 +1415,7 @@ __global__ void AddStripedBatchKernel( #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ template <> \ - void AddStripedBatch( \ + CAFFE2_CUDA_EXPORT void AddStripedBatch( \ const int N, \ const T* first, \ T* Y, \ @@ -1434,7 +1434,7 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16); #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1514,7 +1514,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) { #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ template <> \ - void Set( \ + CAFFE2_CUDA_API void Set( \ const size_t N, const T alpha, T* Y, CUDAContext* context) { \ if (N == 0) { \ return; \ @@ -1542,7 +1542,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); #undef CAFFE2_SPECIALIZED_CUDA_SET template <> -void Set( +CAFFE2_CUDA_EXPORT void Set( const size_t N, const float16 alpha, float16* Y, @@ -1577,7 +1577,7 @@ UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { } // namespace template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const float min, const float max, @@ -1592,7 +1592,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const double min, const double max, @@ -1608,7 +1608,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const int min, const int max, @@ -1642,7 +1642,7 @@ size_t HandleOddLengthRandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const float mean, const float std, @@ -1658,7 +1658,7 @@ void RandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const double mean, const double std, @@ -1671,7 +1671,7 @@ void RandGaussian( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float* a, const float* b, @@ -1683,7 +1683,7 @@ void Dot( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float16* a, const float16* b, @@ -1760,7 +1760,7 @@ __global__ void SumConvertKernel(float* sum, T* dest) { } template -void SumGenericIter( +CAFFE2_CUDA_EXPORT void SumGenericIter( const int N, IterT it, T*& dest, @@ -1789,7 +1789,7 @@ void SumGenericIter( } // namespace template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const float* 
x, float* y, @@ -1804,7 +1804,7 @@ void Sum( } template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const int32_t* x, int32_t* y, @@ -1829,7 +1829,7 @@ struct FloatTransform { #define CAFFE2_MATH_SUM_FUNC(T) \ template <> \ - void Sum( \ + CAFFE2_CUDA_EXPORT void Sum( \ const int N, \ const T* x, \ T* y, \ @@ -1861,7 +1861,7 @@ struct SqrTransform { } // namespace template <> -void SumSqr( +CAFFE2_CUDA_EXPORT void SumSqr( const int N, const float* x, float* y, @@ -1880,7 +1880,7 @@ void SumSqr( #define CAFFE2_MATH_SUMSQR_FUNC(T) \ template <> \ - void SumSqr( \ + CAFFE2_CUDA_EXPORT void SumSqr( \ const int N, \ const T* x, \ T* y, \ @@ -1920,7 +1920,7 @@ SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { } // namespace template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float* x, @@ -1935,7 +1935,7 @@ void Select( } template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float16* x, @@ -1985,7 +1985,7 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { } // namespace template <> -void Powx( +CAFFE2_CUDA_EXPORT void Powx( const int N, const float* a, const float b, @@ -2000,7 +2000,7 @@ void Powx( #define DELEGATE_CUBLAS_SCALE_FUNCTION(TAlpha, TData, CuBLASFunc) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2024,7 +2024,7 @@ void Powx( } \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2051,7 +2051,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) #define CAFFE2_SPECIALIZED_CUDA_SCALE(TAlpha, TData) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2078,7 +2078,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) context->cuda_stream()>>>(N, alpha, x, y); \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2098,7 +2098,7 @@ CAFFE2_SPECIALIZED_CUDA_SCALE(std::int64_t, std::int64_t) #undef CAFFE2_SPECIALIZED_CUDA_SCALE template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16 alpha, const float16* x, @@ -2129,7 +2129,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16* alpha, const float16* x, @@ -2160,7 +2160,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float alpha, const float16* x, @@ -2193,7 +2193,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float* alpha, const float16* x, @@ -2224,7 +2224,7 @@ void Scale( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float* X, @@ -2236,7 +2236,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const double* X, @@ -2250,7 +2250,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float16* X, @@ -2273,7 +2273,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float* X, @@ -2285,7 +2285,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float16* X, @@ -2379,7 +2379,7 @@ __global__ void AxpbyCUDAKernel( #define 
CAFFE2_SPECIALIZED_CUDA_AXPBY(TCoeff, TData) \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff a, \ const TData* x, \ @@ -2393,7 +2393,7 @@ __global__ void AxpbyCUDAKernel( context->cuda_stream()>>>(n, a, x, b, y); \ } \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff* a, \ const TData* x, \ @@ -2468,7 +2468,7 @@ __global__ void Im2ColNCHWCUDAKernel( } template -__global__ void Im2ColNHWCCUDAKernel( +__global__ void Im2ColNHWCCUDAKernel( const int n, const int input_h, const int input_w, @@ -2519,7 +2519,7 @@ __global__ void Im2ColNHWCCUDAKernel( } template -__global__ void Col2ImNCHWCUDAKernel( +__global__ void Col2ImNCHWCUDAKernel( const int n, const int input_h, const int input_w, @@ -2574,7 +2574,7 @@ __global__ void Col2ImNCHWCUDAKernel( } template -__global__ void Col2ImNHWCCUDAKernel( +__global__ void Col2ImNHWCCUDAKernel( const int n, const int input_w, const int channels, @@ -2627,7 +2627,7 @@ __global__ void Col2ImNHWCCUDAKernel( } template -__global__ void Im2ColNdNCHWCUDAKernel( +__global__ void Im2ColNdNCHWCUDAKernel( const int outer_size, const int inner_size, const int kernel_size, @@ -2683,7 +2683,7 @@ __global__ void Im2ColNdNCHWCUDAKernel( } template -void Im2ColNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Im2ColNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2730,7 +2730,7 @@ void Im2ColNdNCHWCUDAImpl( } template -void Col2ImNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Col2ImNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2780,7 +2780,7 @@ void Col2ImNdNCHWCUDAImpl( } // namespace template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2826,7 +2826,7 @@ void Im2Col( } template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2874,7 +2874,7 @@ void Im2Col( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2920,7 +2920,7 @@ void Col2Im( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2968,7 +2968,7 @@ void Col2Im( } template <> -void Im2ColNd( +CAFFE2_CUDA_EXPORT void Im2ColNd( const int N, const int img_size, const int col_size, @@ -2999,7 +2999,7 @@ void Im2ColNd( } template <> -void Col2ImNd( +CAFFE2_CUDA_EXPORT void Col2ImNd( const int N, const int img_size, const int col_size, @@ -3030,7 +3030,7 @@ void Col2ImNd( } template <> -void CopyMatrix( +CAFFE2_CUDA_EXPORT void CopyMatrix( const size_t itemsize, const int M, const int N, @@ -3082,7 +3082,7 @@ CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(TIndex) #undef CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX template <> -void CopyVector( +CAFFE2_CUDA_EXPORT void CopyVector( const int N, const float* src, float* dst, @@ -3152,7 +3152,7 @@ __global__ void ColwiseReduceKernel( #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ template <> \ - void RowwiseMax( \ + CAFFE2_CUDA_EXPORT void RowwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ RowwiseReduceKernel<<< \ std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3166,7 +3166,7 @@ CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ template <> \ - void ColwiseMax( \ + CAFFE2_CUDA_EXPORT void ColwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ ColwiseReduceKernel<<< \ std::min(D, 
CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3188,7 +3188,7 @@ maximum_kernel(const int N, const float alpha, const float* x, float* y) { } // namespace template <> -void Maximum( +CAFFE2_CUDA_EXPORT void Maximum( const int N, const float alpha, const float* x, @@ -3241,7 +3241,7 @@ __global__ void ReduceTensorCUDAKernel( } template -void ReduceTensorCUDAImpl( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3275,7 +3275,7 @@ void ReduceTensorCUDAImpl( } template -void ReduceTensorCUDA( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3353,7 +3353,7 @@ void ReduceTensorCUDA( #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \ template <> \ - void ReduceMin( \ + CAFFE2_CUDA_EXPORT void ReduceMin( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3382,7 +3382,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \ template <> \ - void ReduceMax( \ + CAFFE2_CUDA_EXPORT void ReduceMax( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3411,7 +3411,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \ template <> \ - void ReduceSum( \ + CAFFE2_CUDA_EXPORT void ReduceSum( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3440,7 +3440,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \ template <> \ - void ReduceMean( \ + CAFFE2_CUDA_EXPORT void ReduceMean( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3496,7 +3496,7 @@ __global__ void BroadcastCUDAKernel( } template -void BroadcastCUDAImpl( +CAFFE2_CUDA_EXPORT void BroadcastCUDAImpl( const int X_ndim, const int* X_dims, const int* Y_dims, @@ -3534,7 +3534,7 @@ void BroadcastCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ template <> \ - void Broadcast( \ + CAFFE2_CUDA_EXPORT void Broadcast( \ const int X_ndim, \ const int* X_dims, \ const int Y_ndim, \ @@ -3676,7 +3676,7 @@ __global__ void MomentsCUDAKernel( } template -void MomentsCUDAImpl( +CAFFE2_CUDA_EXPORT void MomentsCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3700,7 +3700,7 @@ void MomentsCUDAImpl( } template -void MomentsCUDA( +CAFFE2_CUDA_EXPORT void MomentsCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3783,7 +3783,7 @@ void MomentsCUDA( #define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T) \ template <> \ - void Moments( \ + CAFFE2_CUDA_EXPORT void Moments( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3819,7 +3819,7 @@ DELEGATE_INV_STD_KERNEL_FUNCTION(float, rsqrtf) #define CAFFE2_SPECIALIZED_CUDA_INV_STD(T) \ template <> \ - void InvStd( \ + CAFFE2_CUDA_EXPORT void InvStd( \ const int N, \ const T epsilon, \ const T* var, \ @@ -3861,7 +3861,7 @@ __global__ void TransposeCUDAKernel( } template -void TransposeCUDAImpl( +CAFFE2_CUDA_EXPORT void TransposeCUDAImpl( const int* dims, const int* axes, const T* X, @@ -3886,7 +3886,7 @@ void TransposeCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \ template <> \ - void Transpose( \ + CAFFE2_CUDA_EXPORT void Transpose( \ const int ndim, \ const int* dims, \ const int* axes, \ @@ -3933,7 +3933,7 @@ __global__ void AffineChannelCUDAKernel( #define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T, kOrder) \ template <> \ - void AffineChannel( \ + CAFFE2_CUDA_EXPORT void AffineChannel( \ const int N, \ const int C, \ const int HxW, \ diff --git a/cmake/Dependencies.cmake 
b/cmake/Dependencies.cmake index dbec0d07ebf6e3..4fbf634cf7ac71 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -349,7 +349,9 @@ if(BUILD_PYTHON) execute_process( COMMAND "which" "python" RESULT_VARIABLE _exitcode OUTPUT_VARIABLE _py_exe) if(${_exitcode} EQUAL 0) - string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + if (NOT MSVC) + string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + endif() message(STATUS "Setting Python to ${PYTHON_EXECUTABLE}") endif() endif() @@ -388,7 +390,11 @@ if(BUILD_PYTHON) pycmd_no_exit(_py_lib _exitcode "from sysconfig import get_paths; print(get_paths()['stdlib'])") if("${_exitcode}" EQUAL 0 AND EXISTS "${_py_lib}" AND EXISTS "${_py_lib}") SET(PYTHON_LIBRARY "${_py_lib}") - message(STATUS "Setting Python's library to ${_py_lib}") + if (MSVC) + STRING(REPLACE "Lib" "libs" _py_static_lib ${_py_lib}) + link_directories(${_py_static_lib}) + endif() + message(STATUS "Setting Python's library to ${PYTHON_LIBRARY}") endif() endif(NOT DEFINED PYTHON_LIBRARY) @@ -793,6 +799,11 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # We will build onnx as static libs and embed it directly into the binary. set(BUILD_SHARED_LIBS OFF) + if (MSVC AND BUILD_SHARED_LIBS) + # That also means we want to export all symbols from the shared + # library we are building + set(ONNX_BUILD_MAIN_LIB ON) + endif() set(ONNX_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) set(ONNX_USE_LITE_PROTO ${CAFFE2_USE_LITE_PROTO}) # If linking local protobuf, make sure ONNX has the same protobuf diff --git a/setup.py b/setup.py index 37eaf51f0d3908..5fbe75833da0fb 100644 --- a/setup.py +++ b/setup.py @@ -875,12 +875,6 @@ def run(self): CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib')) if USE_ROCM: CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib')) - # Windows needs direct access to ONNX libraries as well - # as through Caffe2 library - CAFFE2_LIBS += [ - os.path.join(lib_path, 'onnx.lib'), - os.path.join(lib_path, 'onnx_proto.lib'), - ] if DEBUG: PROTOBUF_STATIC_LIB = os.path.join(lib_path, 'libprotobufd.lib') else: diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 34d7ea1ac0d206..acc5bed4a98bfb 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -176,7 +176,7 @@ goto:eof -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ -DNO_API=ON ^ -DBUILD_SHARED_LIBS="%BUILD_SHARED_LIBS%" ^ - -DBUILD_PYTHON=OFF ^ + -DBUILD_PYTHON=%BUILD_PYTHON% ^ -DBUILD_BINARY=%BUILD_BINARY% ^ -DBUILD_TEST=OFF ^ -DINSTALL_TEST=%INSTALL_TEST% ^ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 2c1ff700dd2858..58814d21ffa301 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -343,6 +343,7 @@ if(USE_CUDA) set(TORCH_CUDA_LIBRARIES ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib ${CUDA_LIBRARIES}) + set_target_properties(torch PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") target_include_directories(torch PRIVATE "${NVTOOLEXT_HOME}/include") elseif(APPLE) set(TORCH_CUDA_LIBRARIES From 3cbec5453ba92d1f6def57f958b2b8bb14da1ce3 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Mon, 17 Sep 2018 21:47:00 -0700 Subject: [PATCH 212/237] Reorder statements for readability (#11764) Summary: I was reading this a couple times before figuring out it's also the entry point for the MPI_COMM_WORLD. Reordered statements and added comment to clarify. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11764 Differential Revision: D9882834 Pulled By: pietern fbshipit-source-id: a9282d55368815925fd695a2541354e5aec599da --- torch/lib/c10d/ProcessGroupMPI.cpp | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 03699962c15b59..63846b443ea072 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -253,33 +253,30 @@ std::shared_ptr ProcessGroupMPI::createProcessGroupMPI( MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &size)); MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - globalLock.unlock(); - if (rank < 0 || size < 0) { throw std::runtime_error("Failed to get the world_size / rank"); } + // If no ranks are specified, assume we're creating the root group if (ranks.empty()) { + globalLock.unlock(); return std::make_shared(rank, size, MPI_COMM_WORLD); - } else { - std::unique_lock globalLock(pgGlobalMutex_); + } - MPI_Group worldGroup; - MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); + MPI_Group worldGroup; + MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); - MPI_Group ranksGroup; - MPI_CHECK( - MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup)); + MPI_Group ranksGroup; + MPI_CHECK(MPI_Group_incl(worldGroup, ranks.size(), ranks.data(), &ranksGroup)); - MPI_Comm groupComm; - MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)); + MPI_Comm groupComm; + MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, ranksGroup, &groupComm)); - MPI_CHECK(MPI_Group_free(&worldGroup)); - MPI_CHECK(MPI_Group_free(&ranksGroup)); + MPI_CHECK(MPI_Group_free(&worldGroup)); + MPI_CHECK(MPI_Group_free(&ranksGroup)); - globalLock.unlock(); - return std::make_shared(rank, size, groupComm); - } + globalLock.unlock(); + return std::make_shared(rank, size, groupComm); } ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pgComm) From a02685e1092d264b34d4511b3a5a12c8ec3840e4 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Mon, 17 Sep 2018 21:47:43 -0700 Subject: [PATCH 213/237] Fix test_torch's test_potri (#11770) Summary: tset_potri -> test_potri, even though it has been like this for a long time More a curiosity than grave functionality... 
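For context, `torch.potri` reconstructs the inverse of a symmetric positive-definite matrix from its Cholesky factor, which is what the (now actually executed) test compares against `torch.inverse`. A minimal sketch of that check, using a made-up well-conditioned input rather than the fixed matrix from the test:
```
import torch

a = torch.randn(5, 5, dtype=torch.float64)
a = torch.mm(a, a.t()) + torch.eye(5, dtype=torch.float64)  # symmetric positive definite

chol = torch.potrf(a, True)              # upper-triangular Cholesky factor of `a`
inv_from_chol = torch.potri(chol, True)  # inverse of `a`, rebuilt from the factor

print(torch.allclose(inv_from_chol, torch.inverse(a)))  # True
```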
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11770 Reviewed By: ezyang Differential Revision: D9884767 Pulled By: soumith fbshipit-source-id: 9bedde2e94ade281ab1ecc2293ca3cb1a0107387 --- test/test_torch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index fbabc12d32e051..ad803347317d80 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5233,7 +5233,7 @@ def test_potrs(self): self.assertLessEqual(b.dist(torch.mm(a, x)), 1e-12) @skipIfNoLapack - def tset_potri(self): + def test_potri(self): a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), (-6.05, -3.30, 5.36, -4.44, 1.08), (-0.45, 2.58, -2.70, 0.27, 9.04), @@ -5241,7 +5241,7 @@ def tset_potri(self): (-9.67, -5.14, -7.26, 6.08, -6.87))).t() # make sure 'a' is symmetric PSD - a = a * a.t() + a = torch.mm(a, a.t()) # compute inverse directly inv0 = torch.inverse(a) @@ -5252,13 +5252,13 @@ def tset_potri(self): self.assertLessEqual(inv0.dist(inv1), 1e-12) # upper Triangular Test - chol = torch.potrf(a, 'U') - inv1 = torch.potri(chol, 'U') + chol = torch.potrf(a, True) + inv1 = torch.potri(chol, True) self.assertLessEqual(inv0.dist(inv1), 1e-12) # lower Triangular Test - chol = torch.potrf(a, 'L') - inv1 = torch.potri(chol, 'L') + chol = torch.potrf(a, False) + inv1 = torch.potri(chol, False) self.assertLessEqual(inv0.dist(inv1), 1e-12) @skipIfNoLapack From bd43d64dd5b7e96b70e84add0a2d6c5256632a25 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 17 Sep 2018 21:57:37 -0700 Subject: [PATCH 214/237] Add strides to Tensor (#11763) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11763 baseline-std vector ``` ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 6.74us 148.26K TensorShareData 5.89us 169.78K TensorShareExternalPointer 1.01us 994.35K TensorReallocation 2.46us 405.78K ============================================================================ ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 7.50us 133.27K TensorShareData 7.07us 141.38K TensorShareExternalPointer 1.05us 955.19K TensorReallocation 2.55us 391.62K ============================================================================ ``` baseline-smallvector ``` ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 6.56us 152.34K TensorShareData 5.84us 171.32K TensorShareExternalPointer 962.49ns 1.04M TensorReallocation 2.32us 431.73K ============================================================================ ============================================================================ caffe2/caffe2/fb/benchmarks/core_overhead_benchmark.ccrelative time/iter iters/s ============================================================================ TensorConstructionDestruction 6.29us 159.04K TensorShareData 5.73us 174.39K TensorShareExternalPointer 914.90ns 1.09M TensorReallocation 2.29us 435.80K 
============================================================================ ``` Reviewed By: ezyang Differential Revision: D9694097 fbshipit-source-id: c462e770a4b40e640d8c9d38e0ae7036a4e6e84a --- aten/src/ATen/DimVector.h | 11 +----- aten/src/ATen/core/DimVector.h | 11 ++++++ caffe2/core/tensor.h | 12 ++++++ caffe2/core/tensor_impl.h | 68 ++++++++++++++++++++++++++++++++-- 4 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 aten/src/ATen/core/DimVector.h diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h index 9bbf37b5a9f4d4..cb652fffcb1481 100644 --- a/aten/src/ATen/DimVector.h +++ b/aten/src/ATen/DimVector.h @@ -1,11 +1,2 @@ #pragma once - -#include -#include - -namespace at { - -/// A container for sizes or strides -using DimVector = SmallVector; - -} +#include diff --git a/aten/src/ATen/core/DimVector.h b/aten/src/ATen/core/DimVector.h new file mode 100644 index 00000000000000..a98c841a94777b --- /dev/null +++ b/aten/src/ATen/core/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} // namespace at diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index a167658060e7fc..27c09f00c4c1e2 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -286,6 +286,18 @@ class CAFFE2_API Tensor final { return impl_.get()->canonical_axis_index(axis_index); } + inline int64_t stride(int64_t dim) const { + return impl_.get()->stride(dim); + } + + inline at::DimVector strides() { + return impl_.get()->strides(); + } + + inline bool is_contiguous() const { + return impl_.get()->is_contiguous(); + } + template inline bool IsType() const { return impl_.get()->IsType(); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 3cd5bd23d446fb..3f42ed36b30954 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -1,7 +1,7 @@ #pragma once +#include #include - #include #include "caffe2/core/allocator.h" @@ -133,12 +133,17 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return; } if (data_type_ != src.meta()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); storage_ = at::Storage(GetDeviceType(), src.meta()); data_type_ = src.meta(); } if (src.size() == -1) { dims_.clear(); numel_ = -1; + strides_.clear(); + is_contiguous_ = true; storage_.reset(); data_type_ = TypeMeta(); return; @@ -203,6 +208,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); auto newDims = dims_; newDims[0] += num; if (!storage_.data()) { @@ -243,6 +251,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * that the extra capacity after the end of the shurnk tensor is maintained. 
*/ void ShrinkTo(TIndex outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShrinkTo is only supported on contiguous Tensor."); CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); CAFFE_ENFORCE_WITH_CALLER( outer_dim <= dims_[0], @@ -266,6 +277,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { */ template void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); CAFFE_ENFORCE( numel_ != -1, "size should be initialized before calling ReserveSpace"); CAFFE_ENFORCE( @@ -335,6 +349,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * sugar wrapper that essentially calls Resize(src_tensor.dims()). */ inline void ResizeLike(const TensorImpl& src_tensor) { + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); // Note: need casting for different context types. if (static_cast(this) != static_cast(&src_tensor)) { Resize(src_tensor.dims()); @@ -346,6 +363,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { * This requires the total size of the tensor to remains constant. */ inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); TIndex new_size = 1; for (auto d : dims) { CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); @@ -454,6 +474,9 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const TypeMeta& data_type, size_t capacity = 0, MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShareExternalPointer is only supported for contiguos Tensor."); CAFFE_ENFORCE_WITH_CALLER( data_type.id() != TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to pass in an " @@ -706,6 +729,25 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { return canonical_axis_index_(axis_index, ndim()); } + inline int64_t stride(int64_t dim) const { +#ifndef NDEBUG + // TODO: dim wrapping? + CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER( + dim, 0, "Cannot have negative dimension index"); +#endif + return strides_[dim]; + } + + // TODO: Change to ArrayRef later + inline at::DimVector strides() { + return strides_; + } + + inline bool is_contiguous() const { + return is_contiguous_; + } + /** * Checks if the tensor content is of the given data type. 
*/ @@ -772,9 +814,11 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } protected: - using DimVector = std::vector; - DimVector dims_; // sizes_ + // TODO: change to DimVector + std::vector dims_; // sizes_ + at::DimVector strides_; TIndex numel_ = -1; // numel_ + bool is_contiguous_ = true; // we decide to keep reserved_ and it will // live in Tensor after the split // The logic is that if Extend() or ReserveSpace() were ever called, @@ -796,6 +840,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { new_numel *= src[i]; dims_[i] = src[i]; } + update_strides(); numel_ = new_numel; return numel_ != old_numel; } @@ -803,6 +848,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { bool SetDims() { auto old_numel = numel_; dims_.resize(0); + update_strides(); numel_ = 1; return numel_ != old_numel; } @@ -814,6 +860,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { auto old_numel = numel_; dims_.resize(1); dims_[0] = d0; + update_strides(); numel_ = d0; return numel_ != old_numel; } @@ -823,6 +870,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_.resize(2); dims_[0] = d0; dims_[1] = d1; + update_strides(); numel_ = d0 * d1; return numel_ != old_numel; } @@ -833,6 +881,7 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_[0] = d0; dims_[1] = d1; dims_[2] = d2; + update_strides(); numel_ = d0 * d1 * d2; return numel_ != old_numel; } @@ -845,9 +894,22 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { dims_[1] = d1; dims_[2] = d2; dims_[3] = d3; + update_strides(); numel_ = d0 * d1 * d2 * d3; return numel_ != old_numel; } + + inline void update_strides() { + strides_.resize(dims_.size()); + if (ndim() > 0) { + int last_idx = ndim() - 1; + strides_[last_idx] = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + strides_[i] = strides_[i + 1] * std::max(dims_[i + 1], 1); + } + } + is_contiguous_ = true; + } }; } From 63c811b3a68d9c23addc219cffb691678b7407c2 Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Mon, 17 Sep 2018 23:15:57 -0700 Subject: [PATCH 215/237] Include some JIT things in C++ docs (#11712) Summary: Since we're making parts of the JIT public as part of loading script modules, they should be on the cppdocs website. Orthogonal: We decided not to export things like `IValue` into the `torch` namespace, so `RegisterOperators` shouldn't be there either. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11712 Differential Revision: D9837578 Pulled By: goldsborough fbshipit-source-id: 4c06d2fa9dd4b4216951f27424c2ce795febab9c --- docs/cpp/Doxyfile | 4 ++++ docs/cpp/check-doxygen.sh | 5 +++-- test/custom_operator/op.cpp | 2 +- torch/csrc/jit/import.h | 11 +++++++++-- torch/script.h | 5 ----- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile index 3c942a9025c2dd..37abadc83ff7ca 100644 --- a/docs/cpp/Doxyfile +++ b/docs/cpp/Doxyfile @@ -752,6 +752,10 @@ WARN_LOGFILE = INPUT = ../../torch/csrc/api/include \ ../../torch/csrc/api/src \ + ../../torch/csrc/jit/custom_operator.h \ + ../../torch/csrc/jit/import.h \ + ../../torch/csrc/jit/ivalue.h \ + ../../torch/csrc/jit/script/module.h \ ../../aten/src/ATen/ATen.h \ ../../aten/src/ATen/Backend.h \ ../../aten/src/ATen/Device.h \ diff --git a/docs/cpp/check-doxygen.sh b/docs/cpp/check-doxygen.sh index 375347974ac749..0af06ad2350c39 100755 --- a/docs/cpp/check-doxygen.sh +++ b/docs/cpp/check-doxygen.sh @@ -21,11 +21,12 @@ ignore_warning "warning: source ../../build/aten/src/ is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Tensor.h is not a readable file" ignore_warning "warning: source ../../build/aten/src/ATen/Functions.h is not a readable file" ignore_warning "warning: documented symbol \`torch::nn::FunctionalImpl::FunctionalImpl' was not declared or defined" +ignore_warning "functional.h:81: warning: Found ';' while parsing initializer list!" # Count the number of remaining warnings. -warnings=$(grep 'warning:' doxygen-log.txt | wc -l) +warnings="$(grep 'warning:' doxygen-log.txt | wc -l)" -if [[ $warnings != 0 ]]; then +if [[ "$warnings" -ne "0" ]]; then echo "Filtered output" cat doxygen-log.txt rm -f doxygen-log.txt original-doxygen-log.txt diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index f48189728ea293..c7f7ee1e7ab3f2 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -16,7 +16,7 @@ std::vector custom_op( } static auto registry = - torch::RegisterOperators() + torch::jit::RegisterOperators() // We parse the schema for the user. .op("custom::op", &custom_op) // User provided schema. Among other things, allows defaulting values, diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index 21f9d605ec59b3..6ce901c4369961 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -3,7 +3,8 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/script/module.h" -namespace torch { namespace jit { +namespace torch { +namespace jit { using ModuleLookup = std::function( const std::vector&)>; @@ -12,6 +13,12 @@ TORCH_API void import_ir_module( ModuleLookup module_lookup, const std::string& filename); +/// Loads a serialized `script::Module` from the given `filename`. +/// +/// The file stored at the location given in `filename` must contain a +/// serialized `script::Module`, exported either via `ScriptModule.save()` in +/// Python or `torch::jit::ExportModule` in C++. 
TORCH_API std::shared_ptr load(const std::string& filename); -}} +} // namespace jit +} // namespace torch diff --git a/torch/script.h b/torch/script.h index 9033857d672925..01f9e38cd7766d 100644 --- a/torch/script.h +++ b/torch/script.h @@ -6,8 +6,3 @@ #include #include - -namespace torch { -using jit::createOperator; -using jit::RegisterOperators; -} // namespace torch From 407a9fee0c8fb172459ac9b3556419305e47f8a7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 17 Sep 2018 23:24:49 -0700 Subject: [PATCH 216/237] make copy constructed tensor a leaf variable when using torch.tensor(sourceTensor) (#11061) Summary: - fix https://github.com/pytorch/pytorch/issues/10876 - the cause of the bug is because copy constructor cannot distinguish between default value of requires_grad and requires_grad=False, thus it makes a copy from source tensor along with its grad_fn if requires_grad=True at source - with this fix, the behavior becomes ``` >>> source = torch.randn(2, 2, requires_grad=True) >>> copy = torch.tensor(source, requires_grad=True) >>> print(copy) tensor([[-1.2001, 1.9869], [-1.0134, 1.3096]], grad_fn=) >>> source = torch.randn(2, 2, requires_grad=True) >>> copy = torch.tensor(source, requires_grad=False) >>> print(copy) tensor([[-0.7402, 0.0467], [ 0.4344, -0.0420]]) >>> source = torch.randn(2, 2, requires_grad=True) >>> copy = torch.tensor(source) >>> print(copy) tensor([[-0.7402, 0.0467], [ 0.4344, -0.0420]]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/11061 Differential Revision: D9569714 Pulled By: weiyangfb fbshipit-source-id: ea368688bdc0f1ce5997870e164e42835b64b4a1 --- test/test_torch.py | 25 +++++++++++++++++++++++++ torch/_torch_docs.py | 7 +++++++ torch/csrc/utils/tensor_new.cpp | 16 +++++++++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/test/test_torch.py b/test/test_torch.py index ad803347317d80..8fcca02207c89c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2396,6 +2396,31 @@ def test_tensor_factory(self): a[0] = 7. self.assertEqual(5., res1[0].item()) + def test_tensor_factory_copy_var(self): + # default copy from var + source = torch.randn(5, 5, requires_grad=True) + copy = torch.tensor(source) + self.assertEqual(copy.data, source.data) + self.assertTrue(source.requires_grad) + self.assertTrue(copy.is_leaf) + self.assertFalse(copy.requires_grad) + + # copy with requires_grad=False + source = torch.randn(5, 5, requires_grad=True) + copy = torch.tensor(source, requires_grad=False) + self.assertEqual(copy.data, source.data) + self.assertTrue(source.requires_grad) + self.assertTrue(copy.is_leaf) + self.assertFalse(copy.requires_grad) + + # copy with requires_grad=True + source = torch.randn(5, 5, requires_grad=True) + copy = torch.tensor(source, requires_grad=True) + self.assertEqual(copy.data, source.data) + self.assertTrue(source.requires_grad) + self.assertTrue(copy.is_leaf) + self.assertTrue(copy.requires_grad) + def test_tensor_factory_type_inference(self): def test_inference(default_dtype): saved_dtype = torch.get_default_dtype() diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 64d93a5916ab16..432cb64d722765 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3818,6 +3818,13 @@ def parse_kwargs(desc): If you have a NumPy ``ndarray`` and want to avoid a copy, use :func:`torch.from_numpy`. +.. warning:: + + When data is a tensor `x`, :func:`torch.tensor` reads out 'the data' from whatever it is passed, + and constructs a leaf variable. 
Therefore ``torch.tensor(x)`` is equivalent to ``x.clone().detach()`` + and ``torch.tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``. + The equivalents use ``clone()`` and ``detach()`` are recommended. + Args: {data} {dtype} diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index ebed9c07fad7a7..1b6ec75d13ebc2 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -197,7 +197,7 @@ void recursive_store(char* data, IntList sizes, IntList strides, int64_t dim, Tensor internal_new_from_data(const Type & type, at::optional device_opt, PyObject* data, bool copy_variables, bool copy_numpy, - bool type_inference) { + bool type_inference, bool args_requires_grad=false) { int32_t device_index = -1; if (device_opt.has_value()) { device_index = device_opt->index(); @@ -207,6 +207,10 @@ Tensor internal_new_from_data(const Type & type, at::optional device_opt } if (THPVariable_Check(data)) { + PyErr_WarnEx(PyExc_UserWarning, + "To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() " + "or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).", 1); + auto var = reinterpret_cast(data)->cdata; auto type_inference_device_type = device_opt.has_value() ? device_opt->type() : torch::getDeviceType(var.type()); @@ -216,8 +220,11 @@ Tensor internal_new_from_data(const Type & type, at::optional device_opt *torch::getLayout(type.backend()), type_inference_device_type); const auto& type_to_use = type_inference ? type_inference_type : type; - return copy_variables ? new_with_tensor_copy(type_to_use, var, device_index) : + auto new_tensor = copy_variables ? new_with_tensor_copy(type_to_use, var, device_index) : new_with_type_conversion(type_to_use, var, device_index); + new_tensor.detach_(); // making copy constructed tensor a leaf node + new_tensor.set_requires_grad(args_requires_grad); + return new_tensor; } #ifdef USE_NUMPY @@ -500,13 +507,16 @@ Tensor tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { bool type_inference = r.isNone(1); + // args_requires_grad=True if requires_grad is set to True (requires_grad=True) + bool args_requires_grad = r.toBool(3); return internal_new_from_data( typeWithDefault(r, 1, 2, type), r.deviceOptional(2), r.pyobject(0), true, true, - type_inference) + type_inference, + args_requires_grad) .set_requires_grad(r.toBool(3)); } throw std::runtime_error("tensor(): invalid arguments"); From e734c94fa2d4399d33b37a3dcc9096a0dbb776e5 Mon Sep 17 00:00:00 2001 From: Marc Ferradou Date: Mon, 17 Sep 2018 23:44:09 -0700 Subject: [PATCH 217/237] Quick update to embedding_bag doc (#11784) Summary: Related to #11624 adding maxes to the function def of embedding_bag. 
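A minimal sketch of the 'max' mode that the updated wording now covers; the table, indices and offsets below are made up purely for illustration:
```
import torch
import torch.nn.functional as F

weight = torch.randn(10, 3)                      # embedding table: 10 vectors of size 3
input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])   # flat indices
offsets = torch.tensor([0, 4])                   # two bags: input[0:4] and input[4:8]

out = F.embedding_bag(input, weight, offsets, mode='max')
print(out.shape)  # torch.Size([2, 3]) -- per-bag, feature-wise max instead of sum/mean
```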
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11784 Differential Revision: D9892598 Pulled By: ezyang fbshipit-source-id: e6372ccf631826ddf1e1885b2f8f75f354a36c0b --- torch/nn/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 61c0fdce5fe048..3c67393a01119b 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1206,7 +1206,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2, scale_grad_by_freq=False, mode='mean', sparse=False): - r"""Computes sums or means of 'bags' of embeddings, without instantiating the + r"""Computes sums, means or maxes of 'bags' of embeddings, without instantiating the intermediate embeddings. See :class:`torch.nn.EmbeddingBag` for more details. From 91b6458e2d0dba935da2cc7c2cdc6d7907bc3f48 Mon Sep 17 00:00:00 2001 From: nehz Date: Tue, 18 Sep 2018 01:11:55 -0700 Subject: [PATCH 218/237] Container __getitem__ slicing for subclasses (#11694) Summary: Simple change to allow ModuleList subclasses's `__getitem__(slice)` to return class of subclass rather than ModuleList Pull Request resolved: https://github.com/pytorch/pytorch/pull/11694 Differential Revision: D9892824 Pulled By: ezyang fbshipit-source-id: b75e9c196487f55cb93f0dab6c20d850e8e759ff --- torch/nn/modules/container.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 01e12e621ba4ac..15590dbe5c9a92 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -63,7 +63,7 @@ def _get_item_by_idx(self, iterator, idx): def __getitem__(self, idx): if isinstance(idx, slice): - return Sequential(OrderedDict(list(self._modules.items())[idx])) + return self.__class__(OrderedDict(list(self._modules.items())[idx])) else: return self._get_item_by_idx(self._modules.values(), idx) @@ -132,7 +132,7 @@ def _get_abs_string_index(self, idx): def __getitem__(self, idx): if isinstance(idx, slice): - return ModuleList(list(self._modules.values())[idx]) + return self.__class__(list(self._modules.values())[idx]) else: return self._modules[self._get_abs_string_index(idx)] @@ -334,7 +334,7 @@ def __init__(self, parameters=None): def __getitem__(self, idx): if isinstance(idx, slice): - return ParameterList(list(self._parameters.values())[idx]) + return self.__class__(list(self._parameters.values())[idx]) else: idx = operator.index(idx) if not (-len(self) <= idx < len(self)): From e2bc95e1bd39f4cd1813a21290f2f8f6fb51c0eb Mon Sep 17 00:00:00 2001 From: Xingdong Zuo Date: Tue, 18 Sep 2018 07:33:31 -0700 Subject: [PATCH 219/237] add `ModuleList.insert` (#11664) Summary: fixes #11652 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11664 Differential Revision: D9892845 Pulled By: ezyang fbshipit-source-id: 2c910d6bc0b28a999e25beca6e398fd0f35535c5 --- test/test_nn.py | 3 +++ torch/nn/modules/container.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/test/test_nn.py b/test/test_nn.py index 29e7c6ba8ebcc9..abeddaa2f0cc9a 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1137,6 +1137,9 @@ def check(): modules += [nn.Conv2d(3, 4, 3)] module_list += [modules[-1]] check() + modules.insert(1, nn.Linear(3, 2)) + module_list.insert(1, modules[1]) + check() modules.append(nn.Tanh()) module_list.append(modules[-1]) check() diff --git a/torch/nn/modules/container.py 
b/torch/nn/modules/container.py index 15590dbe5c9a92..dc8f61c2309c21 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -164,6 +164,17 @@ def __dir__(self): keys = [key for key in keys if not key.isdigit()] return keys + def insert(self, index, module): + r"""Insert a given module before a given index in the list. + + Arguments: + index (int): index to insert. + module (nn.Module): module to insert + """ + for i in range(len(self._modules), index, -1): + self._modules[str(i)] = self._modules[str(i - 1)] + self._modules[str(index)] = module + def append(self, module): r"""Appends a given module to the end of the list. From 4ee0a78ee6452b7c1584b6020aef791b9d3e8ab6 Mon Sep 17 00:00:00 2001 From: Amitesh Arora Date: Tue, 18 Sep 2018 07:36:15 -0700 Subject: [PATCH 220/237] varargs for meshgrid (#11600) Summary: Adds vararg support for meshgrid and adds checks for all the tensor arguments to have the same dtype and device. Fixes: [#10823](https://github.com/pytorch/pytorch/issues/10823), #11446 The earlier pull request closed without any changes because I had some rebasing issues, so I made another pull request to close out #10823. Sorry for the inconvenience. Differential Revision: D9892876 Pulled By: ezyang fbshipit-source-id: 93d96cafc876102ccbad3ca2cc3d81cb4c9bf556 --- aten/src/ATen/native/TensorShape.cpp | 4 +++ test/test_torch.py | 7 +++++ torch/_torch_docs.py | 33 ------------------------ torch/functional.py | 38 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c22374d0964e81..634e7a443d21fd 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -596,6 +596,10 @@ std::vector meshgrid(TensorList tensors) { AT_ERROR("Expected scalar or 1D tensor in the tensor list but got: ", tensors[i]); } } + for(int64_t i = 0; i < size - 1; i++){ + AT_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); + AT_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); + } std::vector grids; for(int64_t i = 0; i < size; i++) { std::vector view_shape(size, 1); diff --git a/test/test_torch.py b/test/test_torch.py index 8fcca02207c89c..9a67440a9961a0 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -8899,6 +8899,10 @@ def test_meshgrid(self): self.assertEqual(grid_a.shape, torch.Size([1, 3, 2])) self.assertEqual(grid_b.shape, torch.Size([1, 3, 2])) self.assertEqual(grid_c.shape, torch.Size([1, 3, 2])) + grid_a2, grid_b2, grid_c2 = torch.meshgrid(a, b, c) + self.assertEqual(grid_a2.shape, torch.Size([1, 3, 2])) + self.assertEqual(grid_b2.shape, torch.Size([1, 3, 2])) + self.assertEqual(grid_c2.shape, torch.Size([1, 3, 2])) expected_grid_a = torch.ones(1, 3, 2, dtype=torch.int64) expected_grid_b = torch.tensor([[[1, 1], [2, 2], @@ -8909,6 +8913,9 @@ def test_meshgrid(self): self.assertTrue(grid_a.equal(expected_grid_a)) self.assertTrue(grid_b.equal(expected_grid_b)) self.assertTrue(grid_c.equal(expected_grid_c)) + self.assertTrue(grid_a2.equal(expected_grid_a)) + self.assertTrue(grid_b2.equal(expected_grid_b)) + self.assertTrue(grid_c2.equal(expected_grid_c)) @unittest.skipIf(torch.cuda.is_available(), "CUDA is available, can't test CUDA not built error") def test_cuda_not_built(self): diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 432cb64d722765..9abb9f1bbf76d3 100644 --- 
a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -5997,36 +5997,3 @@ def parse_kwargs(desc): >>> [7, 8, 9]])) (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) """) - - -add_docstr(torch.meshgrid, - r""" -meshgrid(seq) -> seq - -Take a sequence of :math:`N` tensors, each of which can be either scalar or 1-dimensional -vector, and create :math:`N` N-dimensional grids, where the :math:`i` th grid is defined by -expanding the :math:`i` th input over dimensions defined by other inputs. - -Arguments: - seq (sequence of Tensors): sequence of scalars or 1 dimensional tensors. Scalars will be - treated as tensors of size :math:`(1,)` automatically. - -Returns: - seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, - then the output would also has :math:`k` tensors, where all tensors are - of size :math:`(N_1, N_2, \ldots , N_k)`. - -Example:: - - >>> x = torch.tensor([1, 2, 3]) - >>> y = torch.tensor([4, 5, 6]) - >>> grid_x, grid_y = torch.meshgrid([x, y]) - >>> grid_x - tensor([[1, 1, 1], - [2, 2, 2], - [3, 3, 3]]) - >>> grid_y - tensor([[4, 5, 6], - [4, 5, 6], - [4, 5, 6]]) -""") diff --git a/torch/functional.py b/torch/functional.py index e31903db5a7129..40c74ba59abd94 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -16,6 +16,7 @@ 'isfinite', 'isinf', 'isnan', + 'meshgrid', 'split', 'stft', 'tensordot', @@ -278,6 +279,43 @@ def isinf(tensor): return tensor.abs() == inf +def meshgrid(*tensors, **kwargs): + r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional +vector, and create :math:`N` N-dimensional grids, where the :math:`i`th grid is defined by +expanding the :math:`i`th input over dimensions defined by other inputs. + + + Args: + tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be + treated as tensors of size :math:`(1,)` automatically + + Returns: + seq (sequence of Tensors): If the input has :math:`k` tensors of size + :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also has :math:`k` tensors, + where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> y = torch.tensor([4, 5, 6]) + >>> grid_x, grid_y = torch.meshgrid(x, y) + >>> grid_x + tensor([[1, 1, 1], + [2, 2, 2], + [3, 3, 3]]) + >>> grid_y + tensor([[4, 5, 6], + [4, 5, 6], + [4, 5, 6]]) + """ + if kwargs: + raise TypeError("meshgrid() got an unexpected keyword argument '%s'" % (list(kwargs)[0],)) + if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): + # the old interface of passing the operands as one list argument + tensors = tensors[0] + return torch._C._VariableFunctions.meshgrid(tensors) + + def stft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, pad_mode='reflect', normalized=False, onesided=True): r"""Short-time Fourier transform (STFT). From e00fb69b251c7a1af2b172cc5c059945686b71b5 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Tue, 18 Sep 2018 07:59:41 -0700 Subject: [PATCH 221/237] Use CATCH prefix to avoid name conflicts with Caffe2. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11780 Differential Revision: D9889925 Pulled By: gchanan fbshipit-source-id: 5eca849c36ced00b8ae7482b7945b445a3e1687e --- aten/src/ATen/test/apply_test.cpp | 104 +++--- aten/src/ATen/test/apply_utils_test.cpp | 14 +- aten/src/ATen/test/basic.cpp | 152 ++++----- aten/src/ATen/test/broadcast_test.cpp | 86 ++--- aten/src/ATen/test/catch_utils.hpp | 8 + aten/src/ATen/test/cuda_half_test.cu | 6 +- aten/src/ATen/test/cuda_optional_test.cu | 12 +- .../test/cuda_packedtensoraccessor_test.cu | 8 +- aten/src/ATen/test/cuda_rng_test.cpp | 6 +- aten/src/ATen/test/cudnn_test.cpp | 10 +- aten/src/ATen/test/dlconvertor_test.cpp | 10 +- aten/src/ATen/test/half_test.cpp | 102 +++--- aten/src/ATen/test/integer_divider_test.cu | 24 +- aten/src/ATen/test/native_test.cpp | 58 ++-- aten/src/ATen/test/scalar_tensor_test.cpp | 76 ++--- aten/src/ATen/test/scalar_test.cpp | 54 ++-- aten/src/ATen/test/stream_test.cpp | 130 ++++---- aten/src/ATen/test/test_parallel.cpp | 6 +- aten/src/ATen/test/undefined_tensor_test.cpp | 48 +-- aten/src/ATen/test/weakref_test.cpp | 48 +-- aten/src/ATen/test/wrapdim_test.cpp | 30 +- test/cpp/api/any.cpp | 200 ++++++------ test/cpp/api/catch_utils.hpp | 8 + test/cpp/api/cursor.cpp | 276 ++++++++-------- test/cpp/api/integration.cpp | 14 +- test/cpp/api/jit.cpp | 12 +- test/cpp/api/main.cpp | 2 +- test/cpp/api/misc.cpp | 274 ++++++++-------- test/cpp/api/module.cpp | 250 +++++++-------- test/cpp/api/modules.cpp | 234 +++++++------- test/cpp/api/optim.cpp | 94 +++--- test/cpp/api/parallel.cpp | 120 +++---- test/cpp/api/rnn.cpp | 92 +++--- test/cpp/api/sequential.cpp | 150 ++++----- test/cpp/api/serialization.cpp | 84 ++--- test/cpp/api/tensor.cpp | 146 ++++----- test/cpp/api/tensor_cuda.cpp | 8 +- test/cpp/api/tensor_options.cpp | 52 +-- test/cpp/api/tensor_options_cuda.cpp | 44 +-- torch/csrc/jit/catch_utils.hpp | 8 + torch/csrc/jit/test_jit.cpp | 296 +++++++++--------- 41 files changed, 1690 insertions(+), 1666 deletions(-) create mode 100644 aten/src/ATen/test/catch_utils.hpp create mode 100644 test/cpp/api/catch_utils.hpp create mode 100644 torch/csrc/jit/catch_utils.hpp diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index 986f599da6d11b..fc39eccee3926b 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ b/aten/src/ATen/test/apply_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "cuda.h" #include "cuda_runtime.h" @@ -11,111 +11,111 @@ Tests related to tensor indexing and applying operations. 
*/ #ifndef _WIN32 -TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { +CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { int sizes[] = {4, 4}; int strides[] = {4, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (4 * 4)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); } -TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { +CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { int sizes[] = {6, 3, 7}; int strides[] = {3 * 7, 7, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (6 * 3 * 7)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); } -TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { +CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { int sizes[] = {4, 3, 2}; int strides[] = {3 * 3, 3, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (4 * 3)); - REQUIRE(ti.sizes[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); + CATCH_REQUIRE(ti.sizes[1] == 2); } -TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { +CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { int sizes[] = {3, 2}; int strides[] = {2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (3 * 2)); - REQUIRE(ti.strides[0] == 2); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); + CATCH_REQUIRE(ti.strides[0] == 2); } -TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ +CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == (5 * 2)); - REQUIRE(ti.strides[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); + CATCH_REQUIRE(ti.strides[1] == 2); } -TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { +CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { int sizes[] = {1, 10, 1, 5, 4}; int strides[] = {4, 0, 16, 0, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (10 * 5)); - REQUIRE(ti.strides[0] == 0); - REQUIRE(ti.sizes[1] == 4); - REQUIRE(ti.strides[1] == 1); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); + CATCH_REQUIRE(ti.strides[0] == 0); + CATCH_REQUIRE(ti.sizes[1] == 4); + CATCH_REQUIRE(ti.strides[1] == 1); } -TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { +CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; 
::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE(ti.collapseDims() == 0); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == 1); - REQUIRE(ti.strides[0] == 1); + CATCH_REQUIRE(ti.collapseDims() == 0); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == 1); + CATCH_REQUIRE(ti.strides[0] == 1); } -TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(1) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == 3); - REQUIRE(ti.strides[0] == (6 * 22)); - REQUIRE(ti.sizes[1] == 6); - REQUIRE(ti.strides[1] == 22); - REQUIRE(ti.sizes[2] == (5 * 2)); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(1) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == 3); + CATCH_REQUIRE(ti.strides[0] == (6 * 22)); + CATCH_REQUIRE(ti.sizes[1] == 6); + CATCH_REQUIRE(ti.strides[1] == 22); + CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(2) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == 5); - REQUIRE(ti.strides[1] == 4); - REQUIRE(ti.sizes[2] == 2); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(2) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == 5); + CATCH_REQUIRE(ti.strides[1] == 4); + CATCH_REQUIRE(ti.sizes[2] == 2); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { +CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE_THROWS(ti.collapseDims(5)); + _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); } #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 38027baae97b73..22be6de7acbc02 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +108,32 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -TEST_CASE("apply utils test 2-dim small", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -TEST_CASE("apply utils test 2-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -TEST_CASE("apply utils test 3-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { manual_seed(123, at::kCPU); 
test(CPU(kDouble), {3, 4, 2}); } -TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -TEST_CASE("apply utils test 10-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 94988122adedcc..c64fdec0089dff 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/core/Reduction.h" @@ -20,66 +20,66 @@ using namespace at; using Catch::Matchers::StartsWith; static void test(Type & type) { - SECTION( "resize" ) { + CATCH_SECTION( "resize" ) { auto a = type.tensor(); a.resize_({3,4}); - REQUIRE(a.numel() == 12); + CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); - REQUIRE(a.numel() == 35); + CATCH_REQUIRE(a.numel() == 35); } - SECTION( "ones and dot" ) { + CATCH_SECTION( "ones and dot" ) { Tensor b0 = ones({1, 1}, type); - REQUIRE(2 == (b0+b0).sum().toCDouble()); + CATCH_REQUIRE(2 == (b0+b0).sum().toCDouble()); Tensor b1 = ones({1, 2}, type); - REQUIRE(4 == (b1+b1).sum().toCDouble()); + CATCH_REQUIRE(4 == (b1+b1).sum().toCDouble()); Tensor b = ones({3, 4}, type); - REQUIRE(24 == (b+b).sum().toCDouble()); - REQUIRE(12 == b.numel()); - REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + CATCH_REQUIRE(24 == (b+b).sum().toCDouble()); + CATCH_REQUIRE(12 == b.numel()); + CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); } - SECTION( "rand" ) { + CATCH_SECTION( "rand" ) { for(auto i = 0; i < 10; i++) { Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); } } - SECTION( "sort" ) { + CATCH_SECTION( "sort" ) { Tensor b = rand({3, 4}, type); auto z = b.sort(1); auto z_sorted = std::get<0>(z); - REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); + CATCH_REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); } if(type.backend() != Backend::CUDA) - SECTION( "randperm" ) { + CATCH_SECTION( "randperm" ) { Tensor b = randperm(15, type); Tensor rv, ri; std::tie(rv, ri) = sort(b, 0); - REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); + CATCH_REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); } - SECTION( "context" ) { + CATCH_SECTION( "context" ) { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - SECTION( "add" ) { + CATCH_SECTION( "add" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); //TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); } - SECTION( "loads of adds" ) { + CATCH_SECTION( "loads of adds" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -89,10 +89,10 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? 
std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION( "loads of adds (with copy)" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -102,59 +102,59 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "isContiguous" ) { + CATCH_SECTION( "isContiguous" ) { Tensor a = rand({3, 4}, type); - REQUIRE(a.is_contiguous()); + CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); - REQUIRE(!a.is_contiguous()); + CATCH_REQUIRE(!a.is_contiguous()); } - SECTION( "permute" ) { + CATCH_SECTION( "permute" ) { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); - REQUIRE(b.sizes().equals({4, 5, 3})); - REQUIRE(b.strides().equals({5, 1, 20})); + CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); + CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - SECTION( "mm" ) { + CATCH_SECTION( "mm" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); - REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); + CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - SECTION( "squeeze" ) { + CATCH_SECTION( "squeeze" ) { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); - REQUIRE(b.dim() == 1); + CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); //TODO 0-dim squeeze - REQUIRE(a[0].equal(b)); + CATCH_REQUIRE(a[0].equal(b)); } - SECTION( "copy" ) { + CATCH_SECTION( "copy" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); - REQUIRE(a.equal(e)); + CATCH_REQUIRE(a.equal(e)); } - SECTION( "copy (broadcasting)" ) { + CATCH_SECTION( "copy (broadcasting)" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); for (int i = 0; i < 4; ++i) { - REQUIRE(a[i].equal(e)); + CATCH_REQUIRE(a[i].equal(e)); } } - SECTION( "abs(value)" ) { + CATCH_SECTION( "abs(value)" ) { Tensor r = at::abs(type.scalarTensor(-3)); - REQUIRE(r.toCInt() == 3); + CATCH_REQUIRE(r.toCInt() == 3); } //TODO(zach): operator overloads @@ -168,120 +168,120 @@ static void test(Type & type) { } #endif - SECTION( "adding a value with a scalar" ) { + CATCH_SECTION( "adding a value with a scalar" ) { Tensor a = rand({4, 3}, type); - REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); } - SECTION( "select" ) { + CATCH_SECTION( "select" ) { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - REQUIRE( a[0][3].equal(a_13[0]) ); - REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); + CATCH_REQUIRE( a[2][3].equal(a_13_02) ); } - SECTION( "zero-dim" ) { + CATCH_SECTION( "zero-dim" ) { Tensor a = type.scalarTensor(4); //rand(type, {1}); Tensor b = rand({3,4}, type); - REQUIRE((a + a).dim() == 0); - REQUIRE((1 + a).dim() == 0); - REQUIRE((b + a).dim() == 2); - REQUIRE((a + b).dim() == 2); + CATCH_REQUIRE((a + a).dim() == 0); + CATCH_REQUIRE((1 + a).dim() == 0); + CATCH_REQUIRE((b + a).dim() == 2); + CATCH_REQUIRE((a + b).dim() == 
2); auto c = rand({3,4}, type); - REQUIRE(c[1][2].dim() == 0); + CATCH_REQUIRE(c[1][2].dim() == 0); auto f = rand({3,4}, type); f[2] = zeros({4}, type); f[1][0] = -1; - REQUIRE(f[2][0].toCDouble() == 0); + CATCH_REQUIRE(f[2][0].toCDouble() == 0); } - SECTION( "tensor from TH" ) { + CATCH_SECTION( "tensor from TH" ) { int a = 4; THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); - REQUIRE_NOTHROW(tt); + CATCH_REQUIRE_NOTHROW(tt); } - SECTION( "toCFloat" ) { + CATCH_SECTION( "toCFloat" ) { Tensor a = zeros({3,4}); Tensor b = ones({3,7}); Tensor c = cat({a,b},1); - REQUIRE(c.size(1) == 11); + CATCH_REQUIRE(c.size(1) == 11); Tensor e = rand({}); - REQUIRE(*e.data() == e.sum().toCFloat()); + CATCH_REQUIRE(*e.data() == e.sum().toCFloat()); } - SECTION( "to string" ) { + CATCH_SECTION( "to string" ) { Tensor b = ones({3,7})*.0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); } - SECTION("indexing by Scalar") { + CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int64_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { - REQUIRE(tensor[i].equal(one * static_cast(i))); + CATCH_REQUIRE(tensor[i].equal(one * static_cast(i))); } for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int16_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int8_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), StartsWith( "Can only index tensors with integral scalars")); } - SECTION("indexing by zero-dim tensor") { + CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[one * i].equal(one * i)); + CATCH_REQUIRE(tensor[one * i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), StartsWith( "Can only index tensors with integral scalars")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({2, 3, 4}, kInt)].equal(one), StartsWith("Can only index with tensors that are scalars (zero-dim)")); } - SECTION("dispatch") { + CATCH_SECTION("dispatch") { Tensor tensor = randn({20, 20}); Tensor other = randn({20, 20}); auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); - REQUIRE(result.allclose(mse_loss(relu(tensor), other))); + CATCH_REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } - SECTION("core") { + CATCH_SECTION("core") { int i = CoreTest(); - REQUIRE(i + 1 == CoreTest()); + CATCH_REQUIRE(i + 1 == CoreTest()); } } -TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { manual_seed(123, at::kCUDA); 
if(at::hasCUDA()) { diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index cd5c43d32fae86..822a1d79df1bda 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,154 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "broadcast", "[]" ) { +CATCH_TEST_CASE( "broadcast", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); // 0) pre-req tests: - SECTION( "can't expand empty tensor" ) { + CATCH_SECTION( "can't expand empty tensor" ) { auto empty = randn({0}, T); - REQUIRE_THROWS(empty.expand({3})); + _CATCH_REQUIRE_THROWS(empty.expand({3})); } // 1) out-place function with 2 args - SECTION( "out-place function with 2 args" ) { + CATCH_SECTION( "out-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1}, T); auto b = randn({5}, T); std::vector expanded_sizes = {3, 5}; - REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); + CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 5}, T); - REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); + CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 5}, T); auto b = randn({5, 3}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 5}, T); auto b = randn({7, 5}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } } - SECTION( "out-place function with 3 args" ) { + CATCH_SECTION( "out-place function with 3 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1, 1}, T); auto b = randn({1, 2, 1}, T); auto c = randn({1, 1, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); + CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aTensorScalar = ones({1}, T); aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 2, 1}, T); auto c = randn({1, 2, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE(aTensorScalar.addcmul(b, c).equal( + CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 3, 2}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } - SECTION( "with mismatched sizes" ){ + CATCH_SECTION( "with mismatched sizes" ){ auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 5, 5}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } } - SECTION( "in-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( 
"in-place function with 2 args" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 5}, T); auto b = randn({3, 1}, T); - REQUIRE((a + b).equal(a + b.expand({3, 5}))); + CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto a = randn({3, 5}, T); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); + CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 5}, T); auto b = randn({3, 1}, T); - REQUIRE_THROWS(a.add_(b)); + _CATCH_REQUIRE_THROWS(a.add_(b)); } } - SECTION( "in-place function with 3 args" ) { + CATCH_SECTION( "in-place function with 3 args" ) { auto a = randn({3, 5, 2}, T); auto b = randn({3, 1, 2}, T); auto c = randn({1, 5, 1}, T); - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto aClone = a.clone(); - REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aClone = a.clone(); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 3, 5}, T); auto b = randn({4, 1, 1}, T); auto c = randn({1, 3, 1}, T); - REQUIRE_THROWS(a.addcmul_(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); } } - SECTION( "explicit dim specification" ) { + CATCH_SECTION( "explicit dim specification" ) { auto a = randn({1}, T); auto b = randn({5, 3}, T); auto c = randn({3, 7}, T); - SECTION( "basic" ) { - REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); + CATCH_SECTION( "basic" ) { + CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { Tensor aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); + CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 3}, T); - REQUIRE_THROWS(a.addmm(b, c)); + _CATCH_REQUIRE_THROWS(a.addmm(b, c)); } } } diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp new file mode 100644 index 00000000000000..b9b0a87990a9ce --- /dev/null +++ b/aten/src/ATen/test/catch_utils.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define CATCH_CONFIG_PREFIX_ALL +#include + +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; +// define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS( ... 
) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index fa00e534ee07ef..cce267100589e1 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -82,9 +82,9 @@ void launch_function(){ kernel<<<1,1>>>(); } -TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index 9956dcf52b04ef..b64c530b355914 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +8,15 @@ using namespace at; -TEST_CASE( "optional in cuda files", "[cuda]" ) { +CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - REQUIRE(!trivially_destructible.has_value()); - REQUIRE(!non_trivially_destructible.has_value()); + CATCH_REQUIRE(!trivially_destructible.has_value()); + CATCH_REQUIRE(!non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - REQUIRE(trivially_destructible.has_value()); - REQUIRE(non_trivially_destructible.has_value()); + CATCH_REQUIRE(trivially_destructible.has_value()); + CATCH_REQUIRE(non_trivially_destructible.has_value()); } diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu index f1eb5cbbe4b1f3..a529f38d748a1b 100644 --- a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -22,7 +22,7 @@ __global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor>>(resa, t1a, t2a); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); auto expected = mv(t1, t2); - REQUIRE(res.allclose(expected)); + CATCH_REQUIRE(res.allclose(expected)); } diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index d32903dd2fe1f3..7b14174d3baeb3 100644 --- a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +21,7 @@ void testCudaRNGMultithread() { } }; -TEST_CASE( "CUDA RNG test", "[cuda]" ) { - SECTION( "multithread" ) +CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { + CATCH_SECTION( "multithread" ) testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 31786e88a0944d..4391867d166772 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" 
#include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +9,7 @@ using namespace at; using namespace at::native; -TEST_CASE( "cudnn", "[cuda]" ) { +CATCH_TEST_CASE( "cudnn", "[cuda]" ) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -18,8 +18,8 @@ TEST_CASE( "cudnn", "[cuda]" ) { desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - REQUIRE(desc1.desc()->states == desc2.desc()->states); + CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); + CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); + CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index 48829298760276..bf0cf93f7c4064 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,17 +11,17 @@ using namespace at; -TEST_CASE( "dlconvertor", "[cpu]" ) { +CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { manual_seed(123, at::kCPU); - INFO( "convert ATen to DLTensor" ); + CATCH_INFO( "convert ATen to DLTensor" ); Tensor a = rand({3,4}); DLManagedTensor* dlMTensor = toDLPack(a); - INFO( "convert DLTensor to ATen" ); + CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - REQUIRE(a.equal(b)); + CATCH_REQUIRE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 3b2944803e6b5a..32177705a2f883 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include #include @@ -12,53 +12,53 @@ using namespace at; -TEST_CASE( "half arithmetic", "[]" ) { +CATCH_TEST_CASE( "half arithmetic", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero + one == one); - REQUIRE(zero + zero == zero); - REQUIRE(zero * one == zero); - REQUIRE(one * one == one); - REQUIRE(one / one == one); - REQUIRE(one - one == zero); - REQUIRE(one - zero == one); - REQUIRE(zero - one == -one); - REQUIRE(one + one == Half(2)); - REQUIRE(one + one == 2); + CATCH_REQUIRE(zero + one == one); + CATCH_REQUIRE(zero + zero == zero); + CATCH_REQUIRE(zero * one == zero); + CATCH_REQUIRE(one * one == one); + CATCH_REQUIRE(one / one == one); + CATCH_REQUIRE(one - one == zero); + CATCH_REQUIRE(one - zero == one); + CATCH_REQUIRE(zero - one == -one); + CATCH_REQUIRE(one + one == Half(2)); + CATCH_REQUIRE(one + one == 2); } -TEST_CASE( "half comparisons", "[]" ) { +CATCH_TEST_CASE( "half comparisons", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero < one); - REQUIRE(zero < 1); - REQUIRE(1 > zero); - REQUIRE(0 >= zero); - REQUIRE(0 != one); - REQUIRE(zero == 0); - REQUIRE(zero == zero); - REQUIRE(zero == -zero); + CATCH_REQUIRE(zero < one); + CATCH_REQUIRE(zero < 1); + CATCH_REQUIRE(1 > zero); + CATCH_REQUIRE(0 >= zero); + CATCH_REQUIRE(0 != one); + CATCH_REQUIRE(zero == 0); + CATCH_REQUIRE(zero == zero); + CATCH_REQUIRE(zero == -zero); } -TEST_CASE( "half cast", "[]" ) { +CATCH_TEST_CASE( "half cast", "[]" ) { Half value = 1.5f; - REQUIRE((int)value == 1); - REQUIRE((short)value == 1); - REQUIRE((long long)value == 1LL); - REQUIRE((float)value == 1.5f); - REQUIRE((double)value == 1.5); - 
REQUIRE((bool)value == true); - REQUIRE((bool)Half(0.0f) == false); + CATCH_REQUIRE((int)value == 1); + CATCH_REQUIRE((short)value == 1); + CATCH_REQUIRE((long long)value == 1LL); + CATCH_REQUIRE((float)value == 1.5f); + CATCH_REQUIRE((double)value == 1.5); + CATCH_REQUIRE((bool)value == true); + CATCH_REQUIRE((bool)Half(0.0f) == false); } -TEST_CASE( "half construction", "[]" ) { - REQUIRE(Half((short)3) == Half(3.0f)); - REQUIRE(Half((unsigned short)3) == Half(3.0f)); - REQUIRE(Half(3) == Half(3.0f)); - REQUIRE(Half(3U) == Half(3.0f)); - REQUIRE(Half(3LL) == Half(3.0f)); - REQUIRE(Half(3ULL) == Half(3.0f)); - REQUIRE(Half(3.5) == Half(3.5f)); +CATCH_TEST_CASE( "half construction", "[]" ) { + CATCH_REQUIRE(Half((short)3) == Half(3.0f)); + CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f)); + CATCH_REQUIRE(Half(3) == Half(3.0f)); + CATCH_REQUIRE(Half(3U) == Half(3.0f)); + CATCH_REQUIRE(Half(3LL) == Half(3.0f)); + CATCH_REQUIRE(Half(3ULL) == Half(3.0f)); + CATCH_REQUIRE(Half(3.5) == Half(3.5f)); } static std::string to_string(const Half& h) { @@ -67,22 +67,22 @@ static std::string to_string(const Half& h) { return ss.str(); } -TEST_CASE( "half to string", "[]" ) { - REQUIRE(to_string(Half(3.5f)) == "3.5"); - REQUIRE(to_string(Half(-100.0f)) == "-100"); +CATCH_TEST_CASE( "half to string", "[]" ) { + CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5"); + CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100"); } -TEST_CASE( "half numeric limits", "[]" ) { +CATCH_TEST_CASE( "half numeric limits", "[]" ) { using limits = std::numeric_limits; - REQUIRE(limits::lowest() == -65504.0f); - REQUIRE(limits::max() == 65504.0f); - REQUIRE(limits::min() > 0); - REQUIRE(limits::min() < 1); - REQUIRE(limits::denorm_min() > 0); - REQUIRE(limits::denorm_min() / 2 == 0); - REQUIRE(limits::infinity() == std::numeric_limits::infinity()); - REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); - REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); + CATCH_REQUIRE(limits::lowest() == -65504.0f); + CATCH_REQUIRE(limits::max() == 65504.0f); + CATCH_REQUIRE(limits::min() > 0); + CATCH_REQUIRE(limits::min() < 1); + CATCH_REQUIRE(limits::denorm_min() > 0); + CATCH_REQUIRE(limits::denorm_min() / 2 == 0); + CATCH_REQUIRE(limits::infinity() == std::numeric_limits::infinity()); + CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); + CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); } // Check the declared type of members of numeric_limits matches @@ -119,7 +119,7 @@ ASSERT_SAME_TYPE(max_exponent10); ASSERT_SAME_TYPE(traps); ASSERT_SAME_TYPE(tinyness_before); -TEST_CASE( "half common math functions test", "[]" ) { +CATCH_TEST_CASE( "half common math functions test", "[]" ) { float threshold = 0.00001; assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= threshold); assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold); diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu index 4c63ab3a8fd205..d09a423d7ca72d 100644 --- a/aten/src/ATen/test/integer_divider_test.cu +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or // (b-1), so it takes a few minutes to run. 
@@ -62,18 +62,18 @@ class IntDividerTester { cudaError_t err; err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } ~IntDividerTester() { cudaError_t err; err = cudaFree(dividersBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaFree(testCasesBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } void addTestCase(Value dividend, Value divisor, int steps) { @@ -92,18 +92,18 @@ class IntDividerTester { cudaError_t err; if (testCases_.empty()) return; - REQUIRE(!dividers_.empty()); + CATCH_REQUIRE(!dividers_.empty()); - REQUIRE(dividers_.size() <= NUM_CASES); - REQUIRE(testCases_.size() <= NUM_CASES); + CATCH_REQUIRE(dividers_.size() <= NUM_CASES); + CATCH_REQUIRE(testCases_.size() <= NUM_CASES); err = cudaMemcpy(dividersBuf_, dividers_.data(), dividers_.size() * sizeof(IntDivider), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMemcpy(testCasesBuf_, testCases_.data(), testCases_.size() * sizeof(TestCase), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); int numCases = testCases_.size(); testIntDivider<<<512, 512>>>( @@ -180,11 +180,11 @@ static void testUint64Divider() tester.flush(); } -TEST_CASE( "CUDA integer divider", "[cuda]" ) { +CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index e10de30ae8e023..4c57b7d8ee1d96 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,18 +9,18 @@ using namespace at; using Catch::Matchers::StartsWith; #define REQUIRE_EQUAL(t1, t2) \ - REQUIRE(t1.equal(t2)); + CATCH_REQUIRE(t1.equal(t2)); #define REQUIRE_ALLCLOSE(t1, t2) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2)); #define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2, atol, rtol)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - REQUIRE(t1.size() == t2.size()); + CATCH_REQUIRE(t1.size() == t2.size()); for (size_t i = 0; i < t1.size(); ++i) { REQUIRE_EQUAL(t1[ i ], t2[ i ]); } @@ -29,7 +29,7 @@ void requireEqualTensorList(TensorList t1, TensorList t2) { void test(Type & T, Type & AccT) { auto t = randn({3, 3}, T); - SECTION( "split: test method, type, namespace give same result" ) { + CATCH_SECTION( "split: test method, type, namespace give same result" ) { auto splitMethod = t.split(1, 0); auto splitType = T.split(t, 1, 0); auto splitNs = at::split(t, 1, 0); @@ -40,7 +40,7 @@ void test(Type & T, Type & AccT) { REQUIRE_EQUAL(at::cat(splitMethod, 0), t); } - SECTION( "chunk: test method, type, namespace give same result" ) { + CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { // test method, type, namespace give same result auto chunkMethod = t.chunk(3, 0); 
auto chunkType = T.chunk(t, 3, 0); @@ -53,7 +53,7 @@ void test(Type & T, Type & AccT) { } // stack - SECTION( "stack" ) { + CATCH_SECTION( "stack" ) { auto x = rand({2, 3, 4}); auto y = rand({2, 3, 4}); auto z = rand({2, 3, 4}); @@ -66,36 +66,36 @@ void test(Type & T, Type & AccT) { expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); REQUIRE_EQUAL(res, res_neg); - REQUIRE(res.sizes().equals(expected_size)); + CATCH_REQUIRE(res.sizes().equals(expected_size)); REQUIRE_EQUAL(res.select(dim, 0), x); REQUIRE_EQUAL(res.select(dim, 1), y); REQUIRE_EQUAL(res.select(dim, 2), z); } } - SECTION( "size / stride" ) { + CATCH_SECTION( "size / stride" ) { auto scalar = randn({}, T); - REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); auto empty = randn({0}, T); - REQUIRE(empty.size(0) == 0); - REQUIRE(empty.size(-1) == 0); - REQUIRE(empty.stride(0) == 1); - REQUIRE(empty.stride(-1) == 1); + CATCH_REQUIRE(empty.size(0) == 0); + CATCH_REQUIRE(empty.size(-1) == 0); + CATCH_REQUIRE(empty.stride(0) == 1); + CATCH_REQUIRE(empty.stride(-1) == 1); } // matmul - SECTION( "matmul" ) { + CATCH_SECTION( "matmul" ) { auto scalar = randn({}, T); auto d1 = randn({3}, T); auto d2 = randn({2, 3}, T); // 0-d - REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); // 1-d REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); @@ -140,11 +140,11 @@ void test(Type & T, Type & AccT) { // non-expandable case auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); + CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); } // _standard_gamma_grad - SECTION( "_standard_gamma_grad" ) { + CATCH_SECTION( "_standard_gamma_grad" ) { // check empty auto empty = ones({0}, T); REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); @@ -158,10 +158,10 @@ void test(Type & T, Type & AccT) { // check mixing types auto t1 = randn({3, 4}, T); auto t2 = randn({3, 4}, T).toType(kDouble); - REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); + CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); } - SECTION( "where" ) { + CATCH_SECTION( "where" ) { // empty auto empty = ones({0}, T); auto 
&bT = T.toScalarType(ScalarType::Byte); @@ -180,13 +180,13 @@ void test(Type & T, Type & AccT) { } } -TEST_CASE( "native test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -TEST_CASE( "native test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index d52dc27e20295e..964f6260e7d9ff 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -18,14 +18,14 @@ using namespace at; _passed = true; \ els; \ } catch (std::exception &e) { \ - REQUIRE(!_passed); \ + CATCH_REQUIRE(!_passed); \ catc; \ } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - REQUIRE(lhs.dim() == rhs.dim()); - REQUIRE(lhs.sizes().equals(rhs.sizes())); + CATCH_REQUIRE(lhs.dim() == rhs.dim()); + CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { @@ -49,15 +49,15 @@ void test(Type &T) { for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. auto t = ones(*s, T); - REQUIRE((size_t)t.dim() == s->size()); - REQUIRE((size_t)t.ndimension() == s->size()); - REQUIRE(t.sizes().equals(*s)); - REQUIRE(t.strides().size() == s->size()); + CATCH_REQUIRE((size_t)t.dim() == s->size()); + CATCH_REQUIRE((size_t)t.ndimension() == s->size()); + CATCH_REQUIRE(t.sizes().equals(*s)); + CATCH_REQUIRE(t.strides().size() == s->size()); auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - REQUIRE(t.numel() == numel); + CATCH_REQUIRE(t.numel() == numel); // verify we can output std::stringstream ss; - REQUIRE_NOTHROW(ss << t << std::endl); + CATCH_REQUIRE_NOTHROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +65,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - REQUIRE(r.dim() == t.dim() + 1); + CATCH_REQUIRE(r.dim() == t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. - REQUIRE(t.squeeze(0).dim() == t.dim()); + CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); } // squeeze (with no dimension argument) @@ -99,11 +99,11 @@ void test(Type &T) { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); if (t2.dim() == 0 || t2.sizes()[0] == 1) { - REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. 
- REQUIRE(t2.squeeze_(0).dim() == t.dim()); + CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); } } @@ -122,31 +122,31 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE(t.sum(0).equal(at::zeros({}, T))); + CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t.min(0)); + _CATCH_REQUIRE_THROWS(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t[0]); + _CATCH_REQUIRE_THROWS(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) TRY_CATCH_ELSE(t.fill_(t.sum(0)), - REQUIRE(t.dim() > 1), - REQUIRE(t.dim() <= 1)); + CATCH_REQUIRE(t.dim() > 1), + CATCH_REQUIRE(t.dim() <= 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { @@ -156,8 +156,8 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); if(*lhs_it != *rhs_it) { - REQUIRE(!lhs.is_same_size(rhs)); - REQUIRE(!rhs.is_same_size(lhs)); + CATCH_REQUIRE(!lhs.is_same_size(rhs)); + CATCH_REQUIRE(!rhs.is_same_size(lhs)); } } // forced size functions (resize_, resize_as, set_) @@ -192,7 +192,7 @@ void test(Type &T) { auto storage = T.storage(rhs.numel(), false); lhs.set_(storage); // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - REQUIRE(lhs.dim() != 0); + CATCH_REQUIRE(lhs.dim() != 0); } { // with storage, offset, sizes, strides @@ -211,8 +211,8 @@ void test(Type &T) { auto rhs = ones(*rhs_it, T); auto rhs_size = *rhs_it; TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - REQUIRE(lhs.numel() != rhs.numel()), - REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(lhs.numel() != rhs.numel()), + CATCH_REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); } // take @@ -220,7 +220,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); TRY_CATCH_ELSE(auto result = lhs.take(rhs), - REQUIRE(lhs.numel() == 0); REQUIRE(rhs.numel() != 0), + CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), require_equal_size_dim(result, rhs)); } @@ -230,7 +230,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), + CATCH_REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), [&]() { int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); int64_t dim1 = rhs.dim() == 0 ? 
1 : rhs.size(0); @@ -246,8 +246,8 @@ void test(Type &T) { auto rhs_size = *rhs_it; bool should_pass = should_expand(lhs_size, rhs_size); TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - REQUIRE(!should_pass), - REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(!should_pass), + CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); // in-place functions (would be good if we can also do a non-broadcasting one, b/c // broadcasting functions will always end up operating on tensors of same size; @@ -255,21 +255,21 @@ void test(Type &T) { { bool should_pass_inplace = should_expand(rhs_size, lhs_size); TRY_CATCH_ELSE(lhs.add_(rhs), - REQUIRE(!should_pass_inplace), - REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); + CATCH_REQUIRE(!should_pass_inplace), + CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } } } -TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 72ef4e4ad3cf4c..247830c3cc839c 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include // define constants like M_PI and C keywords for MSVC @@ -33,25 +33,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - REQUIRE(s1.toFloat() == static_cast(M_PI)); + CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - REQUIRE(s1.toFloat() == 100000.0); - REQUIRE(s1.toInt() == 100000); + CATCH_REQUIRE(s1.toFloat() == 100000.0); + CATCH_REQUIRE(s1.toInt() == 100000); - REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - REQUIRE(std::isnan(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isnan(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - REQUIRE(std::isinf(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isinf(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); } -TEST_CASE( "scalar test", "[]" ) { +CATCH_TEST_CASE( "scalar test", "[]" ) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -62,7 +62,7 @@ TEST_CASE( "scalar test", "[]" ) { Scalar h2 = h; cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - REQUIRE_NOTHROW(gen.seed()); + CATCH_REQUIRE_NOTHROW(gen.seed()); auto && C = at::globalContext(); if(at::hasCUDA()) { auto t2 = zeros({4,4}, at::kCUDA); @@ -71,12 +71,12 @@ TEST_CASE( "scalar test", "[]" ) { auto t = ones({4,4}); auto wha2 = zeros({4,4}).add(t).sum(); - REQUIRE( wha2.toCDouble() == 16.0 ); + CATCH_REQUIRE( wha2.toCDouble() == 16.0 ); - REQUIRE( t.sizes()[0] == 4 ); - REQUIRE( t.sizes()[1] == 4 ); - REQUIRE( t.strides()[0] == 4 ); - REQUIRE( t.strides()[1] == 1 ); + CATCH_REQUIRE( t.sizes()[0] == 4 ); + CATCH_REQUIRE( t.sizes()[1] == 4 ); + CATCH_REQUIRE( t.strides()[0] == 4 ); + CATCH_REQUIRE( t.strides()[1] == 1 ); Type & T = 
CPU(Float); Tensor x = randn({1,10}, T); @@ -88,26 +88,26 @@ TEST_CASE( "scalar test", "[]" ) { Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - REQUIRE_THROWS(at::_local_scalar(Tensor{})); + _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); test_overflow(); if(at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - REQUIRE(CPU(Float).copy(r).equal(next_h)); + CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); } - REQUIRE_NOTHROW(randn({10,10,2}, T)); + CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); // check Scalar.toTensor on Scalars backed by different data types - REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); + CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -116,10 +116,10 @@ TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { auto x = ones({1,2}, T); - REQUIRE_THROWS(x.toCFloat()); + _CATCH_REQUIRE_THROWS(x.toCFloat()); } auto float_one = ones({}, T); - REQUIRE(float_one.toCFloat() == 1); - REQUIRE(float_one.toCInt() == 1); - REQUIRE((float_one.toCHalf() == 1)); + CATCH_REQUIRE(float_one.toCFloat() == 1); + CATCH_REQUIRE(float_one.toCInt() == 1); + CATCH_REQUIRE((float_one.toCHalf() == 1)); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 145c4f4c261276..8dc015dd1d06ae 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -14,7 +14,7 @@ /* Tests related to ATen streams. 
*/ -TEST_CASE( +CATCH_TEST_CASE( "Copying and Moving Streams", "Verifies streams are live through copying and moving") { int32_t device = -1; @@ -29,14 +29,14 @@ TEST_CASE( copyStream = s; - REQUIRE(copyStream.internals() == s.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals() == s.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); } - REQUIRE(copyStream.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,41 +47,41 @@ TEST_CASE( moveStream = std::move(s); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } - REQUIRE(moveStream.internals()); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.internals()); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } -TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(myStream == curStream); + CATCH_REQUIRE(myStream == curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(defaultStream != myStream); - REQUIRE(curStream == defaultStream); + CATCH_REQUIRE(defaultStream != myStream); + CATCH_REQUIRE(curStream == defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - REQUIRE(cur_thread_stream == new_stream); + CATCH_REQUIRE(cur_thread_stream == new_stream); } -TEST_CASE( +CATCH_TEST_CASE( "Multithread Getting and Setting", "Ensures streams are thread local") { at::cuda::CUDAStream s0, s1; @@ -94,25 +94,25 @@ TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - REQUIRE(cur_stream == default_stream); - REQUIRE(cur_stream != s0); - REQUIRE(cur_stream != s1); - REQUIRE(s0 != s1); + CATCH_REQUIRE(cur_stream == default_stream); + CATCH_REQUIRE(cur_stream != s0); + CATCH_REQUIRE(cur_stream != s1); + CATCH_REQUIRE(s0 != s1); } -TEST_CASE("CUDAGuard") { +CATCH_TEST_CASE("CUDAGuard") { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); std::vector streams0 = { at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; - REQUIRE(streams0[0].device() == 0); - REQUIRE(streams0[1].device() == 0); + CATCH_REQUIRE(streams0[0].device() == 0); + CATCH_REQUIRE(streams0[1].device() == 0); 
at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +121,47 @@ TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - REQUIRE(streams1[0].device() == 1); - REQUIRE(streams1[1].device() == 1); + CATCH_REQUIRE(streams1[0].device() == 1); + CATCH_REQUIRE(streams1[1].device() == 1); at::cuda::setCurrentCUDAStream(streams1[0]); - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - REQUIRE(guard.original_streams().empty()); + CATCH_REQUIRE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - REQUIRE( + CATCH_REQUIRE( guard.original_streams().size() == at::cuda::getNumGPUs()); - REQUIRE(guard.original_streams()[0] == streams0[0]); - REQUIRE(guard.original_streams()[1] == streams1[0]); + CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); + CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); } // Device and stream are now reset - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
@@ -171,12 +171,12 @@ TEST_CASE("CUDAGuard") { guard.set_device(1); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } -TEST_CASE("CUDAGuardIsMovable") { +CATCH_TEST_CASE("CUDAGuardIsMovable") { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +185,17 @@ TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - REQUIRE(second.original_streams().size() == device_count); - REQUIRE(second.original_device() == 0); - REQUIRE(second.last_device() == 1); + CATCH_REQUIRE(second.original_streams().size() == device_count); + CATCH_REQUIRE(second.original_device() == 0); + CATCH_REQUIRE(second.last_device() == 1); at::cuda::CUDAGuard third; third = std::move(second); - REQUIRE(third.original_streams().size() == device_count); - REQUIRE(third.original_device() == 0); - REQUIRE(third.last_device() == 1); + CATCH_REQUIRE(third.original_streams().size() == device_count); + CATCH_REQUIRE(third.original_device() == 0); + CATCH_REQUIRE(third.last_device() == 1); } -TEST_CASE("Streampool Round Robin") { +CATCH_TEST_CASE("Streampool Round Robin") { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -209,10 +209,10 @@ TEST_CASE("Streampool Round Robin") { if (!result_pair.second) hasDuplicates = true; } - REQUIRE(hasDuplicates); + CATCH_REQUIRE(hasDuplicates); } -TEST_CASE("Multi-GPU") { +CATCH_TEST_CASE("Multi-GPU") { if (at::cuda::getNumGPUs() < 2) return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); @@ -221,17 +221,17 @@ TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); } -TEST_CASE("CUDAEvent Syncs") { +CATCH_TEST_CASE("CUDAEvent Syncs") { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - REQUIRE(!event.happened()); + CATCH_REQUIRE(!event.happened()); event.recordOnce(stream); @@ -242,10 +242,10 @@ TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - REQUIRE(event.happened()); + CATCH_REQUIRE(event.happened()); } -TEST_CASE("Cross-Device Events") { +CATCH_TEST_CASE("Cross-Device Events") { if (at::cuda::getNumGPUs() < 2) return; const auto stream0 = at::cuda::createCUDAStream(); @@ -260,10 +260,10 @@ TEST_CASE("Cross-Device Events") { event0 = std::move(event1); - REQUIRE(event0.device() == 1); + CATCH_REQUIRE(event0.device() == 1); stream0.synchronize_with(event0); cudaStreamSynchronize(stream0); - REQUIRE(event0.happened()); + CATCH_REQUIRE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 552328029ce03c..81701733b53693 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,7 +11,7 @@ using namespace at; 
-TEST_CASE( "parallel", "[cpu]" ) { +CATCH_TEST_CASE( "parallel", "[cpu]" ) { manual_seed(123, at::kCPU); set_num_threads(1); @@ -24,5 +24,5 @@ TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - REQUIRE(a.sum(0).equal(as)); + CATCH_REQUIRE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index e47772a7e8a80b..c01dff2d0038b1 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/core/UndefinedTensorImpl.h" @@ -8,7 +8,7 @@ using namespace at; -TEST_CASE( "undefined tensor test", "[]" ) { +CATCH_TEST_CASE( "undefined tensor test", "[]" ) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +17,36 @@ TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - REQUIRE(!und.defined()); - REQUIRE(std::string("UndefinedType") == und.toString()); - - REQUIRE_THROWS(und.strides()); - REQUIRE_THROWS(und.dim()); - REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - REQUIRE_THROWS(und.add(und)); - REQUIRE_THROWS(und.add(ft)); - REQUIRE_THROWS(ft.add(und)); - REQUIRE_THROWS(und.add(5)); - REQUIRE_THROWS(und.mm(und)); + CATCH_REQUIRE(!und.defined()); + CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); + + _CATCH_REQUIRE_THROWS(und.strides()); + _CATCH_REQUIRE_THROWS(und.dim()); + _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); + _CATCH_REQUIRE_THROWS(und.add(und)); + _CATCH_REQUIRE_THROWS(und.add(ft)); + _CATCH_REQUIRE_THROWS(ft.add(und)); + _CATCH_REQUIRE_THROWS(und.add(5)); + _CATCH_REQUIRE_THROWS(und.mm(und)); und.toType(und.type()); - REQUIRE_THROWS(und.toType(ft.type())); - REQUIRE_THROWS(ft.toType(und.type())); + _CATCH_REQUIRE_THROWS(und.toType(ft.type())); + _CATCH_REQUIRE_THROWS(ft.toType(und.type())); und.toType(ScalarType::Undefined); - REQUIRE_THROWS(und.toType(ScalarType::Float)); - REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); + _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); // copy_ - REQUIRE_THROWS(und.copy_(und)); - REQUIRE_THROWS(und.copy_(ft)); - REQUIRE_THROWS(ft.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(ft)); + _CATCH_REQUIRE_THROWS(ft.copy_(und)); und.toBackend(Backend::Undefined); - REQUIRE_THROWS(und.toBackend(Backend::CPU)); - REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); + _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - REQUIRE(!to_move.defined()); - REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); + CATCH_REQUIRE(!to_move.defined()); + CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 167520beb58382..42c9f61b19b5e1 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" @@ -10,53 +10,53 @@ using at::Tensor; using at::WeakTensor; -TEST_CASE( "Weak pointer tests", "" ) { - SECTION("gets invalidated") { 
+CATCH_TEST_CASE( "Weak pointer tests", "" ) { + CATCH_SECTION("gets invalidated") { Tensor a = at::ones({2, 2}); WeakTensor b = a; a.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("can successfully lock") { + CATCH_SECTION("can successfully lock") { Tensor a = at::ones({2, 2}); WeakTensor b = a; auto c = b.lock(); - REQUIRE(c.defined()); + CATCH_REQUIRE(c.defined()); a.reset(); - REQUIRE(b.lock().defined()); + CATCH_REQUIRE(b.lock().defined()); c.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("updates refcounts correctly") { + CATCH_SECTION("updates refcounts correctly") { Tensor a = at::ones({2, 2}); - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); auto locked = b.lock(); - REQUIRE(locked.defined()); - REQUIRE(a.use_count() == 2); + CATCH_REQUIRE(locked.defined()); + CATCH_REQUIRE(a.use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); a.reset(); - REQUIRE(b.use_count() == 0); - REQUIRE(b.weak_use_count() == 1); + CATCH_REQUIRE(b.use_count() == 0); + CATCH_REQUIRE(b.weak_use_count() == 1); } } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index 8e813bc7f7deeb..f76dac212a0921 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,43 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "wrapdim test", "[]" ) { +CATCH_TEST_CASE( "wrapdim test", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); - SECTION( "simple case" ) { + CATCH_SECTION( "simple case" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.prod(-4).equal(a.prod(0))); - REQUIRE(a.prod(3).equal(a.prod(-1))); + CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); + CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); } - SECTION( "expression specification" ) { + CATCH_SECTION( "expression specification" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); // can unsqueeze scalar auto b = randn(1, T); b.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); + CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); } - SECTION( "empty tensor" ) { + CATCH_SECTION( "empty tensor" ) { auto a = randn(0, T); - REQUIRE(a.prod(0).equal(at::ones({}, T))); + CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); } - SECTION( "scalar vs 1-dim, 1-size" ) { + CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { auto a = randn(1, T); - REQUIRE(a.prod(0).equal(a.prod(-1))); + 
CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); a.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.dim() == 0); - REQUIRE(a.prod(0).equal(a.prod(-1))); + CATCH_REQUIRE(a.dim() == 0); + CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); } } diff --git a/test/cpp/api/any.cpp b/test/cpp/api/any.cpp index 9368d4dd5a4983..18db2f531ee358 100644 --- a/test/cpp/api/any.cpp +++ b/test/cpp/api/any.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -13,39 +13,39 @@ using namespace torch::detail; using Catch::Contains; using Catch::StartsWith; -TEST_CASE("any-module") { +CATCH_TEST_CASE("any-module") { torch::manual_seed(0); - SECTION("int()") { + CATCH_SECTION("int()") { struct M : torch::nn::Module { int forward() { return 123; } }; AnyModule any(M{}); - REQUIRE(any.forward() == 123); + CATCH_REQUIRE(any.forward() == 123); } - SECTION("int(int)") { + CATCH_SECTION("int(int)") { struct M : torch::nn::Module { int forward(int x) { return x; } }; AnyModule any(M{}); - REQUIRE(any.forward(5) == 5); + CATCH_REQUIRE(any.forward(5) == 5); } - SECTION("const char*(const char*)") { + CATCH_SECTION("const char*(const char*)") { struct M : torch::nn::Module { const char* forward(const char* x) { return x; } }; AnyModule any(M{}); - REQUIRE(any.forward("hello") == std::string("hello")); + CATCH_REQUIRE(any.forward("hello") == std::string("hello")); } - SECTION("string(int, const double)") { + CATCH_SECTION("string(int, const double)") { struct M : torch::nn::Module { std::string forward(int x, const double f) { return std::to_string(static_cast(x + f)); @@ -53,10 +53,10 @@ TEST_CASE("any-module") { }; AnyModule any(M{}); int x = 4; - REQUIRE(any.forward(x, 3.14) == std::string("7")); + CATCH_REQUIRE(any.forward(x, 3.14) == std::string("7")); } - SECTION("Tensor(string, const string&, string&&)") { + CATCH_SECTION("Tensor(string, const string&, string&&)") { struct M : torch::nn::Module { torch::Tensor forward( std::string a, @@ -67,42 +67,42 @@ TEST_CASE("any-module") { } }; AnyModule any(M{}); - REQUIRE( + CATCH_REQUIRE( any.forward( std::string("a"), std::string("ab"), std::string("abc")) .sum() .toCInt() == 6); } - SECTION("wrong argument type") { + CATCH_SECTION("wrong argument type") { struct M : torch::nn::Module { int forward(float x) { return x; } }; AnyModule any(M{}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(5.0), StartsWith("Expected argument #0 to be of type float, " "but received value of type double")); } - SECTION("wrong number of arguments") { + CATCH_SECTION("wrong number of arguments") { struct M : torch::nn::Module { int forward(int a, int b) { return a + b; } }; AnyModule any(M{}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(), Contains("M's forward() method expects 2 arguments, but received 0")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(5), Contains("M's forward() method expects 2 arguments, but received 1")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(1, 2, 3), Contains("M's forward() method expects 2 arguments, but received 3")); } - SECTION("get()") { + CATCH_SECTION("get()") { struct M : torch::nn::Module { explicit M(int value_) : torch::nn::Module("M"), value(value_) {} int value; @@ -112,16 +112,16 @@ TEST_CASE("any-module") { }; AnyModule any(M{5}); - SECTION("good cast") { - REQUIRE(any.get().value == 5); + CATCH_SECTION("good cast") { + CATCH_REQUIRE(any.get().value == 5); } - SECTION("bad cast") { + CATCH_SECTION("bad cast") { struct N : torch::nn::Module {}; - 
REQUIRE_THROWS_WITH(any.get(), StartsWith("Attempted to cast module")); + CATCH_REQUIRE_THROWS_WITH(any.get(), StartsWith("Attempted to cast module")); } } - SECTION("ptr()") { + CATCH_SECTION("ptr()") { struct M : torch::nn::Module { explicit M(int value_) : torch::nn::Module("M"), value(value_) {} int value; @@ -131,24 +131,24 @@ TEST_CASE("any-module") { }; AnyModule any(M{5}); - SECTION("base class cast") { + CATCH_SECTION("base class cast") { auto ptr = any.ptr(); - REQUIRE(ptr != nullptr); - REQUIRE(ptr->name() == "M"); + CATCH_REQUIRE(ptr != nullptr); + CATCH_REQUIRE(ptr->name() == "M"); } - SECTION("good downcast") { + CATCH_SECTION("good downcast") { auto ptr = any.ptr(); - REQUIRE(ptr != nullptr); - REQUIRE(ptr->value == 5); + CATCH_REQUIRE(ptr != nullptr); + CATCH_REQUIRE(ptr->value == 5); } - SECTION("bad downcast") { + CATCH_SECTION("bad downcast") { struct N : torch::nn::Module {}; - REQUIRE_THROWS_WITH(any.ptr(), StartsWith("Attempted to cast module")); + CATCH_REQUIRE_THROWS_WITH(any.ptr(), StartsWith("Attempted to cast module")); } } - SECTION("default state is empty") { + CATCH_SECTION("default state is empty") { struct M : torch::nn::Module { explicit M(int value_) : value(value_) {} int value; @@ -157,33 +157,33 @@ TEST_CASE("any-module") { } }; AnyModule any; - REQUIRE(any.is_empty()); + CATCH_REQUIRE(any.is_empty()); any = std::make_shared(5); - REQUIRE(!any.is_empty()); - REQUIRE(any.get().value == 5); + CATCH_REQUIRE(!any.is_empty()); + CATCH_REQUIRE(any.get().value == 5); } - SECTION("all methods throw for empty AnyModule") { + CATCH_SECTION("all methods throw for empty AnyModule") { struct M : torch::nn::Module { int forward(int x) { return x; } }; AnyModule any; - REQUIRE(any.is_empty()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE(any.is_empty()); + CATCH_REQUIRE_THROWS_WITH( any.get(), StartsWith("Cannot call get() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.ptr(), StartsWith("Cannot call ptr() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.ptr(), StartsWith("Cannot call ptr() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.type_info(), StartsWith("Cannot call type_info() on an empty AnyModule")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( any.forward(5), StartsWith("Cannot call forward() on an empty AnyModule")); } - SECTION("can move assign different modules") { + CATCH_SECTION("can move assign different modules") { struct M : torch::nn::Module { std::string forward(int x) { return std::to_string(x); @@ -195,15 +195,15 @@ TEST_CASE("any-module") { } }; AnyModule any; - REQUIRE(any.is_empty()); + CATCH_REQUIRE(any.is_empty()); any = std::make_shared(); - REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5) == "5"); + CATCH_REQUIRE(!any.is_empty()); + CATCH_REQUIRE(any.forward(5) == "5"); any = std::make_shared(); - REQUIRE(!any.is_empty()); - REQUIRE(any.forward(5.0f) == 8); + CATCH_REQUIRE(!any.is_empty()); + CATCH_REQUIRE(any.forward(5.0f) == 8); } - SECTION("constructs from ModuleHolder") { + CATCH_SECTION("constructs from ModuleHolder") { struct MImpl : torch::nn::Module { explicit MImpl(int value_) : torch::nn::Module("M"), value(value_) {} int value; @@ -218,14 +218,14 @@ TEST_CASE("any-module") { }; AnyModule any(M{5}); - REQUIRE(any.get().value == 5); - REQUIRE(any.get()->value == 5); + CATCH_REQUIRE(any.get().value == 5); + CATCH_REQUIRE(any.get()->value == 5); AnyModule module(Linear(3, 4)); std::shared_ptr ptr = module.ptr(); Linear 
linear(module.get()); } - SECTION("converts autograd::Variable to torch::Tensor correctly") { + CATCH_SECTION("converts autograd::Variable to torch::Tensor correctly") { struct M : torch::nn::Module { torch::Tensor forward(torch::Tensor input) { return input; @@ -236,12 +236,12 @@ TEST_CASE("any-module") { // torch::Tensor before being passed to the function (to avoid a type // mismatch). AnyModule any(M{}); - REQUIRE( + CATCH_REQUIRE( any.forward(torch::autograd::Variable(torch::ones(5))) .sum() .toCFloat() == 5); // at::Tensors that are not variables work too. - REQUIRE(any.forward(at::ones(5)).sum().toCFloat() == 5); + CATCH_REQUIRE(any.forward(at::ones(5)).sum().toCFloat() == 5); } } } @@ -263,92 +263,92 @@ AnyModule::Value make_value(T&& value) { } // namespace nn } // namespace torch -TEST_CASE("any-value") { +CATCH_TEST_CASE("any-value") { torch::manual_seed(0); - SECTION("gets the correct value for the right type") { - SECTION("int") { + CATCH_SECTION("gets the correct value for the right type") { + CATCH_SECTION("int") { auto value = make_value(5); // const and non-const types have the same typeid() - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == 5); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == 5); } - SECTION("const int") { + CATCH_SECTION("const int") { auto value = make_value(5); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == 5); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == 5); } - SECTION("const char*") { + CATCH_SECTION("const char*") { auto value = make_value("hello"); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == std::string("hello")); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == std::string("hello")); } - SECTION("std::string") { + CATCH_SECTION("std::string") { auto value = make_value(std::string("hello")); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == "hello"); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == "hello"); } - SECTION("pointers") { + CATCH_SECTION("pointers") { std::string s("hello"); std::string* p = &s; auto value = make_value(p); - REQUIRE(value.try_get() != nullptr); - REQUIRE(*value.get() == "hello"); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(*value.get() == "hello"); } - SECTION("references") { + CATCH_SECTION("references") { std::string s("hello"); const std::string& t = s; auto value = make_value(t); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.get() == "hello"); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.get() == "hello"); } } - SECTION("try_get returns nullptr for the wrong type") { + CATCH_SECTION("try_get returns nullptr for the wrong type") { auto value = make_value(5); - REQUIRE(value.try_get() != nullptr); - REQUIRE(value.try_get() == nullptr); - REQUIRE(value.try_get() == nullptr); - REQUIRE(value.try_get() == nullptr); + CATCH_REQUIRE(value.try_get() != nullptr); + CATCH_REQUIRE(value.try_get() == nullptr); + CATCH_REQUIRE(value.try_get() == nullptr); + CATCH_REQUIRE(value.try_get() == nullptr); } - SECTION("get throws for the wrong type") { + CATCH_SECTION("get throws for the wrong type") { auto value = make_value(5); - REQUIRE(value.try_get() != nullptr); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE(value.try_get() != nullptr); + 
CATCH_REQUIRE_THROWS_WITH( value.get(), StartsWith("Attempted to cast Value to float, " "but its actual type is int")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( value.get(), StartsWith("Attempted to cast Value to long, " "but its actual type is int")); } - SECTION("move is allowed") { + CATCH_SECTION("move is allowed") { auto value = make_value(5); - SECTION("construction") { + CATCH_SECTION("construction") { auto copy = make_value(std::move(value)); - REQUIRE(copy.try_get() != nullptr); - REQUIRE(copy.get() == 5); + CATCH_REQUIRE(copy.try_get() != nullptr); + CATCH_REQUIRE(copy.get() == 5); } - SECTION("assignment") { + CATCH_SECTION("assignment") { auto copy = make_value(10); copy = std::move(value); - REQUIRE(copy.try_get() != nullptr); - REQUIRE(copy.get() == 5); + CATCH_REQUIRE(copy.try_get() != nullptr); + CATCH_REQUIRE(copy.get() == 5); } } - SECTION("type_info is correct") { - SECTION("int") { + CATCH_SECTION("type_info is correct") { + CATCH_SECTION("int") { auto value = make_value(5); - REQUIRE(value.type_info().hash_code() == typeid(int).hash_code()); + CATCH_REQUIRE(value.type_info().hash_code() == typeid(int).hash_code()); } - SECTION("const char") { + CATCH_SECTION("const char") { auto value = make_value("hello"); - REQUIRE(value.type_info().hash_code() == typeid(const char*).hash_code()); + CATCH_REQUIRE(value.type_info().hash_code() == typeid(const char*).hash_code()); } - SECTION("std::string") { + CATCH_SECTION("std::string") { auto value = make_value(std::string("hello")); - REQUIRE(value.type_info().hash_code() == typeid(std::string).hash_code()); + CATCH_REQUIRE(value.type_info().hash_code() == typeid(std::string).hash_code()); } } } diff --git a/test/cpp/api/catch_utils.hpp b/test/cpp/api/catch_utils.hpp new file mode 100644 index 00000000000000..b9b0a87990a9ce --- /dev/null +++ b/test/cpp/api/catch_utils.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define CATCH_CONFIG_PREFIX_ALL +#include + +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; +// define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS( ... 
) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) diff --git a/test/cpp/api/cursor.cpp b/test/cpp/api/cursor.cpp index 5c998661be2368..e08bd7858dbb4a 100644 --- a/test/cpp/api/cursor.cpp +++ b/test/cpp/api/cursor.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -58,158 +58,158 @@ struct Container : public torch::nn::Module { std::vector> m; }; -TEST_CASE("cursor/module") { +CATCH_TEST_CASE("cursor/module") { torch::manual_seed(0); - SECTION("Works for flat models (depth = 1)") { + CATCH_SECTION("Works for flat models (depth = 1)") { Container model(TestModule(1), TestModule(2), TestModule(3)); auto cursor = model.modules(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(&iterator->value == &model[0]); - REQUIRE(&(++iterator)->value == &model[1]); - REQUIRE(&(++iterator)->value == &model[2]); - REQUIRE(++iterator == cursor.end()); + CATCH_REQUIRE(&iterator->value == &model[0]); + CATCH_REQUIRE(&(++iterator)->value == &model[1]); + CATCH_REQUIRE(&(++iterator)->value == &model[2]); + CATCH_REQUIRE(++iterator == cursor.end()); } - SECTION("names are flat") { + CATCH_SECTION("names are flat") { auto iterator = cursor.begin(); - REQUIRE(iterator->key == "0"); - REQUIRE((++iterator)->key == "1"); - REQUIRE((++iterator)->key == "2"); + CATCH_REQUIRE(iterator->key == "0"); + CATCH_REQUIRE((++iterator)->key == "1"); + CATCH_REQUIRE((++iterator)->key == "2"); } - SECTION("Apply works") { + CATCH_SECTION("Apply works") { size_t count = 0; cursor.apply([&count, &model](Module& module) { - REQUIRE(&module == &model[count]); + CATCH_REQUIRE(&module == &model[count]); count += 1; }); - REQUIRE(count == 3); + CATCH_REQUIRE(count == 3); } - SECTION("Apply_items works") { + CATCH_SECTION("Apply_items works") { size_t count = 0; cursor.apply_items( [&count, &model](const std::string& key, Module& module) { - REQUIRE(&module == &model[count]); + CATCH_REQUIRE(&module == &model[count]); count += 1; }); - REQUIRE(count == 3); + CATCH_REQUIRE(count == 3); } - SECTION("Map works") { + CATCH_SECTION("Map works") { std::vector vector(3); cursor.map(vector.begin(), [](Module& module) { return &module; }); - REQUIRE(vector[0] == &model[0]); - REQUIRE(vector[1] == &model[1]); - REQUIRE(vector[2] == &model[2]); + CATCH_REQUIRE(vector[0] == &model[0]); + CATCH_REQUIRE(vector[1] == &model[1]); + CATCH_REQUIRE(vector[2] == &model[2]); std::list list; cursor.map(std::inserter(list, list.end()), [](Module& module) { return &module; }); - REQUIRE(list.size() == 3); + CATCH_REQUIRE(list.size() == 3); auto iterator = list.begin(); - REQUIRE(*iterator++ == &model[0]); - REQUIRE(*iterator++ == &model[1]); - REQUIRE(*iterator++ == &model[2]); - REQUIRE(iterator == list.end()); + CATCH_REQUIRE(*iterator++ == &model[0]); + CATCH_REQUIRE(*iterator++ == &model[1]); + CATCH_REQUIRE(*iterator++ == &model[2]); + CATCH_REQUIRE(iterator == list.end()); } - SECTION("Map_items works") { + CATCH_SECTION("Map_items works") { std::map output; cursor.map_items( std::inserter(output, output.end()), [](const std::string& key, Module& module) { return std::make_pair(key, &module); }); - REQUIRE(output.size() == 3); - REQUIRE(output.count("0")); - REQUIRE(output.count("1")); - REQUIRE(output.count("2")); - REQUIRE(output["0"] == &model[0]); - REQUIRE(output["1"] == &model[1]); - REQUIRE(output["2"] == &model[2]); + CATCH_REQUIRE(output.size() == 3); + CATCH_REQUIRE(output.count("0")); 
+ CATCH_REQUIRE(output.count("1")); + CATCH_REQUIRE(output.count("2")); + CATCH_REQUIRE(output["0"] == &model[0]); + CATCH_REQUIRE(output["1"] == &model[1]); + CATCH_REQUIRE(output["2"] == &model[2]); } - SECTION("Count works for flat models") { - REQUIRE(cursor.size() == model.m.size()); + CATCH_SECTION("Count works for flat models") { + CATCH_REQUIRE(cursor.size() == model.m.size()); } - SECTION("find() finds the correct modules when given a valid key") { - REQUIRE(cursor.find("0") == &model[0]); - REQUIRE(cursor.find("1") == &model[1]); - REQUIRE(cursor.find("2") == &model[2]); + CATCH_SECTION("find() finds the correct modules when given a valid key") { + CATCH_REQUIRE(cursor.find("0") == &model[0]); + CATCH_REQUIRE(cursor.find("1") == &model[1]); + CATCH_REQUIRE(cursor.find("2") == &model[2]); } - SECTION("find() returns nullptr when given an invalid key") { - REQUIRE(cursor.find("foo") == nullptr); - REQUIRE(cursor.find("bar") == nullptr); + CATCH_SECTION("find() returns nullptr when given an invalid key") { + CATCH_REQUIRE(cursor.find("foo") == nullptr); + CATCH_REQUIRE(cursor.find("bar") == nullptr); } - SECTION("at(key) returns the correct modules when given a valid key") { - REQUIRE(&cursor.at("0") == &model[0]); - REQUIRE(&cursor.at("1") == &model[1]); - REQUIRE(&cursor.at("2") == &model[2]); + CATCH_SECTION("at(key) returns the correct modules when given a valid key") { + CATCH_REQUIRE(&cursor.at("0") == &model[0]); + CATCH_REQUIRE(&cursor.at("1") == &model[1]); + CATCH_REQUIRE(&cursor.at("2") == &model[2]); } - SECTION("at(key) throws when given an invalid key") { - REQUIRE_THROWS_WITH(cursor.at("foo"), StartsWith("No such key: 'foo'")); - REQUIRE_THROWS_WITH(cursor.at("bar"), StartsWith("No such key: 'bar'")); + CATCH_SECTION("at(key) throws when given an invalid key") { + CATCH_REQUIRE_THROWS_WITH(cursor.at("foo"), StartsWith("No such key: 'foo'")); + CATCH_REQUIRE_THROWS_WITH(cursor.at("bar"), StartsWith("No such key: 'bar'")); } - SECTION( + CATCH_SECTION( "operator[key] returns the correct modules when given a valid key") { - REQUIRE(&cursor["0"] == &model[0]); - REQUIRE(&cursor["1"] == &model[1]); - REQUIRE(&cursor["2"] == &model[2]); + CATCH_REQUIRE(&cursor["0"] == &model[0]); + CATCH_REQUIRE(&cursor["1"] == &model[1]); + CATCH_REQUIRE(&cursor["2"] == &model[2]); } - SECTION("operator[key] throws when given an invalid key") { - REQUIRE_THROWS_WITH(cursor["foo"], StartsWith("No such key: 'foo'")); - REQUIRE_THROWS_WITH(cursor["bar"], StartsWith("No such key: 'bar'")); + CATCH_SECTION("operator[key] throws when given an invalid key") { + CATCH_REQUIRE_THROWS_WITH(cursor["foo"], StartsWith("No such key: 'foo'")); + CATCH_REQUIRE_THROWS_WITH(cursor["bar"], StartsWith("No such key: 'bar'")); } - SECTION("at(index) returns the correct modules when given a valid index") { - REQUIRE(&cursor.at(0).value == &model[0]); - REQUIRE(&cursor.at(1).value == &model[1]); - REQUIRE(&cursor.at(2).value == &model[2]); + CATCH_SECTION("at(index) returns the correct modules when given a valid index") { + CATCH_REQUIRE(&cursor.at(0).value == &model[0]); + CATCH_REQUIRE(&cursor.at(1).value == &model[1]); + CATCH_REQUIRE(&cursor.at(2).value == &model[2]); } - SECTION("at(index) throws when given an invalid index") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("at(index) throws when given an invalid index") { + CATCH_REQUIRE_THROWS_WITH( cursor.at(5), StartsWith("Index 5 is out of range for cursor of size 3")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( cursor.at(123), StartsWith("Index 123 is 
out of range for cursor of size 3")); } - SECTION( + CATCH_SECTION( "operator[index] returns the correct modules when given a valid index") { - REQUIRE(&cursor[0].value == &model[0]); - REQUIRE(&cursor[1].value == &model[1]); - REQUIRE(&cursor[2].value == &model[2]); + CATCH_REQUIRE(&cursor[0].value == &model[0]); + CATCH_REQUIRE(&cursor[1].value == &model[1]); + CATCH_REQUIRE(&cursor[2].value == &model[2]); } - SECTION("operator[index] throws when given an invalid key") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("operator[index] throws when given an invalid key") { + CATCH_REQUIRE_THROWS_WITH( cursor[5], StartsWith("Index 5 is out of range for cursor of size 3")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( cursor[123], StartsWith("Index 123 is out of range for cursor of size 3")); } - SECTION("contains() is correct") { - REQUIRE(cursor.contains("0")); - REQUIRE(cursor.contains("1")); - REQUIRE(cursor.contains("2")); + CATCH_SECTION("contains() is correct") { + CATCH_REQUIRE(cursor.contains("0")); + CATCH_REQUIRE(cursor.contains("1")); + CATCH_REQUIRE(cursor.contains("2")); } } - SECTION("Works for deeper hierarchies (depth > 1)") { + CATCH_SECTION("Works for deeper hierarchies (depth > 1)") { // clang-format off Container model( Container( @@ -227,106 +227,106 @@ TEST_CASE("cursor/module") { auto cursor = model.modules(); // This is sufficient for the hierarchical case // (other tests build on top) - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(&iterator->value == &model[0]); + CATCH_REQUIRE(&iterator->value == &model[0]); auto* seq = dynamic_cast(&model[0]); - REQUIRE(seq != nullptr); - REQUIRE(&(++iterator)->value == &(*seq)[0]); - REQUIRE(&(++iterator)->value == &(*seq)[1]); + CATCH_REQUIRE(seq != nullptr); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[0]); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[1]); - REQUIRE(&(++iterator)->value == &model[1]); - REQUIRE(&(++iterator)->value == &model[2]); + CATCH_REQUIRE(&(++iterator)->value == &model[1]); + CATCH_REQUIRE(&(++iterator)->value == &model[2]); seq = dynamic_cast(&model[2]); - REQUIRE(seq != nullptr); - REQUIRE(&(++iterator)->value == &(*seq)[0]); - REQUIRE(&(++iterator)->value == &(*seq)[1]); + CATCH_REQUIRE(seq != nullptr); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[0]); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[1]); seq = dynamic_cast(&(*seq)[1]); - REQUIRE(seq != nullptr); - REQUIRE(&(++iterator)->value == &(*seq)[0]); - REQUIRE(&(++iterator)->value == &(*seq)[1]); + CATCH_REQUIRE(seq != nullptr); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[0]); + CATCH_REQUIRE(&(++iterator)->value == &(*seq)[1]); } - SECTION("children() returns only the first level of submodules") { + CATCH_SECTION("children() returns only the first level of submodules") { auto children = model.children(); - REQUIRE(children.size() == 3); - REQUIRE(&children.at("0") == &model[0]); - REQUIRE(&children.at("1") == &model[1]); - REQUIRE(&children.at("2") == &model[2]); - REQUIRE(!children.contains("0.0")); + CATCH_REQUIRE(children.size() == 3); + CATCH_REQUIRE(&children.at("0") == &model[0]); + CATCH_REQUIRE(&children.at("1") == &model[1]); + CATCH_REQUIRE(&children.at("2") == &model[2]); + CATCH_REQUIRE(!children.contains("0.0")); size_t count = 0; for (auto& child : children) { - REQUIRE(child.key == std::to_string(count)); - REQUIRE(&child.value == &model[count]); + CATCH_REQUIRE(child.key == std::to_string(count)); + CATCH_REQUIRE(&child.value 
== &model[count]); count += 1; } } } } -TEST_CASE("cursor/parameter") { +CATCH_TEST_CASE("cursor/parameter") { torch::manual_seed(0); - SECTION("Works for single models") { + CATCH_SECTION("Works for single models") { TestModule model(1); auto cursor = model.parameters(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(iterator->value.equal(model.tensor1)); - REQUIRE((++iterator)->value.equal(model.tensor2)); + CATCH_REQUIRE(iterator->value.equal(model.tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(model.tensor2)); } } - SECTION("Works for flat models (depth = 1)") { + CATCH_SECTION("Works for flat models (depth = 1)") { auto first = std::make_shared(1); auto second = std::make_shared(2); Container model(first, second); auto cursor = model.parameters(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(iterator->value.equal(first->tensor1)); - REQUIRE((++iterator)->value.equal(first->tensor2)); - REQUIRE((++iterator)->value.equal(second->tensor1)); - REQUIRE((++iterator)->value.equal(second->tensor2)); + CATCH_REQUIRE(iterator->value.equal(first->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(first->tensor2)); + CATCH_REQUIRE((++iterator)->value.equal(second->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(second->tensor2)); } - SECTION("Apply_items works") { + CATCH_SECTION("Apply_items works") { size_t count = 0; cursor.apply_items([&count, &model, &first, &second]( const std::string& key, torch::Tensor& tensor) { switch (count) { case 0: { - REQUIRE(tensor.equal(first->tensor1)); + CATCH_REQUIRE(tensor.equal(first->tensor1)); break; } case 1: { - REQUIRE(tensor.equal(first->tensor2)); + CATCH_REQUIRE(tensor.equal(first->tensor2)); break; } case 2: { - REQUIRE(tensor.equal(second->tensor1)); + CATCH_REQUIRE(tensor.equal(second->tensor1)); break; } case 3: { - REQUIRE(tensor.equal(second->tensor2)); + CATCH_REQUIRE(tensor.equal(second->tensor2)); break; } } count += 1; }); - REQUIRE(count == 4); + CATCH_REQUIRE(count == 4); } // Other tests are correct based on correct iteration behavior and apply // working. 
} - SECTION("Works for deeper hierarchies (depth > 1)") { + CATCH_SECTION("Works for deeper hierarchies (depth > 1)") { std::vector> modules; for (size_t i = 1; i <= 6; ++i) { modules.push_back(std::make_shared(i)); @@ -346,36 +346,36 @@ TEST_CASE("cursor/parameter") { // clang-format on auto cursor = model.parameters(); - SECTION("Iterates in the correct order") { + CATCH_SECTION("Iterates in the correct order") { auto iterator = cursor.begin(); - REQUIRE(iterator->value.equal(modules[0]->tensor1)); - REQUIRE((++iterator)->value.equal(modules[0]->tensor2)); + CATCH_REQUIRE(iterator->value.equal(modules[0]->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(modules[0]->tensor2)); for (size_t index = 1; index < 6; ++index) { - REQUIRE((++iterator)->value.equal(modules[index]->tensor1)); - REQUIRE((++iterator)->value.equal(modules[index]->tensor2)); + CATCH_REQUIRE((++iterator)->value.equal(modules[index]->tensor1)); + CATCH_REQUIRE((++iterator)->value.equal(modules[index]->tensor2)); } } - SECTION("names are hierarchical") { + CATCH_SECTION("names are hierarchical") { auto iterator = cursor.begin(); - REQUIRE(iterator->key == "0.0.tensor1"); - REQUIRE((++iterator)->key == "0.0.tensor2"); - REQUIRE((++iterator)->key == "0.1.tensor1"); - REQUIRE((++iterator)->key == "0.1.tensor2"); - REQUIRE((++iterator)->key == "1.tensor1"); - REQUIRE((++iterator)->key == "1.tensor2"); - REQUIRE((++iterator)->key == "2.0.tensor1"); - REQUIRE((++iterator)->key == "2.0.tensor2"); - REQUIRE((++iterator)->key == "2.1.0.tensor1"); - REQUIRE((++iterator)->key == "2.1.0.tensor2"); - REQUIRE((++iterator)->key == "2.1.1.tensor1"); - REQUIRE((++iterator)->key == "2.1.1.tensor2"); - REQUIRE(++iterator == cursor.end()); + CATCH_REQUIRE(iterator->key == "0.0.tensor1"); + CATCH_REQUIRE((++iterator)->key == "0.0.tensor2"); + CATCH_REQUIRE((++iterator)->key == "0.1.tensor1"); + CATCH_REQUIRE((++iterator)->key == "0.1.tensor2"); + CATCH_REQUIRE((++iterator)->key == "1.tensor1"); + CATCH_REQUIRE((++iterator)->key == "1.tensor2"); + CATCH_REQUIRE((++iterator)->key == "2.0.tensor1"); + CATCH_REQUIRE((++iterator)->key == "2.0.tensor2"); + CATCH_REQUIRE((++iterator)->key == "2.1.0.tensor1"); + CATCH_REQUIRE((++iterator)->key == "2.1.0.tensor2"); + CATCH_REQUIRE((++iterator)->key == "2.1.1.tensor1"); + CATCH_REQUIRE((++iterator)->key == "2.1.1.tensor2"); + CATCH_REQUIRE(++iterator == cursor.end()); } } } -TEST_CASE("cursor/non-const-to-const-conversion") { +CATCH_TEST_CASE("cursor/non-const-to-const-conversion") { torch::manual_seed(0); auto first = std::make_shared(1); auto second = std::make_shared(2); @@ -404,11 +404,11 @@ TEST_CASE("cursor/non-const-to-const-conversion") { } } -TEST_CASE("cursor/can-invoke-const-method-on-const-cursor") { +CATCH_TEST_CASE("cursor/can-invoke-const-method-on-const-cursor") { torch::manual_seed(0); TestModule model(1); /// This will only compile if `Cursor` has the appropriate const methods. 
const auto cursor = model.parameters(); - REQUIRE(cursor.contains("tensor1")); + CATCH_REQUIRE(cursor.contains("tensor1")); } diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 8d753197754c19..972223a89fd42f 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -230,7 +230,7 @@ bool test_mnist( return correct.sum().toCFloat() > telabel.size(0) * 0.8; } -TEST_CASE("integration/cartpole") { +CATCH_TEST_CASE("integration/cartpole") { torch::manual_seed(0); std::cerr << "Training episodic policy gradient with a critic for up to 3000" " episodes, rest your eyes for a bit!\n"; @@ -326,11 +326,11 @@ TEST_CASE("integration/cartpole") { if (running_reward > 150) { break; } - REQUIRE(episode < 3000); + CATCH_REQUIRE(episode < 3000); } } -TEST_CASE("integration/mnist", "[cuda]") { +CATCH_TEST_CASE("integration/mnist", "[cuda]") { torch::manual_seed(0); auto model = std::make_shared(); auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); @@ -357,7 +357,7 @@ TEST_CASE("integration/mnist", "[cuda]") { auto optimizer = torch::optim::SGD( model->parameters(), torch::optim::SGDOptions(1e-2).momentum(0.5)); - REQUIRE(test_mnist( + CATCH_REQUIRE(test_mnist( 32, // batch_size 3, // num_epochs true, // useGPU @@ -366,7 +366,7 @@ TEST_CASE("integration/mnist", "[cuda]") { optimizer)); } -TEST_CASE("integration/mnist/batchnorm", "[cuda]") { +CATCH_TEST_CASE("integration/mnist/batchnorm", "[cuda]") { torch::manual_seed(0); auto model = std::make_shared(); auto conv1 = model->add(Conv2d(1, 10, 5), "conv1"); @@ -393,7 +393,7 @@ TEST_CASE("integration/mnist/batchnorm", "[cuda]") { auto optimizer = torch::optim::SGD( model->parameters(), torch::optim::SGDOptions(1e-2).momentum(0.5)); - REQUIRE(test_mnist( + CATCH_REQUIRE(test_mnist( 32, // batch_size 3, // num_epochs true, // useGPU diff --git a/test/cpp/api/jit.cpp b/test/cpp/api/jit.cpp index c46868cdbf6989..b477b11c8e5ae1 100644 --- a/test/cpp/api/jit.cpp +++ b/test/cpp/api/jit.cpp @@ -1,12 +1,12 @@ -#include +#include "catch_utils.hpp" #include #include #include -TEST_CASE("torch script") { - SECTION("multiple functions") { +CATCH_TEST_CASE("torch script") { + CATCH_SECTION("multiple functions") { auto module = torch::jit::compile(R"JIT( def test_mul(a, b): return a * b @@ -21,11 +21,11 @@ TEST_CASE("torch script") { auto a = torch::ones(1); auto b = torch::ones(1); - REQUIRE(1 == module->run_method("test_mul", a, b).toTensor().toCLong()); + CATCH_REQUIRE(1 == module->run_method("test_mul", a, b).toTensor().toCLong()); - REQUIRE(2 == module->run_method("test_relu", a, b).toTensor().toCLong()); + CATCH_REQUIRE(2 == module->run_method("test_relu", a, b).toTensor().toCLong()); - REQUIRE( + CATCH_REQUIRE( 0x200 == module->run_method("test_while", a, b).toTensor().toCLong()); } } diff --git a/test/cpp/api/main.cpp b/test/cpp/api/main.cpp index 4b1aaba64b2ef1..92ea3567a3b9a7 100644 --- a/test/cpp/api/main.cpp +++ b/test/cpp/api/main.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_RUNNER -#include +#include "catch_utils.hpp" #include diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index 6d065bf6fa6523..8ced0e0a750dc8 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -18,7 +18,7 @@ using OrderedDict = torch::detail::OrderedDict; using Catch::StartsWith; -TEST_CASE("NoGrad") { +CATCH_TEST_CASE("NoGrad") { torch::manual_seed(0); torch::NoGradGuard guard; 
Linear model(5, 2); @@ -27,88 +27,88 @@ TEST_CASE("NoGrad") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(!model->parameters()["weight"].grad().defined()); + CATCH_REQUIRE(!model->parameters()["weight"].grad().defined()); } -TEST_CASE("autograd") { +CATCH_TEST_CASE("autograd") { torch::manual_seed(0); auto x = torch::randn({3, 3}, torch::requires_grad()); auto y = torch::randn({3, 3}); auto z = x * y; - SECTION("derivatives of zero-dim tensors") { + CATCH_SECTION("derivatives of zero-dim tensors") { z.sum().backward(); - REQUIRE(x.grad().allclose(y)); + CATCH_REQUIRE(x.grad().allclose(y)); } - SECTION("derivatives of tensors") { + CATCH_SECTION("derivatives of tensors") { z.backward(); - REQUIRE(x.grad().allclose(y)); + CATCH_REQUIRE(x.grad().allclose(y)); } - SECTION("custom gradient inputs") { + CATCH_SECTION("custom gradient inputs") { z.sum().backward(torch::ones({}) * 2); - REQUIRE(x.grad().allclose(y * 2)); + CATCH_REQUIRE(x.grad().allclose(y * 2)); } // Assume everything else is safe from PyTorch tests. } -TEST_CASE("nn::init") { +CATCH_TEST_CASE("nn::init") { auto tensor = torch::empty({3, 4}, torch::requires_grad()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor.fill_(1), StartsWith("a leaf Variable that requires grad " "has been used in an in-place operation")); - REQUIRE(torch::nn::init::ones_(tensor).sum().toCInt() == 12); + CATCH_REQUIRE(torch::nn::init::ones_(tensor).sum().toCInt() == 12); } -TEST_CASE("expanding-array") { +CATCH_TEST_CASE("expanding-array") { torch::manual_seed(0); - SECTION("successful construction") { - SECTION("initializer_list") { + CATCH_SECTION("successful construction") { + CATCH_SECTION("initializer_list") { torch::ExpandingArray<5> e({1, 2, 3, 4, 5}); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == i + 1); + CATCH_REQUIRE((*e)[i] == i + 1); } } - SECTION("vector") { + CATCH_SECTION("vector") { torch::ExpandingArray<5> e(std::vector{1, 2, 3, 4, 5}); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == i + 1); + CATCH_REQUIRE((*e)[i] == i + 1); } } - SECTION("array") { + CATCH_SECTION("array") { torch::ExpandingArray<5> e(std::array({1, 2, 3, 4, 5})); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == i + 1); + CATCH_REQUIRE((*e)[i] == i + 1); } } - SECTION("single value") { + CATCH_SECTION("single value") { torch::ExpandingArray<5> e(5); - REQUIRE(e.size() == 5); + CATCH_REQUIRE(e.size() == 5); for (size_t i = 0; i < e.size(); ++i) { - REQUIRE((*e)[i] == 5); + CATCH_REQUIRE((*e)[i] == 5); } } } - SECTION("throws for incorrect size on construction") { - SECTION("initializer_list") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("throws for incorrect size on construction") { + CATCH_SECTION("initializer_list") { + CATCH_REQUIRE_THROWS_WITH( torch::ExpandingArray<5>({1, 2, 3, 4, 5, 6, 7}), StartsWith("Expected 5 values, but instead got 7")); } - SECTION("vector") { - REQUIRE_THROWS_WITH( + CATCH_SECTION("vector") { + CATCH_REQUIRE_THROWS_WITH( torch::ExpandingArray<5>(std::vector({1, 2, 3, 4, 5, 6, 7})), StartsWith("Expected 5 values, but instead got 7")); } } } -TEST_CASE("make_unique") { +CATCH_TEST_CASE("make_unique") { struct Test { explicit Test(const int& x) : lvalue_(x) {} explicit Test(int&& x) : rvalue_(x) {} @@ -117,216 +117,216 @@ TEST_CASE("make_unique") { at::optional rvalue_; }; - SECTION("forwards rvalues correctly") { + 
CATCH_SECTION("forwards rvalues correctly") { auto ptr = torch::make_unique(123); - REQUIRE(!ptr->lvalue_.has_value()); - REQUIRE(ptr->rvalue_.has_value()); - REQUIRE(*ptr->rvalue_ == 123); + CATCH_REQUIRE(!ptr->lvalue_.has_value()); + CATCH_REQUIRE(ptr->rvalue_.has_value()); + CATCH_REQUIRE(*ptr->rvalue_ == 123); } - SECTION("forwards lvalues correctly") { + CATCH_SECTION("forwards lvalues correctly") { int x = 5; auto ptr = torch::make_unique(x); - REQUIRE(ptr->lvalue_.has_value()); - REQUIRE(*ptr->lvalue_ == 5); - REQUIRE(!ptr->rvalue_.has_value()); + CATCH_REQUIRE(ptr->lvalue_.has_value()); + CATCH_REQUIRE(*ptr->lvalue_ == 5); + CATCH_REQUIRE(!ptr->rvalue_.has_value()); } - SECTION("Can construct unique_ptr of array") { + CATCH_SECTION("Can construct unique_ptr of array") { auto ptr = torch::make_unique(3); // Value initialization is required by the standard. - REQUIRE(ptr[0] == 0); - REQUIRE(ptr[1] == 0); - REQUIRE(ptr[2] == 0); + CATCH_REQUIRE(ptr[0] == 0); + CATCH_REQUIRE(ptr[1] == 0); + CATCH_REQUIRE(ptr[2] == 0); } } -TEST_CASE("ordered-dict") { - SECTION("is empty after default construction") { +CATCH_TEST_CASE("ordered-dict") { + CATCH_SECTION("is empty after default construction") { OrderedDict dict; - REQUIRE(dict.subject() == "Key"); - REQUIRE(dict.is_empty()); - REQUIRE(dict.size() == 0); + CATCH_REQUIRE(dict.subject() == "Key"); + CATCH_REQUIRE(dict.is_empty()); + CATCH_REQUIRE(dict.size() == 0); } - SECTION("insert inserts elements when they are not yet present") { + CATCH_SECTION("insert inserts elements when they are not yet present") { OrderedDict dict; dict.insert("a", 1); dict.insert("b", 2); - REQUIRE(dict.size() == 2); + CATCH_REQUIRE(dict.size() == 2); } - SECTION("get returns values when present") { + CATCH_SECTION("get returns values when present") { OrderedDict dict; dict.insert("a", 1); dict.insert("b", 2); - REQUIRE(dict.get("a") == 1); - REQUIRE(dict.get("b") == 2); + CATCH_REQUIRE(dict.get("a") == 1); + CATCH_REQUIRE(dict.get("b") == 2); } - SECTION("get throws when passed keys that are not present") { + CATCH_SECTION("get throws when passed keys that are not present") { OrderedDict dict; dict.insert("a", 1); dict.insert("b", 2); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.get("foo"), StartsWith("Key 'foo' is not defined")); - REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + CATCH_REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); } - SECTION("can initialize from list") { + CATCH_SECTION("can initialize from list") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.size() == 2); - REQUIRE(dict.get("a") == 1); - REQUIRE(dict.get("b") == 2); + CATCH_REQUIRE(dict.size() == 2); + CATCH_REQUIRE(dict.get("a") == 1); + CATCH_REQUIRE(dict.get("b") == 2); } - SECTION("insert throws when passed elements that are present") { + CATCH_SECTION("insert throws when passed elements that are present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.insert("a", 1), StartsWith("Key 'a' already defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.insert("b", 1), StartsWith("Key 'b' already defined")); } - SECTION("front() returns the first item") { + CATCH_SECTION("front() returns the first item") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.front().key == "a"); - REQUIRE(dict.front().value == 1); + CATCH_REQUIRE(dict.front().key == "a"); + CATCH_REQUIRE(dict.front().value == 1); } - SECTION("back() returns the last item") { + 
CATCH_SECTION("back() returns the last item") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.back().key == "b"); - REQUIRE(dict.back().value == 2); + CATCH_REQUIRE(dict.back().key == "b"); + CATCH_REQUIRE(dict.back().value == 2); } - SECTION("find returns pointers to values when present") { + CATCH_SECTION("find returns pointers to values when present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.find("a") != nullptr); - REQUIRE(*dict.find("a") == 1); - REQUIRE(dict.find("b") != nullptr); - REQUIRE(*dict.find("b") == 2); + CATCH_REQUIRE(dict.find("a") != nullptr); + CATCH_REQUIRE(*dict.find("a") == 1); + CATCH_REQUIRE(dict.find("b") != nullptr); + CATCH_REQUIRE(*dict.find("b") == 2); } - SECTION("find returns null pointers when passed keys that are not present") { + CATCH_SECTION("find returns null pointers when passed keys that are not present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict.find("bar") == nullptr); - REQUIRE(dict.find("") == nullptr); + CATCH_REQUIRE(dict.find("bar") == nullptr); + CATCH_REQUIRE(dict.find("") == nullptr); } - SECTION("operator[] returns values when passed keys that are present") { + CATCH_SECTION("operator[] returns values when passed keys that are present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict["a"] == 1); - REQUIRE(dict["b"] == 2); + CATCH_REQUIRE(dict["a"] == 1); + CATCH_REQUIRE(dict["b"] == 2); } - SECTION("operator[] returns items positionally when passed integers") { + CATCH_SECTION("operator[] returns items positionally when passed integers") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(dict[0].key == "a"); - REQUIRE(dict[0].value == 1); - REQUIRE(dict[1].key == "b"); - REQUIRE(dict[1].value == 2); + CATCH_REQUIRE(dict[0].key == "a"); + CATCH_REQUIRE(dict[0].value == 1); + CATCH_REQUIRE(dict[1].key == "b"); + CATCH_REQUIRE(dict[1].value == 2); } - SECTION("operator[] throws when passed keys that are not present") { + CATCH_SECTION("operator[] throws when passed keys that are not present") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.get("foo"), StartsWith("Key 'foo' is not defined")); - REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + CATCH_REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); } - SECTION("update inserts all items from another OrderedDict") { + CATCH_SECTION("update inserts all items from another OrderedDict") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict dict2 = {{"c", 3}}; dict2.update(dict); - REQUIRE(dict2.size() == 3); - REQUIRE(dict2.find("a") != nullptr); - REQUIRE(dict2.find("b") != nullptr); - REQUIRE(dict2.find("c") != nullptr); + CATCH_REQUIRE(dict2.size() == 3); + CATCH_REQUIRE(dict2.find("a") != nullptr); + CATCH_REQUIRE(dict2.find("b") != nullptr); + CATCH_REQUIRE(dict2.find("c") != nullptr); } - SECTION("update also checks for duplicates") { + CATCH_SECTION("update also checks for duplicates") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict dict2 = {{"a", 1}}; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict2.update(dict), StartsWith("Key 'a' already defined")); } - SECTION("Can iterate items") { + CATCH_SECTION("Can iterate items") { OrderedDict dict = {{"a", 1}, {"b", 2}}; auto iterator = dict.begin(); - REQUIRE(iterator != dict.end()); - REQUIRE(iterator->key == "a"); - REQUIRE(iterator->value == 1); + CATCH_REQUIRE(iterator != dict.end()); + CATCH_REQUIRE(iterator->key == "a"); + CATCH_REQUIRE(iterator->value == 1); ++iterator; - 
REQUIRE(iterator != dict.end()); - REQUIRE(iterator->key == "b"); - REQUIRE(iterator->value == 2); + CATCH_REQUIRE(iterator != dict.end()); + CATCH_REQUIRE(iterator->key == "b"); + CATCH_REQUIRE(iterator->value == 2); ++iterator; - REQUIRE(iterator == dict.end()); + CATCH_REQUIRE(iterator == dict.end()); } - SECTION("clear makes the dict empty") { + CATCH_SECTION("clear makes the dict empty") { OrderedDict dict = {{"a", 1}, {"b", 2}}; - REQUIRE(!dict.is_empty()); + CATCH_REQUIRE(!dict.is_empty()); dict.clear(); - REQUIRE(dict.is_empty()); + CATCH_REQUIRE(dict.is_empty()); } - SECTION("can copy construct") { + CATCH_SECTION("can copy construct") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = dict; - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); } - SECTION("can copy assign") { + CATCH_SECTION("can copy assign") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = {{"c", 1}}; - REQUIRE(copy.find("c") != nullptr); + CATCH_REQUIRE(copy.find("c") != nullptr); copy = dict; - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); - REQUIRE(copy.find("c") == nullptr); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.find("c") == nullptr); } - SECTION("can move construct") { + CATCH_SECTION("can move construct") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = std::move(dict); - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); } - SECTION("can move assign") { + CATCH_SECTION("can move assign") { OrderedDict dict = {{"a", 1}, {"b", 2}}; OrderedDict copy = {{"c", 1}}; - REQUIRE(copy.find("c") != nullptr); + CATCH_REQUIRE(copy.find("c") != nullptr); copy = std::move(dict); - REQUIRE(copy.size() == 2); - REQUIRE(*copy[0] == 1); - REQUIRE(*copy[1] == 2); - REQUIRE(copy.find("c") == nullptr); + CATCH_REQUIRE(copy.size() == 2); + CATCH_REQUIRE(*copy[0] == 1); + CATCH_REQUIRE(*copy[1] == 2); + CATCH_REQUIRE(copy.find("c") == nullptr); } - SECTION("can insert with braces") { + CATCH_SECTION("can insert with braces") { OrderedDict> dict; dict.insert("a", {1, 2}); - REQUIRE(!dict.is_empty()); - REQUIRE(dict["a"].first == 1); - REQUIRE(dict["a"].second == 2); + CATCH_REQUIRE(!dict.is_empty()); + CATCH_REQUIRE(dict["a"].first == 1); + CATCH_REQUIRE(dict["a"].second == 2); } - SECTION("Error messages include the what") { + CATCH_SECTION("Error messages include the what") { OrderedDict dict("Penguin"); - REQUIRE(dict.subject() == "Penguin"); + CATCH_REQUIRE(dict.subject() == "Penguin"); dict.insert("a", 1); - REQUIRE(!dict.is_empty()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE(!dict.is_empty()); + CATCH_REQUIRE_THROWS_WITH( dict.get("b"), StartsWith("Penguin 'b' is not defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( dict.insert("a", 1), StartsWith("Penguin 'a' already defined")); } } diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index d4049d6b270b26..2b9d0ad99c3477 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -22,21 +22,21 @@ struct AGIUnit2 : torch::nn::Module { }; } // namespace test -TEST_CASE("module/training-mode") { +CATCH_TEST_CASE("module/training-mode") { torch::manual_seed(0); Linear module(3, 
4); - REQUIRE(module->is_training()); - SECTION("Enable eval mode") { + CATCH_REQUIRE(module->is_training()); + CATCH_SECTION("Enable eval mode") { module->eval(); - REQUIRE(!module->is_training()); + CATCH_REQUIRE(!module->is_training()); } - SECTION("Enable train mode") { + CATCH_SECTION("Enable train mode") { module->train(); - REQUIRE(module->is_training()); + CATCH_REQUIRE(module->is_training()); } } -TEST_CASE("module/zero-grad") { +CATCH_TEST_CASE("module/zero-grad") { torch::manual_seed(0); Linear module(3, 4); auto weight = torch::ones({8, 3}, torch::requires_grad()); @@ -44,18 +44,18 @@ TEST_CASE("module/zero-grad") { loss.backward(); for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); - REQUIRE(grad.defined()); - REQUIRE(grad.sum().toCFloat() != 0); + CATCH_REQUIRE(grad.defined()); + CATCH_REQUIRE(grad.sum().toCFloat() != 0); } module->zero_grad(); for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); - REQUIRE(grad.defined()); - REQUIRE(grad.sum().toCFloat() == 0); + CATCH_REQUIRE(grad.defined()); + CATCH_REQUIRE(grad.sum().toCFloat() == 0); } } -TEST_CASE("module/zero-grad-with-undefined") { +CATCH_TEST_CASE("module/zero-grad-with-undefined") { struct TestModule : torch::nn::Module { TestModule() { x = register_parameter("x", torch::ones(5, at::requires_grad())); @@ -68,120 +68,120 @@ TEST_CASE("module/zero-grad-with-undefined") { auto z = module.x * 2; z.sum().backward(); - REQUIRE(module.x.grad().defined()); - REQUIRE(!module.y.grad().defined()); + CATCH_REQUIRE(module.x.grad().defined()); + CATCH_REQUIRE(!module.y.grad().defined()); module.zero_grad(); - REQUIRE(module.x.grad().defined()); - REQUIRE(!module.y.grad().defined()); + CATCH_REQUIRE(module.x.grad().defined()); + CATCH_REQUIRE(!module.y.grad().defined()); - REQUIRE(module.x.grad().sum().toCFloat() == 0); + CATCH_REQUIRE(module.x.grad().sum().toCFloat() == 0); } -TEST_CASE("module/name") { +CATCH_TEST_CASE("module/name") { // CHECK instead of REQUIRE because demangling may fail. AGIUnit agi; // Call it twice just to make sure there are no bugs in the lazy // initialization semantics. 
- CHECK(agi.name() == "AGIUnit"); - CHECK(agi.name() == "AGIUnit"); - SECTION("correctly demangled") { - CHECK(test::AGIUnit().name() == "test::AGIUnit"); - CHECK(test::AGIUnit2().name() == "Foo"); + CATCH_CHECK(agi.name() == "AGIUnit"); + CATCH_CHECK(agi.name() == "AGIUnit"); + CATCH_SECTION("correctly demangled") { + CATCH_CHECK(test::AGIUnit().name() == "test::AGIUnit"); + CATCH_CHECK(test::AGIUnit2().name() == "Foo"); } } -TEST_CASE("module/as") { +CATCH_TEST_CASE("module/as") { Linear module(3, 4); - REQUIRE(module->as() == module.get()); - REQUIRE(module->as() == module.get()); - REQUIRE(module->as() == module.get()); - REQUIRE(module->as() == nullptr); + CATCH_REQUIRE(module->as() == module.get()); + CATCH_REQUIRE(module->as() == module.get()); + CATCH_REQUIRE(module->as() == module.get()); + CATCH_REQUIRE(module->as() == nullptr); std::shared_ptr raw = module.ptr(); - REQUIRE(raw->as() == module.get()); - REQUIRE(raw->as() == module.get()); - REQUIRE(raw->as() == module.get()); - REQUIRE(raw->as() == nullptr); + CATCH_REQUIRE(raw->as() == module.get()); + CATCH_REQUIRE(raw->as() == module.get()); + CATCH_REQUIRE(raw->as() == module.get()); + CATCH_REQUIRE(raw->as() == nullptr); Module& raw_ref = *raw.get(); - REQUIRE(raw_ref.as() == module.get()); - REQUIRE(raw_ref.as() == module.get()); - REQUIRE(raw_ref.as() == module.get()); - REQUIRE(raw_ref.as() == nullptr); + CATCH_REQUIRE(raw_ref.as() == module.get()); + CATCH_REQUIRE(raw_ref.as() == module.get()); + CATCH_REQUIRE(raw_ref.as() == module.get()); + CATCH_REQUIRE(raw_ref.as() == nullptr); if (auto* linear = raw_ref.as()) { - REQUIRE(linear->weight.ndimension() == 2); + CATCH_REQUIRE(linear->weight.ndimension() == 2); } AGIUnit unit; - REQUIRE(unit.as() == nullptr); - REQUIRE(unit.as() == nullptr); - REQUIRE(unit.as() == &unit); + CATCH_REQUIRE(unit.as() == nullptr); + CATCH_REQUIRE(unit.as() == nullptr); + CATCH_REQUIRE(unit.as() == &unit); } -TEST_CASE("module/conversions", "[multi-cuda]") { +CATCH_TEST_CASE("module/conversions", "[multi-cuda]") { torch::manual_seed(0); Linear module(128, 64); - SECTION("starts as float on CPU") { + CATCH_SECTION("starts as float on CPU") { for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device() == torch::Device(torch::kCPU)); - REQUIRE(parameter->dtype() == torch::kFloat32); + CATCH_REQUIRE(parameter->device() == torch::Device(torch::kCPU)); + CATCH_REQUIRE(parameter->dtype() == torch::kFloat32); } } - SECTION("to(CUDA)") { + CATCH_SECTION("to(CUDA)") { module->to({torch::kCUDA, 0}); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); - REQUIRE(parameter->device().index() == 0); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); + CATCH_REQUIRE(parameter->device().index() == 0); } module->to({at::kCUDA, 1}); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); - REQUIRE(parameter->device().index() == 1); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); + CATCH_REQUIRE(parameter->device().index() == 1); } } - SECTION("to(CPU)") { + CATCH_SECTION("to(CPU)") { module->to(torch::Device(torch::kCPU)); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CPU); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CPU); } } - SECTION("to(Int32)") { + CATCH_SECTION("to(Int32)") { module->to(torch::kInt32); for (auto& parameter : 
module->parameters()) { - REQUIRE(parameter->dtype() == torch::kInt32); + CATCH_REQUIRE(parameter->dtype() == torch::kInt32); } } - SECTION("to(Float64)") { + CATCH_SECTION("to(Float64)") { module->to(torch::kFloat64); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->dtype() == torch::kFloat64); + CATCH_REQUIRE(parameter->dtype() == torch::kFloat64); } } - SECTION("to(CUDA, Byte)") { + CATCH_SECTION("to(CUDA, Byte)") { module->to(torch::Device(torch::kCUDA, 1), torch::kUInt8); for (auto& parameter : module->parameters()) { - REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); - REQUIRE(parameter->device().index() == 1); + CATCH_REQUIRE(parameter->device().type() == torch::Device::Type::CUDA); + CATCH_REQUIRE(parameter->device().index() == 1); } for (auto& parameter : module->parameters()) { - REQUIRE(parameter->dtype() == torch::kUInt8); + CATCH_REQUIRE(parameter->dtype() == torch::kUInt8); } } } -TEST_CASE("module/clone") { +CATCH_TEST_CASE("module/clone") { torch::manual_seed(0); - SECTION( + CATCH_SECTION( "a module that does not override clone() throws when clone() is called") { struct UnCloneable : Module {}; UnCloneable module; - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( module.clone(), StartsWith("clone() has not been implemented")); } - SECTION( + CATCH_SECTION( "a module that overrides clone() does not throw when clone() is called ") { struct Cloneable : Module { std::shared_ptr clone( @@ -190,10 +190,10 @@ TEST_CASE("module/clone") { } }; Cloneable module; - REQUIRE_NOTHROW(module.clone()); + CATCH_REQUIRE_NOTHROW(module.clone()); } - SECTION("Cloning creates distinct parameters") { + CATCH_SECTION("Cloning creates distinct parameters") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -216,32 +216,32 @@ TEST_CASE("module/clone") { auto module2 = module->clone(); auto params1 = module->parameters(); auto params2 = module2->parameters(); - REQUIRE(params1.size() == 6); - REQUIRE(params2.size() == 6); + CATCH_REQUIRE(params1.size() == 6); + CATCH_REQUIRE(params2.size() == 6); for (auto& param : params1) { - REQUIRE(!pointer_equal(param.value, params2[param.key])); - REQUIRE(param->allclose(params2[param.key])); + CATCH_REQUIRE(!pointer_equal(param.value, params2[param.key])); + CATCH_REQUIRE(param->allclose(params2[param.key])); param->add_(2); } for (auto& param : params1) { - REQUIRE(!param->allclose(params2[param.key])); + CATCH_REQUIRE(!param->allclose(params2[param.key])); } auto buffers1 = module->buffers(); auto buffers2 = module2->buffers(); - REQUIRE(buffers1.size() == 1); - REQUIRE(buffers2.size() == 1); + CATCH_REQUIRE(buffers1.size() == 1); + CATCH_REQUIRE(buffers2.size() == 1); for (auto& buffer : buffers1) { - REQUIRE(!pointer_equal(buffer.value, buffers2[buffer.key])); - REQUIRE(buffer->allclose(buffers2[buffer.key])); + CATCH_REQUIRE(!pointer_equal(buffer.value, buffers2[buffer.key])); + CATCH_REQUIRE(buffer->allclose(buffers2[buffer.key])); buffer->add_(2); } for (auto& buffer : buffers1) { - REQUIRE(!buffer->allclose(buffers2[buffer.key])); + CATCH_REQUIRE(!buffer->allclose(buffers2[buffer.key])); } } - SECTION("Cloning preserves external references") { + CATCH_SECTION("Cloning preserves external references") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -256,19 +256,19 @@ TEST_CASE("module/clone") { torch::NoGradGuard no_grad; module->weight += 1; } - REQUIRE(pointer_equal(module->weight, module->parameters()["weight"])); - 
REQUIRE(module->weight.allclose(module->parameters()["weight"])); + CATCH_REQUIRE(pointer_equal(module->weight, module->parameters()["weight"])); + CATCH_REQUIRE(module->weight.allclose(module->parameters()["weight"])); auto module2 = std::dynamic_pointer_cast( std::shared_ptr(module->clone())); - REQUIRE(!pointer_equal(module2->weight, module->weight)); - REQUIRE(pointer_equal(module2->weight, module2->parameters()["weight"])); - REQUIRE(module2->weight.allclose(module2->parameters()["weight"])); - REQUIRE(module2->weight.allclose(module->weight)); - REQUIRE(!pointer_equal(module2->weight, module->parameters()["weight"])); + CATCH_REQUIRE(!pointer_equal(module2->weight, module->weight)); + CATCH_REQUIRE(pointer_equal(module2->weight, module2->parameters()["weight"])); + CATCH_REQUIRE(module2->weight.allclose(module2->parameters()["weight"])); + CATCH_REQUIRE(module2->weight.allclose(module->weight)); + CATCH_REQUIRE(!pointer_equal(module2->weight, module->parameters()["weight"])); } - SECTION("Cloning copies the values of variables of submodules") { + CATCH_SECTION("Cloning copies the values of variables of submodules") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -299,16 +299,16 @@ TEST_CASE("module/clone") { auto b = std::dynamic_pointer_cast(a->clone()); - REQUIRE(!pointer_equal(b->module->weight, a->module->weight)); - REQUIRE( + CATCH_REQUIRE(!pointer_equal(b->module->weight, a->module->weight)); + CATCH_REQUIRE( pointer_equal(b->module->weight, b->module->parameters()["weight"])); - REQUIRE(b->module->parameters()["weight"].allclose(a->module->weight)); - REQUIRE(b->module->weight.allclose(a->module->weight)); - REQUIRE(b->module->value == a->module->value); + CATCH_REQUIRE(b->module->parameters()["weight"].allclose(a->module->weight)); + CATCH_REQUIRE(b->module->weight.allclose(a->module->weight)); + CATCH_REQUIRE(b->module->value == a->module->value); } } -TEST_CASE("module/clone-to-device", "[cuda]") { +CATCH_TEST_CASE("module/clone-to-device", "[cuda]") { struct TestModule : public Cloneable { TestModule() { reset(); @@ -324,7 +324,7 @@ TEST_CASE("module/clone-to-device", "[cuda]") { torch::Tensor buffer; }; - SECTION("Cloning preserves the device of parameters/buffers") { + CATCH_SECTION("Cloning preserves the device of parameters/buffers") { TestModule m; torch::Device device(torch::kCUDA, 0); @@ -332,33 +332,33 @@ TEST_CASE("module/clone-to-device", "[cuda]") { auto clone = m.clone(); for (const auto& parameter : clone->parameters()) { - REQUIRE(parameter->device().type() == device.type()); - REQUIRE(parameter->device().index() == device.index()); + CATCH_REQUIRE(parameter->device().type() == device.type()); + CATCH_REQUIRE(parameter->device().index() == device.index()); } for (const auto& buffer : clone->buffers()) { - REQUIRE(buffer->device().type() == device.type()); - REQUIRE(buffer->device().index() == device.index()); + CATCH_REQUIRE(buffer->device().type() == device.type()); + CATCH_REQUIRE(buffer->device().index() == device.index()); } } - SECTION( + CATCH_SECTION( "Cloning to a particular device places all parameters/buffers there") { TestModule m; torch::Device device(torch::kCUDA, 1); // everything is on CPU here auto clone = m.clone(device); for (const auto& parameter : clone->parameters()) { - REQUIRE(parameter->device().type() == device.type()); - REQUIRE(parameter->device().index() == device.index()); + CATCH_REQUIRE(parameter->device().type() == device.type()); + CATCH_REQUIRE(parameter->device().index() == device.index()); } for (const 
auto& buffer : clone->buffers()) { - REQUIRE(buffer->device().type() == device.type()); - REQUIRE(buffer->device().index() == device.index()); + CATCH_REQUIRE(buffer->device().type() == device.type()); + CATCH_REQUIRE(buffer->device().index() == device.index()); } } } -TEST_CASE("module/parameters") { +CATCH_TEST_CASE("module/parameters") { torch::manual_seed(0); struct TestModule : Module { TestModule() { @@ -372,19 +372,19 @@ TEST_CASE("module/parameters") { TestModule module; - SECTION("has correct number of parameters") { - REQUIRE(module.parameters().size() == 3); + CATCH_SECTION("has correct number of parameters") { + CATCH_REQUIRE(module.parameters().size() == 3); } - SECTION("contains parameters with the correct name") { + CATCH_SECTION("contains parameters with the correct name") { auto parameters = module.parameters(); - REQUIRE(parameters.contains("a")); - REQUIRE(parameters.contains("b")); - REQUIRE(parameters.contains("c")); + CATCH_REQUIRE(parameters.contains("a")); + CATCH_REQUIRE(parameters.contains("b")); + CATCH_REQUIRE(parameters.contains("c")); } } -TEST_CASE("module/buffers") { +CATCH_TEST_CASE("module/buffers") { torch::manual_seed(0); struct TestModule : Module { TestModule() { @@ -398,19 +398,19 @@ TEST_CASE("module/buffers") { TestModule module; - SECTION("has correct number of buffers") { - REQUIRE(module.buffers().size() == 3); + CATCH_SECTION("has correct number of buffers") { + CATCH_REQUIRE(module.buffers().size() == 3); } - SECTION("contains buffers with the correct name") { + CATCH_SECTION("contains buffers with the correct name") { auto buffers = module.buffers(); - REQUIRE(buffers.contains("a")); - REQUIRE(buffers.contains("b")); - REQUIRE(buffers.contains("c")); + CATCH_REQUIRE(buffers.contains("a")); + CATCH_REQUIRE(buffers.contains("b")); + CATCH_REQUIRE(buffers.contains("c")); } } -TEST_CASE("module/default-constructor") { +CATCH_TEST_CASE("module/default-constructor") { struct AImpl : torch::nn::Module { AImpl() : x_(123) {} AImpl(int x) : x_(x) {} @@ -420,20 +420,20 @@ TEST_CASE("module/default-constructor") { { A a; - REQUIRE(a); - REQUIRE(!a.is_empty()); - REQUIRE(a->x_ == 123); + CATCH_REQUIRE(a); + CATCH_REQUIRE(!a.is_empty()); + CATCH_REQUIRE(a->x_ == 123); } { A a(5); - REQUIRE(a); - REQUIRE(!a.is_empty()); - REQUIRE(a->x_ == 5); + CATCH_REQUIRE(a); + CATCH_REQUIRE(!a.is_empty()); + CATCH_REQUIRE(a->x_ == 5); } { A a = nullptr; - REQUIRE(!a); - REQUIRE(a.is_empty()); - REQUIRE_THROWS_WITH(a->x_, StartsWith("Accessing empty ModuleHolder")); + CATCH_REQUIRE(!a); + CATCH_REQUIRE(a.is_empty()); + CATCH_REQUIRE_THROWS_WITH(a->x_, StartsWith("Accessing empty ModuleHolder")); } } diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 928a39f7d8b44a..7d4f9ab05fe93e 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -39,92 +39,92 @@ class NestedModel : public torch::nn::Module { std::shared_ptr t; }; -TEST_CASE("modules") { +CATCH_TEST_CASE("modules") { torch::manual_seed(0); - SECTION("conv") { - SECTION("1d") { + CATCH_SECTION("conv") { + CATCH_SECTION("1d") { Conv1d model(Conv1dOptions(3, 2, 3).stride(2)); auto x = torch::randn({2, 3, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 3); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 3); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 3; i++) { - REQUIRE(y.size(i) == 2); + 
CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3); } - SECTION("2d") { - SECTION("even") { + CATCH_SECTION("2d") { + CATCH_SECTION("even") { Conv2d model(Conv2dOptions(3, 2, 3).stride(2)); auto x = torch::randn({2, 3, 5, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 4); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 4); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 4; i++) { - REQUIRE(y.size(i) == 2); + CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 3); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 3); } - SECTION("uneven") { + CATCH_SECTION("uneven") { Conv2d model(Conv2dOptions(3, 2, {3, 2}).stride({2, 2})); auto x = torch::randn({2, 3, 5, 4}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 4); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 4); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 4; i++) { - REQUIRE(y.size(i) == 2); + CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 2); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 2); } } - SECTION("3d") { + CATCH_SECTION("3d") { Conv3d model(Conv3dOptions(3, 2, 3).stride(2)); auto x = torch::randn({2, 3, 5, 5, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 5); - REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.ndimension() == 5); + CATCH_REQUIRE(s.ndimension() == 0); for (auto i = 0; i < 5; i++) { - REQUIRE(y.size(i) == 2); + CATCH_REQUIRE(y.size(i) == 2); } - REQUIRE( + CATCH_REQUIRE( model->parameters()["weight"].grad().numel() == 3 * 2 * 3 * 3 * 3); } } - SECTION("linear") { - SECTION("basic1") { + CATCH_SECTION("linear") { + CATCH_SECTION("basic1") { Linear model(5, 2); auto x = torch::randn({10, 5}, torch::requires_grad()); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); } } - SECTION("simple") { + CATCH_SECTION("simple") { auto model = std::make_shared(); auto l1 = model->add(Linear(10, 3), "l1"); auto l2 = model->add(Linear(3, 5), "l2"); @@ -136,20 +136,20 @@ TEST_CASE("modules") { x = l3->forward(x).clamp_min(0); x.backward(); - REQUIRE(x.ndimension() == 2); - REQUIRE(x.size(0) == 1000); - REQUIRE(x.size(1) == 100); - REQUIRE(x.min().toCFloat() == 0); + CATCH_REQUIRE(x.ndimension() == 2); + CATCH_REQUIRE(x.size(0) == 1000); + CATCH_REQUIRE(x.size(1) == 100); + CATCH_REQUIRE(x.min().toCFloat() == 0); } - SECTION("embedding") { - SECTION("basic") { + CATCH_SECTION("embedding") { + CATCH_SECTION("basic") { const int64_t dict_size = 10; Embedding model(dict_size, 2); - REQUIRE(model->parameters().contains("weight")); - REQUIRE(model->weight.ndimension() == 2); - REQUIRE(model->weight.size(0) == dict_size); - REQUIRE(model->weight.size(1) == 
2); + CATCH_REQUIRE(model->parameters().contains("weight")); + CATCH_REQUIRE(model->weight.ndimension() == 2); + CATCH_REQUIRE(model->weight.size(0) == dict_size); + CATCH_REQUIRE(model->weight.size(1) == 2); // Cannot get gradients to change indices (input) - only for embedding // params @@ -158,65 +158,65 @@ TEST_CASE("modules") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * dict_size); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * dict_size); } - SECTION("list") { + CATCH_SECTION("list") { Embedding model(6, 4); auto x = torch::full({2, 3}, 5, torch::kInt64); auto y = model->forward(x); torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 3); - REQUIRE(y.size(0) == 2); - REQUIRE(y.size(1) == 3); - REQUIRE(y.size(2) == 4); + CATCH_REQUIRE(y.ndimension() == 3); + CATCH_REQUIRE(y.size(0) == 2); + CATCH_REQUIRE(y.size(1) == 3); + CATCH_REQUIRE(y.size(2) == 4); } } - SECTION("dropout") { + CATCH_SECTION("dropout") { Dropout dropout(0.5); torch::Tensor x = torch::ones(100, torch::requires_grad()); torch::Tensor y = dropout->forward(x); y.backward(); - REQUIRE(y.ndimension() == 1); - REQUIRE(y.size(0) == 100); - REQUIRE(y.sum().toCFloat() < 130); // Probably - REQUIRE(y.sum().toCFloat() > 70); // Probably + CATCH_REQUIRE(y.ndimension() == 1); + CATCH_REQUIRE(y.size(0) == 100); + CATCH_REQUIRE(y.sum().toCFloat() < 130); // Probably + CATCH_REQUIRE(y.sum().toCFloat() > 70); // Probably dropout->eval(); y = dropout->forward(x); - REQUIRE(y.sum().toCFloat() == 100); + CATCH_REQUIRE(y.sum().toCFloat() == 100); } - SECTION("param") { + CATCH_SECTION("param") { auto model = std::make_shared(); auto parameters = model->parameters(); - REQUIRE(parameters["param"].size(0) == 3); - REQUIRE(parameters["param"].size(1) == 2); - REQUIRE(parameters["param"].size(2) == 21); - REQUIRE(parameters["l1.bias"].size(0) == 20); - REQUIRE(parameters["l1.weight"].size(0) == 20); - REQUIRE(parameters["l1.weight"].size(1) == 5); - REQUIRE(parameters["test.l1.bias"].size(0) == 3); - REQUIRE(parameters["test.l1.weight"].size(0) == 3); - REQUIRE(parameters["test.l1.weight"].size(1) == 10); - REQUIRE(parameters["test.l2.bias"].size(0) == 5); - REQUIRE(parameters["test.l2.weight"].size(0) == 5); - REQUIRE(parameters["test.l2.weight"].size(1) == 3); - REQUIRE(parameters["test.l3.bias"].size(0) == 100); - REQUIRE(parameters["test.l3.weight"].size(0) == 100); - REQUIRE(parameters["test.l3.weight"].size(1) == 5); + CATCH_REQUIRE(parameters["param"].size(0) == 3); + CATCH_REQUIRE(parameters["param"].size(1) == 2); + CATCH_REQUIRE(parameters["param"].size(2) == 21); + CATCH_REQUIRE(parameters["l1.bias"].size(0) == 20); + CATCH_REQUIRE(parameters["l1.weight"].size(0) == 20); + CATCH_REQUIRE(parameters["l1.weight"].size(1) == 5); + CATCH_REQUIRE(parameters["test.l1.bias"].size(0) == 3); + CATCH_REQUIRE(parameters["test.l1.weight"].size(0) == 3); + CATCH_REQUIRE(parameters["test.l1.weight"].size(1) == 10); + CATCH_REQUIRE(parameters["test.l2.bias"].size(0) == 5); + CATCH_REQUIRE(parameters["test.l2.weight"].size(0) == 5); + CATCH_REQUIRE(parameters["test.l2.weight"].size(1) == 3); + CATCH_REQUIRE(parameters["test.l3.bias"].size(0) == 100); + 
CATCH_REQUIRE(parameters["test.l3.weight"].size(0) == 100); + CATCH_REQUIRE(parameters["test.l3.weight"].size(1) == 5); } - SECTION("functional") { + CATCH_SECTION("functional") { { bool was_called = false; auto functional = Functional([&was_called](torch::Tensor input) { @@ -224,63 +224,63 @@ TEST_CASE("modules") { return input; }); auto output = functional->forward(torch::ones(5, torch::requires_grad())); - REQUIRE(was_called); - REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); + CATCH_REQUIRE(was_called); + CATCH_REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); was_called = false; // Use the call operator overload here. output = functional(torch::ones(5, torch::requires_grad())); - REQUIRE(was_called); - REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); + CATCH_REQUIRE(was_called); + CATCH_REQUIRE(output.equal(torch::ones(5, torch::requires_grad()))); } { auto functional = Functional(torch::relu); - REQUIRE(functional(torch::ones({})).toCFloat() == 1); - REQUIRE(functional(torch::ones({})).toCFloat() == 1); - REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); + CATCH_REQUIRE(functional(torch::ones({})).toCFloat() == 1); + CATCH_REQUIRE(functional(torch::ones({})).toCFloat() == 1); + CATCH_REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); - REQUIRE(functional(torch::ones({})).toCFloat() == 0); + CATCH_REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } - SECTION("batchnorm") { + CATCH_SECTION("batchnorm") { { BatchNorm bn(5); // Is stateful by default. - REQUIRE(bn->options.stateful()); + CATCH_REQUIRE(bn->options.stateful()); - REQUIRE(bn->running_mean.defined()); - REQUIRE(bn->running_mean.dim() == 1); - REQUIRE(bn->running_mean.size(0) == 5); + CATCH_REQUIRE(bn->running_mean.defined()); + CATCH_REQUIRE(bn->running_mean.dim() == 1); + CATCH_REQUIRE(bn->running_mean.size(0) == 5); - REQUIRE(bn->running_variance.defined()); - REQUIRE(bn->running_variance.dim() == 1); - REQUIRE(bn->running_variance.size(0) == 5); + CATCH_REQUIRE(bn->running_variance.defined()); + CATCH_REQUIRE(bn->running_variance.dim() == 1); + CATCH_REQUIRE(bn->running_variance.size(0) == 5); // Is affine by default. - REQUIRE(bn->options.affine()); + CATCH_REQUIRE(bn->options.affine()); - REQUIRE(bn->weight.defined()); - REQUIRE(bn->weight.dim() == 1); - REQUIRE(bn->weight.size(0) == 5); + CATCH_REQUIRE(bn->weight.defined()); + CATCH_REQUIRE(bn->weight.dim() == 1); + CATCH_REQUIRE(bn->weight.size(0) == 5); - REQUIRE(bn->bias.defined()); - REQUIRE(bn->bias.dim() == 1); - REQUIRE(bn->bias.size(0) == 5); + CATCH_REQUIRE(bn->bias.defined()); + CATCH_REQUIRE(bn->bias.dim() == 1); + CATCH_REQUIRE(bn->bias.size(0) == 5); } { BatchNorm bn(BatchNormOptions(5).stateful(false).affine(false)); - REQUIRE(!bn->running_mean.defined()); - REQUIRE(!bn->running_variance.defined()); - REQUIRE(!bn->weight.defined()); - REQUIRE(!bn->bias.defined()); + CATCH_REQUIRE(!bn->running_mean.defined()); + CATCH_REQUIRE(!bn->running_variance.defined()); + CATCH_REQUIRE(!bn->weight.defined()); + CATCH_REQUIRE(!bn->bias.defined()); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( bn->forward(torch::ones({2, 5})), StartsWith("Calling BatchNorm::forward is only permitted " "when the 'stateful' option is true (was false). 
" @@ -298,14 +298,14 @@ TEST_CASE("modules") { auto output = bn->pure_forward(input, mean, variance); auto expected = (input - mean) / torch::sqrt(variance + bn->options.eps()); - REQUIRE(output.allclose(expected)); + CATCH_REQUIRE(output.allclose(expected)); } } } -TEST_CASE("modules_cuda", "[cuda]") { +CATCH_TEST_CASE("modules_cuda", "[cuda]") { torch::manual_seed(0); - SECTION("1") { + CATCH_SECTION("1") { Linear model(5, 2); model->to(torch::kCUDA); auto x = @@ -314,15 +314,15 @@ TEST_CASE("modules_cuda", "[cuda]") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); } - SECTION("2") { + CATCH_SECTION("2") { Linear model(5, 2); model->to(torch::kCUDA); model->to(torch::kCPU); @@ -331,11 +331,11 @@ TEST_CASE("modules_cuda", "[cuda]") { torch::Tensor s = y.sum(); s.backward(); - REQUIRE(y.ndimension() == 2); - REQUIRE(s.ndimension() == 0); - REQUIRE(y.size(0) == 10); - REQUIRE(y.size(1) == 2); + CATCH_REQUIRE(y.ndimension() == 2); + CATCH_REQUIRE(s.ndimension() == 0); + CATCH_REQUIRE(y.size(0) == 10); + CATCH_REQUIRE(y.size(1) == 2); - REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); + CATCH_REQUIRE(model->parameters()["weight"].grad().numel() == 2 * 5); } } diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index ab278180b12b34..4cb398dd89dc82 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -118,24 +118,24 @@ void check_exact_values( optimizer.step(); if (i % kSampleEvery == 0) { - REQUIRE( + CATCH_REQUIRE( expected_parameters.at(i / kSampleEvery).size() == parameters.size()); for (size_t p = 0; p < parameters.size(); ++p) { - REQUIRE(parameters.at(p)->defined()); + CATCH_REQUIRE(parameters.at(p)->defined()); auto computed = parameters.at(p)->flatten(); auto expected = expected_parameters.at(i / kSampleEvery).at(p); if (!computed.allclose(expected, /*rtol=*/1e-3, /*atol=*/5e-4)) { std::cout << "Iteration " << i << ": " << computed << " != " << expected << " (parameter " << p << ")" << std::endl; - REQUIRE(false); + CATCH_REQUIRE(false); } } } } } -TEST_CASE("Optim/BasicInterface") { +CATCH_TEST_CASE("Optim/BasicInterface") { struct MyOptimizer : Optimizer { using Optimizer::Optimizer; void step() override {} @@ -144,139 +144,139 @@ TEST_CASE("Optim/BasicInterface") { torch::ones({2, 3}), torch::zeros({2, 3}), torch::rand({2, 3})}; { MyOptimizer optimizer(parameters); - REQUIRE(optimizer.size() == parameters.size()); + CATCH_REQUIRE(optimizer.size() == parameters.size()); } { MyOptimizer optimizer; - REQUIRE(optimizer.size() == 0); + CATCH_REQUIRE(optimizer.size() == 0); optimizer.add_parameters(parameters); - REQUIRE(optimizer.size() == parameters.size()); + CATCH_REQUIRE(optimizer.size() == parameters.size()); for (size_t p = 0; p < parameters.size(); ++p) { - REQUIRE(optimizer.parameters()[p].allclose(parameters[p])); + CATCH_REQUIRE(optimizer.parameters()[p].allclose(parameters[p])); } } { Linear linear(3, 4); MyOptimizer optimizer(linear->parameters()); - REQUIRE(optimizer.size() == linear->parameters().size()); + CATCH_REQUIRE(optimizer.size() == linear->parameters().size()); } } 
-TEST_CASE("Optim/XORConvergence/SGD") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/SGD") { + CATCH_REQUIRE(test_optimizer_xor( SGDOptions(0.1).momentum(0.9).nesterov(true).weight_decay(1e-6))); } -TEST_CASE("Optim/XORConvergence/Adagrad") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/Adagrad") { + CATCH_REQUIRE(test_optimizer_xor( AdagradOptions(1.0).weight_decay(1e-6).lr_decay(1e-3))); } -TEST_CASE("Optim/XORConvergence/RMSprop") { - REQUIRE(test_optimizer_xor(RMSpropOptions(0.1).centered(true))); +CATCH_TEST_CASE("Optim/XORConvergence/RMSprop") { + CATCH_REQUIRE(test_optimizer_xor(RMSpropOptions(0.1).centered(true))); } -TEST_CASE("Optim/XORConvergence/RMSpropWithMomentum") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/RMSpropWithMomentum") { + CATCH_REQUIRE(test_optimizer_xor( RMSpropOptions(0.1).momentum(0.9).weight_decay(1e-6))); } -TEST_CASE("Optim/XORConvergence/Adam") { - REQUIRE(test_optimizer_xor(AdamOptions(0.1).weight_decay(1e-6))); +CATCH_TEST_CASE("Optim/XORConvergence/Adam") { + CATCH_REQUIRE(test_optimizer_xor(AdamOptions(0.1).weight_decay(1e-6))); } -TEST_CASE("Optim/XORConvergence/AdamWithAmsgrad") { - REQUIRE(test_optimizer_xor( +CATCH_TEST_CASE("Optim/XORConvergence/AdamWithAmsgrad") { + CATCH_REQUIRE(test_optimizer_xor( AdamOptions(0.1).weight_decay(1e-6).amsgrad(true))); } -TEST_CASE("Optim/ProducesPyTorchValues/Adam") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/Adam") { check_exact_values(AdamOptions(1.0), expected_parameters::Adam); } -TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecay") { check_exact_values( AdamOptions(1.0).weight_decay(1e-2), expected_parameters::Adam_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecayAndAMSGrad") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdamWithWeightDecayAndAMSGrad") { check_exact_values( AdamOptions(1.0).weight_decay(1e-6).amsgrad(true), expected_parameters::Adam_with_weight_decay_and_amsgrad); } -TEST_CASE("Optim/ProducesPyTorchValues/Adagrad") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/Adagrad") { check_exact_values( AdagradOptions(1.0), expected_parameters::Adagrad); } -TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecay") { check_exact_values( AdagradOptions(1.0).weight_decay(1e-2), expected_parameters::Adagrad_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecayAndLRDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/AdagradWithWeightDecayAndLRDecay") { check_exact_values( AdagradOptions(1.0).weight_decay(1e-6).lr_decay(1e-3), expected_parameters::Adagrad_with_weight_decay_and_lr_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/RMSprop") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/RMSprop") { check_exact_values( RMSpropOptions(0.1), expected_parameters::RMSprop); } -TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecay") { check_exact_values( RMSpropOptions(0.1).weight_decay(1e-2), expected_parameters::RMSprop_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecayAndCentered") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/RMSpropWithWeightDecayAndCentered") { check_exact_values( RMSpropOptions(0.1).weight_decay(1e-6).centered(true), 
expected_parameters::RMSprop_with_weight_decay_and_centered); } -TEST_CASE( +CATCH_TEST_CASE( "Optim/ProducesPyTorchValues/RMSpropWithWeightDecayAndCenteredAndMomentum") { check_exact_values( RMSpropOptions(0.1).weight_decay(1e-6).centered(true).momentum(0.9), expected_parameters::RMSprop_with_weight_decay_and_centered_and_momentum); } -TEST_CASE("Optim/ProducesPyTorchValues/SGD") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGD") { check_exact_values(SGDOptions(0.1), expected_parameters::SGD); } -TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecay") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecay") { check_exact_values( SGDOptions(0.1).weight_decay(1e-2), expected_parameters::SGD_with_weight_decay); } -TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndMomentum") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndMomentum") { check_exact_values( SGDOptions(0.1).weight_decay(1e-2).momentum(0.9), expected_parameters::SGD_with_weight_decay_and_momentum); } -TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndNesterovMomentum") { +CATCH_TEST_CASE("Optim/ProducesPyTorchValues/SGDWithWeightDecayAndNesterovMomentum") { check_exact_values( SGDOptions(0.1).weight_decay(1e-6).momentum(0.9).nesterov(true), expected_parameters::SGD_with_weight_decay_and_nesterov_momentum); } -TEST_CASE("Optim/ZeroGrad") { +CATCH_TEST_CASE("Optim/ZeroGrad") { torch::manual_seed(0); Linear model(2, 8); SGD optimizer(model->parameters(), 0.1); for (const auto& parameter : model->parameters()) { - REQUIRE(!parameter->grad().defined()); + CATCH_REQUIRE(!parameter->grad().defined()); } auto output = model->forward(torch::ones({5, 2})); @@ -284,19 +284,19 @@ TEST_CASE("Optim/ZeroGrad") { loss.backward(); for (const auto& parameter : model->parameters()) { - REQUIRE(parameter->grad().defined()); - REQUIRE(parameter->grad().sum().toCFloat() > 0); + CATCH_REQUIRE(parameter->grad().defined()); + CATCH_REQUIRE(parameter->grad().sum().toCFloat() > 0); } optimizer.zero_grad(); for (const auto& parameter : model->parameters()) { - REQUIRE(parameter->grad().defined()); - REQUIRE(parameter->grad().sum().toCFloat() == 0); + CATCH_REQUIRE(parameter->grad().defined()); + CATCH_REQUIRE(parameter->grad().sum().toCFloat() == 0); } } -TEST_CASE("Optim/ExternalVectorOfParameters") { +CATCH_TEST_CASE("Optim/ExternalVectorOfParameters") { torch::manual_seed(0); std::vector parameters = { @@ -313,12 +313,12 @@ TEST_CASE("Optim/ExternalVectorOfParameters") { optimizer.step(); - REQUIRE(parameters[0].allclose(original_parameters[0] - 1.0)); - REQUIRE(parameters[1].allclose(original_parameters[1] - 1.0)); - REQUIRE(parameters[2].allclose(original_parameters[2] - 1.0)); + CATCH_REQUIRE(parameters[0].allclose(original_parameters[0] - 1.0)); + CATCH_REQUIRE(parameters[1].allclose(original_parameters[1] - 1.0)); + CATCH_REQUIRE(parameters[2].allclose(original_parameters[2] - 1.0)); } -TEST_CASE("Optim/AddParameter/LBFGS") { +CATCH_TEST_CASE("Optim/AddParameter/LBFGS") { torch::manual_seed(0); std::vector parameters = {torch::randn({5, 5})}; diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index a1517586fc9718..33e3a16d9a8040 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -1,4 +1,4 @@ -#include +#include "catch_utils.hpp" #include #include @@ -19,92 +19,92 @@ using namespace torch::nn; #ifdef USE_CUDA -TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { Scatter 
scatter( {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); auto input = torch::ones(10, torch::requires_grad(true)); auto output = scatter.apply({input}); - REQUIRE(output.size() == 2); - REQUIRE(output[0].size(0) == 5); - REQUIRE(output[1].size(0) == 5); + CATCH_REQUIRE(output.size() == 2); + CATCH_REQUIRE(output[0].size(0) == 5); + CATCH_REQUIRE(output[1].size(0) == 5); - REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)}) + CATCH_REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)}) .allclose(input)); auto sum = output[0].to({torch::kCUDA, 1}) + output[1]; sum.backward(); - REQUIRE(input.grad().defined()); - REQUIRE(input.grad().device().is_cpu()); - REQUIRE(input.grad().sum().toCInt() == 10); + CATCH_REQUIRE(input.grad().defined()); + CATCH_REQUIRE(input.grad().device().is_cpu()); + CATCH_REQUIRE(input.grad().sum().toCInt() == 10); } -TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") { Gather gather(torch::Device(torch::kCUDA, 1)); auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0})); auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1})); auto outputs = gather.apply({a, b}); - REQUIRE(outputs.size() == 1); + CATCH_REQUIRE(outputs.size() == 1); auto& output = outputs.front(); - REQUIRE(output.size(0) == 10); - REQUIRE(output.device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(output.size(0) == 10); + CATCH_REQUIRE(output.device() == torch::Device(torch::kCUDA, 1)); auto chunks = output.chunk(2); - REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a)); - REQUIRE(chunks[1].allclose(b)); + CATCH_REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a)); + CATCH_REQUIRE(chunks[1].allclose(b)); output.backward(); - REQUIRE(a.grad().defined()); - REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0)); - REQUIRE(a.grad().sum().toCInt() == 5); + CATCH_REQUIRE(a.grad().defined()); + CATCH_REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(a.grad().sum().toCInt() == 5); - REQUIRE(b.grad().defined()); - REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1)); - REQUIRE(b.grad().sum().toCInt() == 5); + CATCH_REQUIRE(b.grad().defined()); + CATCH_REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(b.grad().sum().toCInt() == 5); } -TEST_CASE("Parallel/Replicate", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/Replicate", "[multi-cuda]") { Linear linear(3, 4); auto replicas = parallel::replicate( linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); - REQUIRE(replicas.size() == 2); + CATCH_REQUIRE(replicas.size() == 2); auto original_parameters = linear->parameters(); auto replica1_parameters = replicas[0]->parameters(); for (auto& parameter : replica1_parameters) { - REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0)); } replicas[0]->to(torch::kCPU); - REQUIRE(replica1_parameters.size() == original_parameters.size()); + CATCH_REQUIRE(replica1_parameters.size() == original_parameters.size()); for (size_t i = 0; i < original_parameters.size(); ++i) { - REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i])); - REQUIRE( + CATCH_REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i])); + CATCH_REQUIRE( replica1_parameters[i].data() != original_parameters[i].data()); } auto replica2_parameters = replicas[1]->parameters(); for (auto& 
parameter : replica2_parameters) { - REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1)); } replicas[1]->to(torch::kCPU); - REQUIRE(replica2_parameters.size() == original_parameters.size()); + CATCH_REQUIRE(replica2_parameters.size() == original_parameters.size()); for (size_t i = 0; i < original_parameters.size(); ++i) { - REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i])); - REQUIRE( + CATCH_REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i])); + CATCH_REQUIRE( replica2_parameters[i].data() != original_parameters[i].data()); } } -TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { Linear a(3, 4); Linear b(std::static_pointer_cast(a->clone())); @@ -121,17 +121,17 @@ TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { auto outputs = parallel::parallel_apply(modules, inputs); - REQUIRE(outputs.size() == 3); - REQUIRE(outputs[0].device().is_cpu()); + CATCH_REQUIRE(outputs.size() == 3); + CATCH_REQUIRE(outputs[0].device().is_cpu()); - REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); - REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0])); + CATCH_REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0])); - REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1)); - REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0])); + CATCH_REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0])); } -TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { struct M : torch::nn::Module { torch::Tensor forward(torch::Tensor input) { return torch::ones({5}, torch::dtype(torch::kInt32)); @@ -147,17 +147,17 @@ TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { auto outputs = parallel::parallel_apply(modules, inputs, devices); - REQUIRE(outputs.size() == 3); - REQUIRE(outputs[0].device().is_cuda()); - REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1)); + CATCH_REQUIRE(outputs.size() == 3); + CATCH_REQUIRE(outputs[0].device().is_cuda()); + CATCH_REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1)); - REQUIRE(outputs[1].device().is_cuda()); - REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + CATCH_REQUIRE(outputs[1].device().is_cuda()); + CATCH_REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); - REQUIRE(outputs[2].device().is_cpu()); + CATCH_REQUIRE(outputs[2].device().is_cpu()); } -TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { +CATCH_TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { struct M : torch::nn::Cloneable { void reset() override {} torch::Tensor forward(torch::Tensor input) { @@ -167,11 +167,11 @@ TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { auto m = std::make_shared(); auto input = torch::ones({10, 3}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( parallel::data_parallel(m, input), StartsWith("Badness!")); } -TEST_CASE( +CATCH_TEST_CASE( "Parallel/DataParallelPlacesTheOutputOnTheRequestedDevice", "[multi-cuda]") { struct M : torch::nn::Cloneable { @@ -192,9 +192,9 @@ TEST_CASE( input, /*devices=*/at::nullopt, /*output_device=*/torch::Device(torch::kCUDA, 1)); - REQUIRE(output.defined()); - 
-    REQUIRE(output.device().is_cuda());
-    REQUIRE(output.device().index() == 1);
+    CATCH_REQUIRE(output.defined());
+    CATCH_REQUIRE(output.device().is_cuda());
+    CATCH_REQUIRE(output.device().index() == 1);
   }
   {
     // Verify for the single-device case (where we don't scatter/gather).
@@ -203,16 +203,16 @@ TEST_CASE(
         input,
         /*devices=*/std::vector{torch::Device(torch::kCUDA, 0)},
         /*output_device=*/torch::Device(torch::kCUDA, 1));
-    REQUIRE(m->intermediate_tensor.defined());
-    REQUIRE(m->intermediate_tensor.device().is_cuda());
-    REQUIRE(m->intermediate_tensor.device().index() == 0);
-    REQUIRE(output.defined());
-    REQUIRE(output.device().is_cuda());
-    REQUIRE(output.device().index() == 1);
+    CATCH_REQUIRE(m->intermediate_tensor.defined());
+    CATCH_REQUIRE(m->intermediate_tensor.device().is_cuda());
+    CATCH_REQUIRE(m->intermediate_tensor.device().index() == 0);
+    CATCH_REQUIRE(output.defined());
+    CATCH_REQUIRE(output.device().is_cuda());
+    CATCH_REQUIRE(output.device().index() == 1);
   }
 }
-TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
+CATCH_TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
   struct M : torch::nn::Cloneable {
     void reset() override {}
     torch::Tensor forward(torch::Tensor input) {
@@ -225,9 +225,9 @@ TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
   auto output = parallel::data_parallel(m, input);
   const auto device_count = torch::cuda::device_count();
-  REQUIRE(output.numel() == device_count);
+  CATCH_REQUIRE(output.numel() == device_count);
   for (size_t i = 0; i < device_count; ++i) {
-    REQUIRE(output[i].toCInt() == i);
+    CATCH_REQUIRE(output[i].toCInt() == i);
   }
 }
diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp
index 96685728484a39..a307851afbae46 100644
--- a/test/cpp/api/rnn.cpp
+++ b/test/cpp/api/rnn.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
@@ -71,22 +71,22 @@ void check_lstm_sizes(RNNOutput output) {
   // Expect the LSTM to have 64 outputs and 3 layers, with an input of batch
   // 10 and 16 time steps (10 x 16 x n)
-  REQUIRE(output.output.ndimension() == 3);
-  REQUIRE(output.output.size(0) == 10);
-  REQUIRE(output.output.size(1) == 16);
-  REQUIRE(output.output.size(2) == 64);
+  CATCH_REQUIRE(output.output.ndimension() == 3);
+  CATCH_REQUIRE(output.output.size(0) == 10);
+  CATCH_REQUIRE(output.output.size(1) == 16);
+  CATCH_REQUIRE(output.output.size(2) == 64);
-  REQUIRE(output.state.ndimension() == 4);
-  REQUIRE(output.state.size(0) == 2); // (hx, cx)
-  REQUIRE(output.state.size(1) == 3); // layers
-  REQUIRE(output.state.size(2) == 16); // Batchsize
-  REQUIRE(output.state.size(3) == 64); // 64 hidden dims
+  CATCH_REQUIRE(output.state.ndimension() == 4);
+  CATCH_REQUIRE(output.state.size(0) == 2); // (hx, cx)
+  CATCH_REQUIRE(output.state.size(1) == 3); // layers
+  CATCH_REQUIRE(output.state.size(2) == 16); // Batchsize
+  CATCH_REQUIRE(output.state.size(3) == 64); // 64 hidden dims
   // Something is in the hiddens
-  REQUIRE(output.state.norm().toCFloat() > 0);
+  CATCH_REQUIRE(output.state.norm().toCFloat() > 0);
 }
-TEST_CASE("RNN/CheckOutputSizes") {
+CATCH_TEST_CASE("RNN/CheckOutputSizes") {
   torch::manual_seed(0);
   LSTM model(LSTMOptions(128, 64).layers(3).dropout(0.2));
   // Input size is: sequence length, batch size, input size
@@ -104,10 +104,10 @@ TEST_CASE("RNN/CheckOutputSizes") {
   torch::Tensor diff = next.state - output.state;
   // Hiddens changed
-  REQUIRE(diff.abs().sum().toCFloat() > 1e-3);
+  CATCH_REQUIRE(diff.abs().sum().toCFloat() > 1e-3);
 }
-TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { +CATCH_TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { torch::manual_seed(0); // Make sure the outputs match pytorch outputs LSTM model(2, 2); @@ -127,10 +127,10 @@ TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { } auto out = model->forward(x); - REQUIRE(out.output.ndimension() == 3); - REQUIRE(out.output.size(0) == 3); - REQUIRE(out.output.size(1) == 4); - REQUIRE(out.output.size(2) == 2); + CATCH_REQUIRE(out.output.ndimension() == 3); + CATCH_REQUIRE(out.output.size(0) == 3); + CATCH_REQUIRE(out.output.size(1) == 4); + CATCH_REQUIRE(out.output.size(2) == 2); auto flat = out.output.view(3 * 4 * 2); float c_out[] = {0.4391, 0.5402, 0.4330, 0.5324, 0.4261, 0.5239, @@ -138,14 +138,14 @@ TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { 0.6620, 0.7860, 0.6501, 0.7741, 0.7889, 0.9003, 0.7769, 0.8905, 0.7635, 0.8794, 0.7484, 0.8666}; for (size_t i = 0; i < 3 * 4 * 2; i++) { - REQUIRE(std::abs(flat[i].toCFloat() - c_out[i]) < 1e-3); + CATCH_REQUIRE(std::abs(flat[i].toCFloat() - c_out[i]) < 1e-3); } - REQUIRE(out.state.ndimension() == 4); // (hx, cx) x layers x B x 2 - REQUIRE(out.state.size(0) == 2); - REQUIRE(out.state.size(1) == 1); - REQUIRE(out.state.size(2) == 4); - REQUIRE(out.state.size(3) == 2); + CATCH_REQUIRE(out.state.ndimension() == 4); // (hx, cx) x layers x B x 2 + CATCH_REQUIRE(out.state.size(0) == 2); + CATCH_REQUIRE(out.state.size(1) == 1); + CATCH_REQUIRE(out.state.size(2) == 4); + CATCH_REQUIRE(out.state.size(3) == 2); flat = out.state.view(16); float h_out[] = {0.7889, 0.9003, @@ -164,33 +164,33 @@ TEST_CASE("RNN/CheckOutputValuesMatchPyTorch") { 1.0931, 1.4911}; for (size_t i = 0; i < 16; i++) { - REQUIRE(std::abs(flat[i].toCFloat() - h_out[i]) < 1e-3); + CATCH_REQUIRE(std::abs(flat[i].toCFloat() - h_out[i]) < 1e-3); } } -TEST_CASE("RNN/integration/LSTM") { - REQUIRE(test_RNN_xor( +CATCH_TEST_CASE("RNN/integration/LSTM") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return LSTM(LSTMOptions(s, s).layers(2)); })); } -TEST_CASE("RNN/integration/GRU") { - REQUIRE( +CATCH_TEST_CASE("RNN/integration/GRU") { + CATCH_REQUIRE( test_RNN_xor([](int s) { return GRU(GRUOptions(s, s).layers(2)); })); } -TEST_CASE("RNN/integration/RNN") { - SECTION("relu") { - REQUIRE(test_RNN_xor( +CATCH_TEST_CASE("RNN/integration/RNN") { + CATCH_SECTION("relu") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return RNN(RNNOptions(s, s).relu().layers(2)); })); } - SECTION("tanh") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("tanh") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return RNN(RNNOptions(s, s).tanh().layers(2)); })); } } -TEST_CASE("rnn_cuda", "[cuda]") { - SECTION("sizes") { +CATCH_TEST_CASE("rnn_cuda", "[cuda]") { + CATCH_SECTION("sizes") { torch::manual_seed(0); LSTM model(LSTMOptions(128, 64).layers(3).dropout(0.2)); model->to(torch::kCUDA); @@ -209,26 +209,26 @@ TEST_CASE("rnn_cuda", "[cuda]") { torch::Tensor diff = next.state - output.state; // Hiddens changed - REQUIRE(diff.abs().sum().toCFloat() > 1e-3); + CATCH_REQUIRE(diff.abs().sum().toCFloat() > 1e-3); } - SECTION("lstm") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("lstm") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return LSTM(LSTMOptions(s, s).layers(2)); }, true)); } - SECTION("gru") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("gru") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return GRU(GRUOptions(s, s).layers(2)); }, true)); } - SECTION("rnn") { - SECTION("relu") { - REQUIRE(test_RNN_xor( + CATCH_SECTION("rnn") { + CATCH_SECTION("relu") { + CATCH_REQUIRE(test_RNN_xor( [](int s) { return 
          [](int s) { return RNN(RNNOptions(s, s).relu().layers(2)); }, true));
    }
-    SECTION("tanh") {
-      REQUIRE(test_RNN_xor(
+    CATCH_SECTION("tanh") {
+      CATCH_REQUIRE(test_RNN_xor(
          [](int s) { return RNN(RNNOptions(s, s).tanh().layers(2)); }, true));
    }
  }
diff --git a/test/cpp/api/sequential.cpp b/test/cpp/api/sequential.cpp
index aef1332720e4a4..777d6e28024b9b 100644
--- a/test/cpp/api/sequential.cpp
+++ b/test/cpp/api/sequential.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
@@ -21,7 +21,7 @@ using namespace torch::test;
 using Catch::StartsWith;
-TEST_CASE("Sequential/ConstructsFromSharedPointer") {
+CATCH_TEST_CASE("Sequential/ConstructsFromSharedPointer") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int value;
@@ -31,10 +31,10 @@ TEST_CASE("Sequential/ConstructsFromSharedPointer") {
   };
   Sequential sequential(
       std::make_shared(1), std::make_shared(2), std::make_shared(3));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/ConstructsFromConcreteType") {
+CATCH_TEST_CASE("Sequential/ConstructsFromConcreteType") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int value;
@@ -44,9 +44,9 @@ TEST_CASE("Sequential/ConstructsFromConcreteType") {
   };
   Sequential sequential(M(1), M(2), M(3));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/ConstructsFromModuleHolder") {
+CATCH_TEST_CASE("Sequential/ConstructsFromModuleHolder") {
   struct MImpl : torch::nn::Module {
     explicit MImpl(int value_) : value(value_) {}
     int forward() {
@@ -61,10 +61,10 @@ TEST_CASE("Sequential/ConstructsFromModuleHolder") {
   };
   Sequential sequential(M(1), M(2), M(3));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/PushBackAddsAnElement") {
+CATCH_TEST_CASE("Sequential/PushBackAddsAnElement") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int forward() {
@@ -73,17 +73,17 @@ TEST_CASE("Sequential/PushBackAddsAnElement") {
     int value;
   };
   Sequential sequential;
-  REQUIRE(sequential->size() == 0);
-  REQUIRE(sequential->is_empty());
+  CATCH_REQUIRE(sequential->size() == 0);
+  CATCH_REQUIRE(sequential->is_empty());
   sequential->push_back(Linear(3, 4));
-  REQUIRE(sequential->size() == 1);
+  CATCH_REQUIRE(sequential->size() == 1);
   sequential->push_back(std::make_shared(1));
-  REQUIRE(sequential->size() == 2);
+  CATCH_REQUIRE(sequential->size() == 2);
   sequential->push_back(M(2));
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
 }
-TEST_CASE("Sequential/AccessWithAt") {
+CATCH_TEST_CASE("Sequential/AccessWithAt") {
   struct M : torch::nn::Module {
     explicit M(int value_) : value(value_) {}
     int forward() {
@@ -98,22 +98,22 @@ TEST_CASE("Sequential/AccessWithAt") {
   for (auto& module : modules) {
     sequential->push_back(module);
   }
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
   // returns the correct module for a given index
   for (size_t i = 0; i < modules.size(); ++i) {
-    REQUIRE(&sequential->at(i) == modules[i].get());
+    CATCH_REQUIRE(&sequential->at(i) == modules[i].get());
   }
   // throws for a bad index
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
       sequential->at(modules.size() + 1), StartsWith("Index out of range"));
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
       sequential->at(modules.size() + 1000000),
       StartsWith("Index out of range"));
 }
-TEST_CASE("Sequential/AccessWithPtr") {
+CATCH_TEST_CASE("Sequential/AccessWithPtr") {
   struct M : torch::nn::Module {
    explicit M(int value_) : value(value_) {}
    int forward() {
@@ -128,46 +128,46 @@ TEST_CASE("Sequential/AccessWithPtr") {
   for (auto& module : modules) {
     sequential->push_back(module);
   }
-  REQUIRE(sequential->size() == 3);
+  CATCH_REQUIRE(sequential->size() == 3);
   // returns the correct module for a given index
   for (size_t i = 0; i < modules.size(); ++i) {
-    REQUIRE(sequential->ptr(i).get() == modules[i].get());
-    REQUIRE(sequential[i].get() == modules[i].get());
-    REQUIRE(sequential->ptr(i).get() == modules[i].get());
+    CATCH_REQUIRE(sequential->ptr(i).get() == modules[i].get());
+    CATCH_REQUIRE(sequential[i].get() == modules[i].get());
+    CATCH_REQUIRE(sequential->ptr(i).get() == modules[i].get());
   }
   // throws for a bad index
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
      sequential->ptr(modules.size() + 1), StartsWith("Index out of range"));
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
      sequential->ptr(modules.size() + 1000000),
      StartsWith("Index out of range"));
 }
-TEST_CASE("Sequential/CallingForwardOnEmptySequentialIsDisallowed") {
+CATCH_TEST_CASE("Sequential/CallingForwardOnEmptySequentialIsDisallowed") {
   Sequential empty;
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE_THROWS_WITH(
      empty->forward(),
      StartsWith("Cannot call forward() on an empty Sequential"));
 }
-TEST_CASE("Sequential/CallingForwardChainsCorrectly") {
+CATCH_TEST_CASE("Sequential/CallingForwardChainsCorrectly") {
   struct MockModule : torch::nn::Module {
     explicit MockModule(int value) : expected(value) {}
     int expected;
     int forward(int value) {
-      REQUIRE(value == expected);
+      CATCH_REQUIRE(value == expected);
       return value + 1;
     }
   };
   Sequential sequential(MockModule{1}, MockModule{2}, MockModule{3});
-  REQUIRE(sequential->forward(1) == 4);
+  CATCH_REQUIRE(sequential->forward(1) == 4);
 }
-TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") {
+CATCH_TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") {
   struct M : public torch::nn::Module {
     int forward() {
       return 5;
@@ -175,14 +175,14 @@ TEST_CASE("Sequential/CallingForwardWithTheWrongReturnTypeThrows") {
   };
   Sequential sequential(M{});
-  REQUIRE(sequential->forward() == 5);
-  REQUIRE_THROWS_WITH(
+  CATCH_REQUIRE(sequential->forward() == 5);
+  CATCH_REQUIRE_THROWS_WITH(
      sequential->forward(),
      StartsWith("The type of the return value "
                 "is int, but you asked for type float"));
 }
-TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") {
+CATCH_TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") {
   struct M : public torch::nn::Module {
     torch::Tensor forward(torch::Tensor v) {
       return v;
@@ -191,21 +191,21 @@ TEST_CASE("Sequential/TheReturnTypeOfForwardDefaultsToTensor") {
   Sequential sequential(M{});
   auto variable = torch::ones({3, 3}, torch::requires_grad());
-  REQUIRE(sequential->forward(variable).equal(variable));
+  CATCH_REQUIRE(sequential->forward(variable).equal(variable));
 }
-TEST_CASE("Sequential/ForwardReturnsTheLastValue") {
+CATCH_TEST_CASE("Sequential/ForwardReturnsTheLastValue") {
   torch::manual_seed(0);
   Sequential sequential(Linear(10, 3), Linear(3, 5), Linear(5, 100));
   auto x = torch::randn({1000, 10}, torch::requires_grad());
   auto y = sequential->forward(x);
-  REQUIRE(y.ndimension() == 2);
-  REQUIRE(y.size(0) == 1000);
-  REQUIRE(y.size(1) == 100);
+  CATCH_REQUIRE(y.ndimension() == 2);
+  CATCH_REQUIRE(y.size(0) == 1000);
+  CATCH_REQUIRE(y.size(1) == 100);
 }
-TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") {
+CATCH_TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") {
   Sequential sequential(
      Linear(10, 3),
      Conv2d(1, 2, 3),
@@ -215,7 +215,7 @@ TEST_CASE("Sequential/SanityCheckForHoldingStandardModules") {
      LSTM(4, 5));
 }
-TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") {
+CATCH_TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") {
   struct A : torch::nn::Module {
     int forward(int x) {
       return x;
@@ -240,34 +240,34 @@ TEST_CASE("Sequential/ExtendPushesModulesFromOtherSequential") {
   Sequential b(C{}, D{});
   a->extend(*b);
-  REQUIRE(a->size() == 4);
-  REQUIRE(a[0]->as());
-  REQUIRE(a[1]->as());
-  REQUIRE(a[2]->as());
-  REQUIRE(a[3]->as());
+  CATCH_REQUIRE(a->size() == 4);
+  CATCH_REQUIRE(a[0]->as());
+  CATCH_REQUIRE(a[1]->as());
+  CATCH_REQUIRE(a[2]->as());
+  CATCH_REQUIRE(a[3]->as());
-  REQUIRE(b->size() == 2);
-  REQUIRE(b[0]->as());
-  REQUIRE(b[1]->as());
+  CATCH_REQUIRE(b->size() == 2);
+  CATCH_REQUIRE(b[0]->as());
+  CATCH_REQUIRE(b[1]->as());
   std::vector> c = {std::make_shared(), std::make_shared()};
   b->extend(c);
-  REQUIRE(b->size() == 4);
-  REQUIRE(b[0]->as());
-  REQUIRE(b[1]->as());
-  REQUIRE(b[2]->as());
-  REQUIRE(b[3]->as());
+  CATCH_REQUIRE(b->size() == 4);
+  CATCH_REQUIRE(b[0]->as());
+  CATCH_REQUIRE(b[1]->as());
+  CATCH_REQUIRE(b[2]->as());
+  CATCH_REQUIRE(b[3]->as());
 }
-TEST_CASE("Sequential/HasReferenceSemantics") {
+CATCH_TEST_CASE("Sequential/HasReferenceSemantics") {
   Sequential first(Linear(2, 3), Linear(4, 4), Linear(4, 5));
   Sequential second(first);
-  REQUIRE(first.get() == second.get());
-  REQUIRE(first->size() == second->size());
-  REQUIRE(std::equal(
+  CATCH_REQUIRE(first.get() == second.get());
+  CATCH_REQUIRE(first->size() == second->size());
+  CATCH_REQUIRE(std::equal(
      first->begin(),
      first->end(),
     second->begin(),
@@ -276,17 +276,17 @@ TEST_CASE("Sequential/HasReferenceSemantics") {
      }));
 }
-TEST_CASE("Sequential/IsCloneable") {
+CATCH_TEST_CASE("Sequential/IsCloneable") {
   Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3));
   Sequential clone =
      std::dynamic_pointer_cast(sequential->clone());
-  REQUIRE(sequential->size() == clone->size());
+  CATCH_REQUIRE(sequential->size() == clone->size());
   for (size_t i = 0; i < sequential->size(); ++i) {
     // The modules should be the same kind (type).
-    REQUIRE(sequential[i]->name() == clone[i]->name());
+    CATCH_REQUIRE(sequential[i]->name() == clone[i]->name());
     // But not pointer-equal (distinct objects).
-    REQUIRE(sequential[i] != clone[i]);
+    CATCH_REQUIRE(sequential[i] != clone[i]);
   }
   // Verify that the clone is deep, i.e. parameters of modules are cloned too.
@@ -295,38 +295,38 @@ TEST_CASE("Sequential/IsCloneable") {
   auto params1 = sequential->parameters();
   auto params2 = clone->parameters();
-  REQUIRE(params1.size() == params2.size());
+  CATCH_REQUIRE(params1.size() == params2.size());
   for (auto& param : params1) {
-    REQUIRE(!pointer_equal(param.value, params2[param.key]));
-    REQUIRE(param->device() == params2[param.key].device());
-    REQUIRE(param->allclose(params2[param.key]));
+    CATCH_REQUIRE(!pointer_equal(param.value, params2[param.key]));
+    CATCH_REQUIRE(param->device() == params2[param.key].device());
+    CATCH_REQUIRE(param->allclose(params2[param.key]));
     param->add_(2);
   }
   for (auto& param : params1) {
-    REQUIRE(!param->allclose(params2[param.key]));
+    CATCH_REQUIRE(!param->allclose(params2[param.key]));
   }
 }
-TEST_CASE("Sequential/RegistersElementsAsSubmodules") {
+CATCH_TEST_CASE("Sequential/RegistersElementsAsSubmodules") {
   Sequential sequential(Linear(10, 3), Conv2d(1, 2, 3), FeatureDropout(0.5));
   auto modules = sequential->modules();
-  REQUIRE(modules.size() == sequential->children().size());
+  CATCH_REQUIRE(modules.size() == sequential->children().size());
-  REQUIRE(modules[0]->as());
-  REQUIRE(modules[1]->as());
-  REQUIRE(modules[2]->as());
+  CATCH_REQUIRE(modules[0]->as());
+  CATCH_REQUIRE(modules[1]->as());
+  CATCH_REQUIRE(modules[2]->as());
 }
-TEST_CASE("Sequential/CloneToDevice", "[cuda]") {
+CATCH_TEST_CASE("Sequential/CloneToDevice", "[cuda]") {
   Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3));
   torch::Device device(torch::kCUDA, 0);
   Sequential clone =
      std::dynamic_pointer_cast(sequential->clone(device));
   for (const auto& p : clone->parameters()) {
-    REQUIRE(p->device() == device);
+    CATCH_REQUIRE(p->device() == device);
   }
   for (const auto& b : clone->buffers()) {
-    REQUIRE(b->device() == device);
+    CATCH_REQUIRE(b->device() == device);
   }
 }
diff --git a/test/cpp/api/serialization.cpp b/test/cpp/api/serialization.cpp
index 354108991f42cd..fda133b8d9bf00 100644
--- a/test/cpp/api/serialization.cpp
+++ b/test/cpp/api/serialization.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
@@ -30,12 +30,12 @@ Sequential xor_model() {
 }
 } // namespace
-TEST_CASE("serialization") {
+CATCH_TEST_CASE("serialization") {
   torch::manual_seed(0);
-  SECTION("undefined") {
+  CATCH_SECTION("undefined") {
    auto x = torch::Tensor();
-    REQUIRE(!x.defined());
+    CATCH_REQUIRE(!x.defined());
    auto y = torch::randn({5});
@@ -43,10 +43,10 @@ TEST_CASE("serialization") {
    torch::save(ss, &x);
    torch::load(ss, &y);
-    REQUIRE(!y.defined());
+    CATCH_REQUIRE(!y.defined());
  }
-  SECTION("cputypes") {
+  CATCH_SECTION("cputypes") {
    for (int i = 0; i < static_cast(torch::Dtype::NumOptions); i++) {
      if (i == static_cast(torch::Dtype::Half)) {
        // XXX can't serialize half tensors at the moment since contiguous() is
@@ -69,17 +69,17 @@ TEST_CASE("serialization") {
      torch::save(ss, &x);
      torch::load(ss, &y);
-      REQUIRE(y.defined());
-      REQUIRE(x.sizes().vec() == y.sizes().vec());
+      CATCH_REQUIRE(y.defined());
+      CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
      if (torch::isIntegralType(static_cast(i))) {
-        REQUIRE(x.equal(y));
+        CATCH_REQUIRE(x.equal(y));
      } else {
-        REQUIRE(x.allclose(y));
+        CATCH_REQUIRE(x.allclose(y));
      }
    }
  }
-  SECTION("binary") {
+  CATCH_SECTION("binary") {
    auto x = torch::randn({5, 5});
    auto y = torch::Tensor();
@@ -93,11 +93,11 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
+    CATCH_REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("portable_binary") {
+  CATCH_SECTION("portable_binary") {
    auto x = torch::randn({5, 5});
    auto y = torch::Tensor();
@@ -111,12 +111,12 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("resized") {
+  CATCH_SECTION("resized") {
    auto x = torch::randn({11, 5});
    x.resize_({5, 5});
    auto y = torch::Tensor();
@@ -131,11 +131,11 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("sliced") {
+  CATCH_SECTION("sliced") {
    auto x = torch::randn({11, 5});
    x = x.slice(0, 1, 3);
    auto y = torch::Tensor();
@@ -150,12 +150,12 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("noncontig") {
+  CATCH_SECTION("noncontig") {
    auto x = torch::randn({11, 5});
    x = x.slice(1, 1, 4);
    auto y = torch::Tensor();
@@ -170,12 +170,12 @@ TEST_CASE("serialization") {
      archive(y);
    }
-    REQUIRE(y.defined());
-    REQUIRE(x.sizes().vec() == y.sizes().vec());
-    REQUIRE(x.allclose(y));
+    CATCH_REQUIRE(y.defined());
+    CATCH_REQUIRE(x.sizes().vec() == y.sizes().vec());
+    CATCH_REQUIRE(x.allclose(y));
  }
-  SECTION("xor") {
+  CATCH_SECTION("xor") {
    // We better be able to save and load a XOR model!
    auto getLoss = [](Sequential model, uint32_t batch_size) {
      auto inputs = torch::empty({batch_size, 2});
@@ -207,7 +207,7 @@ TEST_CASE("serialization") {
      optimizer.step();
      running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01;
-      REQUIRE(epoch < 3000);
+      CATCH_REQUIRE(epoch < 3000);
      epoch++;
    }
@@ -216,10 +216,10 @@ TEST_CASE("serialization") {
    torch::load(ss, model2);
    auto loss = getLoss(model2, 100);
-    REQUIRE(loss.toCFloat() < 0.1);
+    CATCH_REQUIRE(loss.toCFloat() < 0.1);
  }
-  SECTION("optim") {
+  CATCH_SECTION("optim") {
    auto model1 = Linear(5, 2);
    auto model2 = Linear(5, 2);
    auto model3 = Linear(5, 2);
@@ -235,8 +235,8 @@ TEST_CASE("serialization") {
    auto param2 = model2->parameters();
    auto param3 = model3->parameters();
    for (const auto& p : param1) {
-      REQUIRE(param1[p.key].allclose(param2[p.key]));
-      REQUIRE(param2[p.key].allclose(param3[p.key]));
+      CATCH_REQUIRE(param1[p.key].allclose(param2[p.key]));
+      CATCH_REQUIRE(param2[p.key].allclose(param3[p.key]));
    }
    // Make some optimizers with momentum (and thus state)
@@ -281,13 +281,13 @@ TEST_CASE("serialization") {
    for (const auto& p : param1) {
      const auto& name = p.key;
      // Model 1 and 3 should be the same
-      REQUIRE(param1[name].norm().toCFloat() == param3[name].norm().toCFloat());
-      REQUIRE(param1[name].norm().toCFloat() != param2[name].norm().toCFloat());
+      CATCH_REQUIRE(param1[name].norm().toCFloat() == param3[name].norm().toCFloat());
+      CATCH_REQUIRE(param1[name].norm().toCFloat() != param2[name].norm().toCFloat());
    }
  }
 }
-TEST_CASE("serialization_cuda", "[cuda]") {
+CATCH_TEST_CASE("serialization_cuda", "[cuda]") {
   torch::manual_seed(0);
   // We better be able to save and load a XOR model!
   auto getLoss = [](Sequential model, uint32_t batch_size) {
@@ -318,7 +318,7 @@ TEST_CASE("serialization_cuda", "[cuda]") {
    optimizer.step();
    running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01;
-    REQUIRE(epoch < 3000);
+    CATCH_REQUIRE(epoch < 3000);
    epoch++;
  }
@@ -327,7 +327,7 @@ TEST_CASE("serialization_cuda", "[cuda]") {
  torch::load(ss, model2);
  auto loss = getLoss(model2, 100);
-  REQUIRE(loss.toCFloat() < 0.1);
+  CATCH_REQUIRE(loss.toCFloat() < 0.1);
  model2->to(torch::kCUDA);
  ss.clear();
@@ -335,5 +335,5 @@ TEST_CASE("serialization_cuda", "[cuda]") {
  torch::load(ss, model3);
  loss = getLoss(model3, 100);
-  REQUIRE(loss.toCFloat() < 0.1);
+  CATCH_REQUIRE(loss.toCFloat() < 0.1);
 }
diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp
index f08a30d13c1e6e..57605562db1258 100644
--- a/test/cpp/api/tensor.cpp
+++ b/test/cpp/api/tensor.cpp
@@ -1,4 +1,4 @@
-#include
+#include "catch_utils.hpp"
 #include
@@ -19,12 +19,12 @@ bool almost_equal(at::Tensor left, T right, T tolerance = 1e-4) {
 }
 #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(tensor.device().type() == at::Device((device_), (index_)).type()); \
-  REQUIRE(tensor.device().index() == at::Device((device_), (index_)).index()); \
-  REQUIRE(tensor.dtype() == (type_)); \
-  REQUIRE(tensor.layout() == (layout_))
+  CATCH_REQUIRE(tensor.device().type() == at::Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(tensor.device().index() == at::Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(tensor.dtype() == (type_)); \
+  CATCH_REQUIRE(tensor.layout() == (layout_))
-TEST_CASE("Tensor/ToDtype") {
+CATCH_TEST_CASE("Tensor/ToDtype") {
   auto tensor = at::empty({3, 4});
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
@@ -39,7 +39,7 @@ TEST_CASE("Tensor/ToDtype") {
 }
 // Not currently supported.
-// TEST_CASE("Tensor/ToLayout") {
+// CATCH_TEST_CASE("Tensor/ToLayout") {
 //   auto tensor = at::empty({3, 4});
 //   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
 //
@@ -50,7 +50,7 @@ TEST_CASE("Tensor/ToDtype") {
 //   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
 // }
-TEST_CASE("Tensor/ToDevice", "[cuda]") {
+CATCH_TEST_CASE("Tensor/ToDevice", "[cuda]") {
   auto tensor = at::empty({3, 4});
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
@@ -67,7 +67,7 @@ TEST_CASE("Tensor/ToDevice", "[cuda]") {
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
 }
-TEST_CASE("Tensor/ToDeviceAndDtype", "[cuda]") {
+CATCH_TEST_CASE("Tensor/ToDeviceAndDtype", "[cuda]") {
   auto tensor = at::empty({3, 4});
   REQUIRE_TENSOR_OPTIONS(at::kCPU, -1, at::kFloat, at::kStrided);
@@ -75,119 +75,119 @@ TEST_CASE("Tensor/ToDeviceAndDtype", "[cuda]") {
   REQUIRE_TENSOR_OPTIONS(at::kCUDA, 1, at::kInt, at::kStrided);
 }
-TEST_CASE("Tensor/ToOptionsRespectsRequiresGrad") {
+CATCH_TEST_CASE("Tensor/ToOptionsRespectsRequiresGrad") {
   {
     auto tensor = torch::empty({3, 4}, at::requires_grad());
-    REQUIRE(tensor.requires_grad());
+    CATCH_REQUIRE(tensor.requires_grad());
     tensor = tensor.to(at::kDouble);
-    REQUIRE(tensor.requires_grad());
+    CATCH_REQUIRE(tensor.requires_grad());
   }
   {
     auto tensor = torch::empty({3, 4});
-    REQUIRE(!tensor.requires_grad());
+    CATCH_REQUIRE(!tensor.requires_grad());
     tensor = tensor.to(at::kDouble);
-    REQUIRE(!tensor.requires_grad());
+    CATCH_REQUIRE(!tensor.requires_grad());
   }
 }
-TEST_CASE("Tensor/ToDoesNotCopyWhenOptionsAreAllTheSame") {
+CATCH_TEST_CASE("Tensor/ToDoesNotCopyWhenOptionsAreAllTheSame") {
   auto tensor = at::empty({3, 4}, at::kFloat);
   auto hopefully_not_copy = tensor.to(at::kFloat);
-  REQUIRE(hopefully_not_copy.data() == tensor.data());
+  CATCH_REQUIRE(hopefully_not_copy.data() == tensor.data());
 }
-TEST_CASE("Tensor/ContainsCorrectValueForSingleValue") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValueForSingleValue") {
   auto tensor = at::tensor(123);
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(tensor[0].toCInt() == 123);
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(tensor[0].toCInt() == 123);
   tensor = at::tensor(123.456f);
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kFloat);
-  REQUIRE(almost_equal(tensor[0], 123.456f));
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kFloat);
+  CATCH_REQUIRE(almost_equal(tensor[0], 123.456f));
   tensor = at::tensor(123.456);
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kDouble);
-  REQUIRE(almost_equal(tensor[0], 123.456));
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kDouble);
+  CATCH_REQUIRE(almost_equal(tensor[0], 123.456));
 }
-TEST_CASE("Tensor/ContainsCorrectValuesForManyValues") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValuesForManyValues") {
   auto tensor = at::tensor({1, 2, 3});
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(exactly_equal(tensor[0], 1));
-  REQUIRE(exactly_equal(tensor[1], 2));
-  REQUIRE(exactly_equal(tensor[2], 3));
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(exactly_equal(tensor[0], 1));
+  CATCH_REQUIRE(exactly_equal(tensor[1], 2));
+  CATCH_REQUIRE(exactly_equal(tensor[2], 3));
   tensor = at::tensor({1.5, 2.25, 3.125});
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kDouble);
-  REQUIRE(almost_equal(tensor[0], 1.5));
-  REQUIRE(almost_equal(tensor[1], 2.25));
-  REQUIRE(almost_equal(tensor[2], 3.125));
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kDouble);
+  CATCH_REQUIRE(almost_equal(tensor[0], 1.5));
+  CATCH_REQUIRE(almost_equal(tensor[1], 2.25));
+  CATCH_REQUIRE(almost_equal(tensor[2], 3.125));
 }
-TEST_CASE("Tensor/ContainsCorrectValuesForManyValuesVariable") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValuesForManyValuesVariable") {
   auto tensor = torch::tensor({1, 2, 3});
-  REQUIRE(tensor.is_variable());
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(exactly_equal(tensor[0], 1));
-  REQUIRE(exactly_equal(tensor[1], 2));
-  REQUIRE(exactly_equal(tensor[2], 3));
+  CATCH_REQUIRE(tensor.is_variable());
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(exactly_equal(tensor[0], 1));
+  CATCH_REQUIRE(exactly_equal(tensor[1], 2));
+  CATCH_REQUIRE(exactly_equal(tensor[2], 3));
   tensor = torch::tensor({1.5, 2.25, 3.125});
-  REQUIRE(tensor.is_variable());
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kDouble);
-  REQUIRE(almost_equal(tensor[0], 1.5));
-  REQUIRE(almost_equal(tensor[1], 2.25));
-  REQUIRE(almost_equal(tensor[2], 3.125));
+  CATCH_REQUIRE(tensor.is_variable());
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kDouble);
+  CATCH_REQUIRE(almost_equal(tensor[0], 1.5));
+  CATCH_REQUIRE(almost_equal(tensor[1], 2.25));
+  CATCH_REQUIRE(almost_equal(tensor[2], 3.125));
 }
-TEST_CASE("Tensor/ContainsCorrectValuesWhenConstructedFromVector") {
+CATCH_TEST_CASE("Tensor/ContainsCorrectValuesWhenConstructedFromVector") {
   std::vector v = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
   auto tensor = at::tensor(v);
-  REQUIRE(tensor.numel() == v.size());
-  REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(tensor.numel() == v.size());
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
   for (size_t i = 0; i < v.size(); ++i) {
-    REQUIRE(exactly_equal(tensor[i], v.at(i)));
+    CATCH_REQUIRE(exactly_equal(tensor[i], v.at(i)));
   }
   std::vector w = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.0};
   tensor = at::tensor(w);
-  REQUIRE(tensor.numel() == w.size());
-  REQUIRE(tensor.dtype() == at::kFloat);
+  CATCH_REQUIRE(tensor.numel() == w.size());
+  CATCH_REQUIRE(tensor.dtype() == at::kFloat);
   for (size_t i = 0; i < w.size(); ++i) {
-    REQUIRE(almost_equal(tensor[i], w.at(i)));
+    CATCH_REQUIRE(almost_equal(tensor[i], w.at(i)));
   }
 }
-TEST_CASE("Tensor/UsesOptionsThatAreSupplied") {
+CATCH_TEST_CASE("Tensor/UsesOptionsThatAreSupplied") {
   auto tensor = at::tensor(123, dtype(at::kFloat)) + 0.5;
-  REQUIRE(tensor.numel() == 1);
-  REQUIRE(tensor.dtype() == at::kFloat);
-  REQUIRE(almost_equal(tensor[0], 123.5));
+  CATCH_REQUIRE(tensor.numel() == 1);
+  CATCH_REQUIRE(tensor.dtype() == at::kFloat);
+  CATCH_REQUIRE(almost_equal(tensor[0], 123.5));
   tensor = at::tensor({1.1, 2.2, 3.3}, dtype(at::kInt));
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor.dtype() == at::kInt);
-  REQUIRE(tensor.layout() == at::kStrided);
-  REQUIRE(exactly_equal(tensor[0], 1));
-  REQUIRE(exactly_equal(tensor[1], 2));
-  REQUIRE(exactly_equal(tensor[2], 3));
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor.dtype() == at::kInt);
+  CATCH_REQUIRE(tensor.layout() == at::kStrided);
+  CATCH_REQUIRE(exactly_equal(tensor[0], 1));
+  CATCH_REQUIRE(exactly_equal(tensor[1], 2));
+  CATCH_REQUIRE(exactly_equal(tensor[2], 3));
 }
-TEST_CASE("FromBlob") {
+CATCH_TEST_CASE("FromBlob") {
   std::vector v = {1, 2, 3};
   auto tensor = torch::from_blob(v.data(), v.size(), torch::kInt32);
-  REQUIRE(tensor.is_variable());
-  REQUIRE(tensor.numel() == 3);
-  REQUIRE(tensor[0].toCInt() == 1);
-  REQUIRE(tensor[1].toCInt() == 2);
-  REQUIRE(tensor[2].toCInt() == 3);
+  CATCH_REQUIRE(tensor.is_variable());
+  CATCH_REQUIRE(tensor.numel() == 3);
+  CATCH_REQUIRE(tensor[0].toCInt() == 1);
+  CATCH_REQUIRE(tensor[1].toCInt() == 2);
+  CATCH_REQUIRE(tensor[2].toCInt() == 3);
 }
diff --git a/test/cpp/api/tensor_cuda.cpp b/test/cpp/api/tensor_cuda.cpp
index 82d874e74b11b0..8f85014499d201 100644
--- a/test/cpp/api/tensor_cuda.cpp
+++ b/test/cpp/api/tensor_cuda.cpp
@@ -1,11 +1,11 @@
-#include
+#include "catch_utils.hpp"
 #include
 #include
-TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[multi-cuda]") {
+CATCH_TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[multi-cuda]") {
   auto tensor = at::tensor({1, 2, 3}, at::device({at::kCUDA, 1}));
-  REQUIRE(tensor.device().type() == at::Device::Type::CUDA);
-  REQUIRE(tensor.device().index() == 1);
+  CATCH_REQUIRE(tensor.device().type() == at::Device::Type::CUDA);
+  CATCH_REQUIRE(tensor.device().index() == 1);
 }
diff --git a/test/cpp/api/tensor_options.cpp b/test/cpp/api/tensor_options.cpp
index ab80c5f45ab396..7118a3599a5d1a 100644
--- a/test/cpp/api/tensor_options.cpp
+++ b/test/cpp/api/tensor_options.cpp
@@ -1,4 +1,4 @@
-#include "catch.hpp"
+#include "catch_utils.hpp"
 #include
@@ -14,28 +14,28 @@ using namespace at;
 // A macro so we don't lose location information when an assertion fails.
 #define REQUIRE_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(options.dtype() == (type_)); \
-  REQUIRE(options.layout() == (layout_))
+  CATCH_REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(options.dtype() == (type_)); \
+  CATCH_REQUIRE(options.layout() == (layout_))
 #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(tensor.type().scalarType() == (type_)); \
-  REQUIRE(tensor.type().layout() == (layout_))
+  CATCH_REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(tensor.type().scalarType() == (type_)); \
+  CATCH_REQUIRE(tensor.type().layout() == (layout_))
-TEST_CASE("TensorOptions/DefaultsToTheRightValues") {
+CATCH_TEST_CASE("TensorOptions/DefaultsToTheRightValues") {
   TensorOptions options;
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
 }
-TEST_CASE("TensorOptions/ReturnsTheCorrectType") {
+CATCH_TEST_CASE("TensorOptions/ReturnsTheCorrectType") {
   auto options = TensorOptions().device(kCPU).dtype(kInt).layout(kSparse);
-  REQUIRE(at::getType(options) == getNonVariableType(Backend::SparseCPU, kInt));
+  CATCH_REQUIRE(at::getType(options) == getNonVariableType(Backend::SparseCPU, kInt));
 }
-TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") {
+CATCH_TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") {
   auto options = dtype(kInt);
   REQUIRE_OPTIONS(kCPU, -1, kInt, kStrided);
@@ -52,7 +52,7 @@ TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") {
   REQUIRE_OPTIONS(kCUDA, 3, kByte, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromCPUTypes") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCPUTypes") {
   TensorOptions options;
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
@@ -69,7 +69,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCPUTypes") {
   REQUIRE_OPTIONS(kCPU, -1, kByte, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromCPUTensors") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCPUTensors") {
   auto options = empty(5, kDouble).options();
   REQUIRE_OPTIONS(kCPU, -1, kDouble, kStrided);
@@ -77,37 +77,37 @@ TEST_CASE("TensorOptions/ConstructsWellFromCPUTensors") {
   REQUIRE_OPTIONS(kCPU, -1, kByte, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromVariables") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromVariables") {
   auto options = torch::empty(5).options();
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
-  REQUIRE(!options.requires_grad());
+  CATCH_REQUIRE(!options.requires_grad());
   options = torch::empty(5, at::requires_grad()).options();
   REQUIRE_OPTIONS(kCPU, -1, kFloat, kStrided);
-  REQUIRE(!options.requires_grad());
+  CATCH_REQUIRE(!options.requires_grad());
 }
-TEST_CASE("Device/ParsesCorrectlyFromString") {
+CATCH_TEST_CASE("Device/ParsesCorrectlyFromString") {
   Device device("cpu:0");
-  REQUIRE(device == Device(kCPU, 0));
+  CATCH_REQUIRE(device == Device(kCPU, 0));
   device = Device("cpu");
-  REQUIRE(device == Device(kCPU));
+  CATCH_REQUIRE(device == Device(kCPU));
   device = Device("cuda:123");
-  REQUIRE(device == Device(kCUDA, 123));
+  CATCH_REQUIRE(device == Device(kCUDA, 123));
   device = Device("cuda");
-  REQUIRE(device == Device(kCUDA));
+  CATCH_REQUIRE(device == Device(kCUDA));
   std::vector badnesses = {
      "", "cud:1", "cuda:", "cpu::1", ":1", "3", "tpu:4", "??"};
   for (const auto& badness : badnesses) {
-    REQUIRE_THROWS(Device(badness));
+    _CATCH_REQUIRE_THROWS(Device(badness));
   }
 }
-TEST_CASE("OptionsGuard") {
+CATCH_TEST_CASE("OptionsGuard") {
   Tensor tensor;
   {
     OptionsGuard guard(TensorOptions{});
@@ -132,5 +132,5 @@ TEST_CASE("OptionsGuard") {
     tensor = torch::empty({10});
   }
   REQUIRE_TENSOR_OPTIONS(kCPU, -1, kFloat, kStrided);
-  REQUIRE(tensor.requires_grad());
+  CATCH_REQUIRE(tensor.requires_grad());
 }
diff --git a/test/cpp/api/tensor_options_cuda.cpp b/test/cpp/api/tensor_options_cuda.cpp
index ea33321b08e2f7..edeede8423c392 100644
--- a/test/cpp/api/tensor_options_cuda.cpp
+++ b/test/cpp/api/tensor_options_cuda.cpp
@@ -1,4 +1,4 @@
-#include "catch.hpp"
+#include "catch_utils.hpp"
 #include
 #include
@@ -10,18 +10,18 @@ using namespace at;
 // A macro so we don't lose location information when an assertion fails.
 #define REQUIRE_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(options.dtype() == (type_)); \
-  REQUIRE(options.layout() == (layout_))
+  CATCH_REQUIRE(options.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(options.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(options.dtype() == (type_)); \
+  CATCH_REQUIRE(options.layout() == (layout_))
 #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \
-  REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
-  REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
-  REQUIRE(tensor.type().scalarType() == (type_)); \
-  REQUIRE(tensor.type().layout() == (layout_))
+  CATCH_REQUIRE(tensor.device().type() == Device((device_), (index_)).type()); \
+  CATCH_REQUIRE(tensor.device().index() == Device((device_), (index_)).index()); \
+  CATCH_REQUIRE(tensor.type().scalarType() == (type_)); \
+  CATCH_REQUIRE(tensor.type().layout() == (layout_))
-TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") {
   auto options = CUDA(kFloat).options();
   REQUIRE_OPTIONS(kCUDA, -1, kFloat, kStrided);
@@ -41,7 +41,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") {
   REQUIRE_OPTIONS(kCUDA, 5, kFloat, kSparse);
 }
-TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") {
+CATCH_TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") {
   auto options = empty(5, device(kCUDA).dtype(kDouble)).options();
   REQUIRE_OPTIONS(kCUDA, 0, kDouble, kStrided);
@@ -66,7 +66,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") {
   }
 }
-TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") {
+CATCH_TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") {
   Tensor tensor;
   {
     OptionsGuard guard(device(kCUDA));
@@ -87,7 +87,7 @@ TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") {
   REQUIRE_TENSOR_OPTIONS(kCUDA, 0, kInt, kStrided);
 }
-TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") {
+CATCH_TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") {
   Tensor tensor;
   {
     // Check that OptionsGuard respects any active device before construction.
@@ -112,17 +112,17 @@ TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") {
   }
 }
-TEST_CASE("DeviceGuardIsMovable", "[cuda]") {
+CATCH_TEST_CASE("DeviceGuardIsMovable", "[cuda]") {
   DeviceGuard first(1);
-  REQUIRE(first.original_index() == 0);
-  REQUIRE(first.last_index() == 1);
+  CATCH_REQUIRE(first.original_index() == 0);
+  CATCH_REQUIRE(first.last_index() == 1);
   DeviceGuard second(std::move(first));
-  REQUIRE(second.original_index() == 0);
-  REQUIRE(second.last_index() == 1);
-  REQUIRE(first.original_index() == -1);
+  CATCH_REQUIRE(second.original_index() == 0);
+  CATCH_REQUIRE(second.last_index() == 1);
+  CATCH_REQUIRE(first.original_index() == -1);
   DeviceGuard third;
   third = std::move(second);
-  REQUIRE(third.original_index() == 0);
-  REQUIRE(third.last_index() == 1);
-  REQUIRE(second.original_index() == -1);
+  CATCH_REQUIRE(third.original_index() == 0);
+  CATCH_REQUIRE(third.last_index() == 1);
+  CATCH_REQUIRE(second.original_index() == -1);
 }
diff --git a/torch/csrc/jit/catch_utils.hpp b/torch/csrc/jit/catch_utils.hpp
new file mode 100644
index 00000000000000..b9b0a87990a9ce
--- /dev/null
+++ b/torch/csrc/jit/catch_utils.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+#define CATCH_CONFIG_PREFIX_ALL
+#include
+
+// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning;
+// define our own version that doesn't warn.
+#define _CATCH_REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ )
diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp
index 28bf95847d634f..3110fb2c360a35 100644
--- a/torch/csrc/jit/test_jit.cpp
+++ b/torch/csrc/jit/test_jit.cpp
@@ -1,13 +1,13 @@
 #ifdef USE_CATCH
 #define CATCH_CONFIG_MAIN
-#include "catch.hpp"
+#include "catch_utils.hpp"
 using Catch::StartsWith;
 #else
-#define REQUIRE JIT_ASSERT
+#define CATCH_REQUIRE JIT_ASSERT
 #endif
@@ -110,9 +110,9 @@ static void codeTemplateTest() {
    e.v("what",{"is","this"});
    TemplateEnv c(e);
    c.s("hi","foo2");
-    REQUIRE(e.s("hi") == "foo");
-    REQUIRE(c.s("hi") == "foo2");
-    REQUIRE(e.v("what")[0] == "is");
+    CATCH_REQUIRE(e.s("hi") == "foo");
+    CATCH_REQUIRE(c.s("hi") == "foo2");
+    CATCH_REQUIRE(e.v("what")[0] == "is");
  }
  {
@@ -126,7 +126,7 @@ static void codeTemplateTest() {
    auto s = ct.format(e);
    //std::cout << "'" << s << "'\n";
    //std::cout << "'" << ct_expect << "'\n";
-    REQUIRE(s == ct_expect);
+    CATCH_REQUIRE(s == ct_expect);
  }
 }
@@ -146,11 +146,11 @@ static void fusionTests() {
    auto b = at::rand({4,3}, at::kCUDA).transpose(0,1);
    auto o = at::zeros({3,4}, at::kCUDA);
    auto outputs = debugLaunchGraph(graph, 0, {a,b});
-    REQUIRE(outputs.size() == 1);
+    CATCH_REQUIRE(outputs.size() == 1);
    auto o2 = a*b;
    float max_diff = (o2 - outputs[0]).abs().max().toCDouble();
    //std::cout << "max diff: " << max_diff << "\n";
-    REQUIRE(max_diff == 0);
+    CATCH_REQUIRE(max_diff == 0);
  };
  testSimple();
@@ -200,10 +200,10 @@ static void fusionTests() {
    auto out0 = t16*t5;
    auto outputs = debugLaunchGraph(graph, 0, inputs);
-    REQUIRE(outputs.size() == graph.outputs().size());
-    REQUIRE(out0.is_same_size(outputs.front()));
+    CATCH_REQUIRE(outputs.size() == graph.outputs().size());
+    CATCH_REQUIRE(out0.is_same_size(outputs.front()));
    float max_diff = (outputs.front() - out0).abs().max().toCDouble();
-    REQUIRE(max_diff < 1e-6);
+    CATCH_REQUIRE(max_diff < 1e-6);
  };
  testOne(0,0,0,0);
@@ -234,12 +234,12 @@ static void fusionTests() {
    auto o_r = a*b;
    auto o2_r = at::cat({a, o_r}, dim);
    auto outputs = debugLaunchGraph(graph, 0, {a,b});
-    REQUIRE(outputs.size() == 2);
+    CATCH_REQUIRE(outputs.size() == 2);
    float max_diff = (o_r - outputs[0]).abs().max().toCDouble();
-    REQUIRE(max_diff == 0);
+    CATCH_REQUIRE(max_diff == 0);
    float max_diff2 = (o2_r - outputs[1]).abs().max().toCDouble();
-    REQUIRE(max_diff2 == 0);
+    CATCH_REQUIRE(max_diff2 == 0);
  };
  testConcat(0);
  testConcat(1);
@@ -255,58 +255,58 @@ void attributesTest() {
  auto four = attr::perm;
  Attr attr;
  attr.f_(one,3.4)->i_(two,5)->s_(three,"what");
-  REQUIRE(attr.f(one) == 3.4);
-  REQUIRE(attr.s(three) == "what");
-  REQUIRE(attr.i(two) == 5);
+  CATCH_REQUIRE(attr.f(one) == 3.4);
+  CATCH_REQUIRE(attr.s(three) == "what");
+  CATCH_REQUIRE(attr.i(two) == 5);
  attr.s_(one,"no");
-  REQUIRE(attr.s(one) == "no");
-  REQUIRE(attr.hasAttribute(three));
-  REQUIRE(!attr.hasAttribute(four));
+  CATCH_REQUIRE(attr.s(one) == "no");
+  CATCH_REQUIRE(attr.hasAttribute(three));
+  CATCH_REQUIRE(!attr.hasAttribute(four));
  attr.ss_(two, {"hi", "now"});
-  REQUIRE(attr.ss(two).at(1) == "now");
+  CATCH_REQUIRE(attr.ss(two).at(1) == "now");
  Attr attr2;
  attr2.copyAttributes(attr);
-  REQUIRE(attr2.s(one) == "no");
+  CATCH_REQUIRE(attr2.s(one) == "no");
  attr2.f_(one,5);
-  REQUIRE(attr.s(one) == "no");
-  REQUIRE(attr2.f(one) == 5);
+  CATCH_REQUIRE(attr.s(one) == "no");
+  CATCH_REQUIRE(attr2.f(one) == 5);
 }
 void internedStringsTests () {
-  REQUIRE(prim::Param == Symbol::prim("Param"));
-  REQUIRE(prim::Return == Symbol::prim("Return"));
-  REQUIRE(prim::Return.toUnqualString() == std::string("Return"));
-  REQUIRE(prim::Return.toQualString() == std::string("prim::Return"));
+  CATCH_REQUIRE(prim::Param == Symbol::prim("Param"));
+  CATCH_REQUIRE(prim::Return == Symbol::prim("Return"));
+  CATCH_REQUIRE(prim::Return.toUnqualString() == std::string("Return"));
+  CATCH_REQUIRE(prim::Return.toQualString() == std::string("prim::Return"));
  Symbol newsym = Symbol::aten("__NEW_SYMBOL");
  size_t symstart = newsym;
-  REQUIRE(newsym.toQualString() == std::string("aten::__NEW_SYMBOL"));
+  CATCH_REQUIRE(newsym.toQualString() == std::string("aten::__NEW_SYMBOL"));
  // TODO: This test is a bit too close to the implementation details.
-  REQUIRE(Symbol::aten("What") == symstart+1);
-  REQUIRE(Symbol::aten("What2") == symstart+2);
-  REQUIRE(Symbol::aten("What") == symstart+1);
-  REQUIRE(Symbol::aten("What2") == symstart+2);
-  REQUIRE(Symbol(symstart+2).toUnqualString() == std::string("What2"));
+  CATCH_REQUIRE(Symbol::aten("What") == symstart+1);
+  CATCH_REQUIRE(Symbol::aten("What2") == symstart+2);
+  CATCH_REQUIRE(Symbol::aten("What") == symstart+1);
+  CATCH_REQUIRE(Symbol::aten("What2") == symstart+2);
+  CATCH_REQUIRE(Symbol(symstart+2).toUnqualString() == std::string("What2"));
 }
 void fromQualStringTests() {
-  REQUIRE(Symbol::fromQualString("prim::Param") == Symbol::prim("Param"));
-  REQUIRE(Symbol::fromQualString("aten::mm") == Symbol::aten("mm"));
-  REQUIRE(Symbol::fromQualString("onnx::LSTM") == Symbol::onnx("LSTM"));
-  REQUIRE(Symbol::fromQualString("attr::value") == Symbol::attr("value"));
-  REQUIRE(Symbol::fromQualString("scope::") == Symbol::scope(""));
-  REQUIRE(Symbol::fromQualString("::").toUnqualString() == std::string(""));
-  REQUIRE(Symbol::fromQualString("::").ns().toQualString() == std::string("namespaces::"));
-  REQUIRE(Symbol::fromQualString("new_ns::param").toUnqualString() == std::string("param"));
-  REQUIRE(Symbol::fromQualString("new_ns::param").ns().toUnqualString() == std::string("new_ns"));
-  REQUIRE(Symbol::fromQualString("new_ns::param").ns() == Symbol::fromQualString("namespaces::new_ns"));
+  CATCH_REQUIRE(Symbol::fromQualString("prim::Param") == Symbol::prim("Param"));
+  CATCH_REQUIRE(Symbol::fromQualString("aten::mm") == Symbol::aten("mm"));
+  CATCH_REQUIRE(Symbol::fromQualString("onnx::LSTM") == Symbol::onnx("LSTM"));
+  CATCH_REQUIRE(Symbol::fromQualString("attr::value") == Symbol::attr("value"));
+  CATCH_REQUIRE(Symbol::fromQualString("scope::") == Symbol::scope(""));
+  CATCH_REQUIRE(Symbol::fromQualString("::").toUnqualString() == std::string(""));
+  CATCH_REQUIRE(Symbol::fromQualString("::").ns().toQualString() == std::string("namespaces::"));
+  CATCH_REQUIRE(Symbol::fromQualString("new_ns::param").toUnqualString() == std::string("param"));
+  CATCH_REQUIRE(Symbol::fromQualString("new_ns::param").ns().toUnqualString() == std::string("new_ns"));
+  CATCH_REQUIRE(Symbol::fromQualString("new_ns::param").ns() == Symbol::fromQualString("namespaces::new_ns"));
  auto bad_inputs = {"scope", ":", ""};
  for (auto input : bad_inputs) {
    try {
      Symbol::fromQualString(input);
-      REQUIRE(0);
+      CATCH_REQUIRE(0);
    } catch (std::runtime_error c) {
    }
  }
@@ -467,8 +467,8 @@ void interpTest() {
  std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
  //std::cout << almostEqual(outputs[0],hx) << "\n";
-  REQUIRE(exactlyEqual(outputs[0],hx));
-  REQUIRE(exactlyEqual(outputs[1],cx));
+  CATCH_REQUIRE(exactlyEqual(outputs[0],hx));
+  CATCH_REQUIRE(exactlyEqual(outputs[1],cx));
 }
 void interpStageTest() {
@@ -500,8 +500,8 @@ void interpStageTest() {
  std::tie(hx, cx) = lstm(input[0], hx, cx1, w_ih, w_hh);
  //std::cout << almostEqual(outputs[0],hx) << "\n";
-  REQUIRE(exactlyEqual(outputs[0],hx));
-  REQUIRE(exactlyEqual(outputs[1],cx));
+  CATCH_REQUIRE(exactlyEqual(outputs[0],hx));
+  CATCH_REQUIRE(exactlyEqual(outputs[1],cx));
 }
 using var_meta_type = std::vector;
@@ -554,10 +554,10 @@ variable_list grad(const variable_list& outputs, const variable_list& inputs, co
 }
 void assertAllClose(const tensor_list& a, const tensor_list& b) {
-  REQUIRE(a.size() == b.size());
+  CATCH_REQUIRE(a.size() == b.size());
  for (size_t i = 0; i < a.size(); ++i) {
-    REQUIRE(a[i].is_same_size(b[i]));
-    REQUIRE(a[i].allclose(b[i]));
+    CATCH_REQUIRE(a[i].is_same_size(b[i]));
+    CATCH_REQUIRE(a[i].allclose(b[i]));
  }
 }
@@ -654,11 +654,11 @@ void testDifferentiate(std::ostream & out) {
  std::vector expected_captured_outputs = {1};
  std::vector expected_input_vjps = {0, 1};
  std::vector expected_output_vjps = {0, 1};
-  REQUIRE(grad_spec.f_real_outputs == 1);
-  REQUIRE(grad_spec.df_input_captured_inputs == expected_captured_inputs);
-  REQUIRE(grad_spec.df_input_captured_outputs == expected_captured_outputs);
-  REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
-  REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
+  CATCH_REQUIRE(grad_spec.f_real_outputs == 1);
+  CATCH_REQUIRE(grad_spec.df_input_captured_inputs == expected_captured_inputs);
+  CATCH_REQUIRE(grad_spec.df_input_captured_outputs == expected_captured_outputs);
+  CATCH_REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
+  CATCH_REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
  out << "testDifferentiate\n";
  out << *grad_spec.f;
  out << *grad_spec.df;
@@ -684,11 +684,11 @@ void testDifferentiateWithRequiresGrad(std::ostream & out) {
  auto grad_spec = differentiate(graph);
  std::vector expected_input_vjps = {1, 2}; // for e and %4 = (d + a)
  std::vector expected_output_vjps = {0}; // only a requires grad
-  REQUIRE(grad_spec.f_real_outputs == 2); // we need one temporary %4 = (d + a)
-  REQUIRE(grad_spec.df_input_captured_inputs == std::vector({0}));
-  REQUIRE(grad_spec.df_input_captured_outputs == std::vector({2}));
-  REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
-  REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
+  CATCH_REQUIRE(grad_spec.f_real_outputs == 2); // we need one temporary %4 = (d + a)
+  CATCH_REQUIRE(grad_spec.df_input_captured_inputs == std::vector({0}));
+  CATCH_REQUIRE(grad_spec.df_input_captured_outputs == std::vector({2}));
+  CATCH_REQUIRE(grad_spec.df_input_vjps == expected_input_vjps);
+  CATCH_REQUIRE(grad_spec.df_output_vjps == expected_output_vjps);
  out << "testDifferentiateWithRequiresGrad\n";
  out << *grad_spec.f;
  out << *grad_spec.df;
@@ -718,7 +718,7 @@ bool isEqual(at::IntList lhs, at::IntList rhs) {
 }
 bool isEqual(const CompleteArgumentInfo & ti, const autograd::Variable & v) {
-  REQUIRE(ti.isTensor());
+  CATCH_REQUIRE(ti.isTensor());
  if(!ti.defined())
    return ti.defined() == v.defined();
  return
@@ -754,34 +754,34 @@ void argumentSpecTest() {
  CompleteArgumentSpec a(true, list);
  CompleteArgumentSpec b(true, list);
-  REQUIRE(a.hashCode() == b.hashCode());
+  CATCH_REQUIRE(a.hashCode() == b.hashCode());
-  REQUIRE(a == b);
+  CATCH_REQUIRE(a == b);
  CompleteArgumentSpec d(true, list2);
-  REQUIRE(d == a);
-  REQUIRE(d.hashCode() == a.hashCode());
+  CATCH_REQUIRE(d == a);
+  CATCH_REQUIRE(d.hashCode() == a.hashCode());
  for(size_t i = 0; i < list.size(); ++i) {
-    REQUIRE(isEqual(a.at(i), list[i].toTensor()));
+    CATCH_REQUIRE(isEqual(a.at(i), list[i].toTensor()));
  }
  CompleteArgumentSpec no_grad(/*with_grad=*/false, list);
-  REQUIRE(no_grad != a);
+  CATCH_REQUIRE(no_grad != a);
  std::unordered_set spec;
  spec.insert(std::move(a));
-  REQUIRE(spec.count(b) > 0);
-  REQUIRE(spec.count(no_grad) == 0);
+  CATCH_REQUIRE(spec.count(b) > 0);
+  CATCH_REQUIRE(spec.count(no_grad) == 0);
  spec.insert(std::move(no_grad));
-  REQUIRE(spec.count(CompleteArgumentSpec(true,list)) == 1);
+  CATCH_REQUIRE(spec.count(CompleteArgumentSpec(true,list)) == 1);
  list2[1].toTensor().transpose_(0,1);
  CompleteArgumentSpec c(true, list2); // same as list, except for one stride
-  REQUIRE(!(c == a));
-  REQUIRE(spec.count(c) == 0);
+  CATCH_REQUIRE(!(c == a));
+  CATCH_REQUIRE(spec.count(c) == 0);
  Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) };
  CompleteArgumentSpec with_const(true, stack);
-  REQUIRE(with_const.at(2).sizes().size() == 2);
+  CATCH_REQUIRE(with_const.at(2).sizes().size() == 2);
 }
 void testGraphExecutor() {
@@ -802,11 +802,11 @@ void testGraphExecutor() {
  GraphExecutor executor(g);
  auto stack = createStack({v(input), v(hx), v(cx), v(w_ih), v(w_hh)});
  executor.run(stack);
-  REQUIRE(stack.size() == 2);
+  CATCH_REQUIRE(stack.size() == 2);
  at::Tensor r0, r1;
  std::tie(r0, r1) = lstm(input, hx, cx, w_ih, w_hh);
-  REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0));
-  REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1));
+  CATCH_REQUIRE(almostEqual(Variable(stack[0].toTensor()).data(), r0));
+  CATCH_REQUIRE(almostEqual(Variable(stack[1].toTensor()).data(), r1));
 }
 void testBlocks(std::ostream & out) {
@@ -877,11 +877,11 @@ void testControlFlow() {
  auto run_binary = [&](const std::string & name, int64_t a, int64_t b) {
    return V(run(name, {L(a), L(b)})[0]);
  };
-  REQUIRE(2 == run_binary("if_test", 1, 2));
-  REQUIRE(3 == run_binary("if_test", 3, 2));
-  REQUIRE(2 == run_binary("if_one", 2, 3));
-  REQUIRE(2 == run_binary("if_one", 3, 2));
-  REQUIRE(256 == run_binary("while_test",2,0));
+  CATCH_REQUIRE(2 == run_binary("if_test", 1, 2));
+  CATCH_REQUIRE(3 == run_binary("if_test", 3, 2));
+  CATCH_REQUIRE(2 == run_binary("if_one", 2, 3));
+  CATCH_REQUIRE(2 == run_binary("if_one", 3, 2));
+  CATCH_REQUIRE(256 == run_binary("while_test",2,0));
 }
 void testIValue() {
@@ -939,18 +939,18 @@ void testCustomOperators() {
    RegisterOperators reg({createOperator(
        "foo::bar", [](double a, at::Tensor b) { return a + b; })});
    auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
-    REQUIRE(ops.size() == 1);
+    CATCH_REQUIRE(ops.size() == 1);
    auto& op = ops.front();
-    REQUIRE(op->schema().name == "foo::bar");
+    CATCH_REQUIRE(op->schema().name == "foo::bar");
-    REQUIRE(op->schema().arguments.size() == 2);
-    REQUIRE(op->schema().arguments[0].name == "_0");
-    REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType);
-    REQUIRE(op->schema().arguments[1].name == "_1");
-    REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType);
+    CATCH_REQUIRE(op->schema().arguments.size() == 2);
+    CATCH_REQUIRE(op->schema().arguments[0].name == "_0");
+    CATCH_REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType);
+    CATCH_REQUIRE(op->schema().arguments[1].name == "_1");
+    CATCH_REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType);
-    REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType);
+    CATCH_REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType);
    Stack stack;
    push(stack, 2.0f, autograd::make_variable(at::ones(5)));
@@ -958,7 +958,7 @@ void testCustomOperators() {
    at::Tensor output;
    pop(stack, output);
-    REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f))));
+    CATCH_REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f))));
  }
  {
    RegisterOperators reg({createOperator(
@@ -967,19 +967,19 @@ void testCustomOperators() {
    auto& ops =
        getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
-    REQUIRE(ops.size() == 1);
+    CATCH_REQUIRE(ops.size() == 1);
    auto& op = ops.front();
-    REQUIRE(op->schema().name == "foo::bar_with_schema");
+    CATCH_REQUIRE(op->schema().name == "foo::bar_with_schema");
-    REQUIRE(op->schema().arguments.size() == 2);
-    REQUIRE(op->schema().arguments[0].name == "a");
REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType); - REQUIRE(op->schema().arguments[1].name == "b"); - REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType); + CATCH_REQUIRE(op->schema().arguments.size() == 2); + CATCH_REQUIRE(op->schema().arguments[0].name == "a"); + CATCH_REQUIRE(op->schema().arguments[0].type->kind() == TypeKind::FloatType); + CATCH_REQUIRE(op->schema().arguments[1].name == "b"); + CATCH_REQUIRE(op->schema().arguments[1].type->kind() == TypeKind::DynamicType); - REQUIRE(op->schema().returns.size() == 1); - REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType); + CATCH_REQUIRE(op->schema().returns.size() == 1); + CATCH_REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType); Stack stack; push(stack, 2.0f, autograd::make_variable(at::ones(5))); @@ -987,7 +987,7 @@ void testCustomOperators() { at::Tensor output; pop(stack, output); - REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f)))); + CATCH_REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f)))); } { // Check that lists work well. @@ -999,21 +999,21 @@ void testCustomOperators() { auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - REQUIRE(ops.size() == 1); + CATCH_REQUIRE(ops.size() == 1); auto& op = ops.front(); - REQUIRE(op->schema().name == "foo::lists"); + CATCH_REQUIRE(op->schema().name == "foo::lists"); - REQUIRE(op->schema().arguments.size() == 3); - REQUIRE(op->schema().arguments[0].name == "ints"); - REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofInts())); - REQUIRE(op->schema().arguments[1].name == "floats"); - REQUIRE(op->schema().arguments[1].type->isSubtypeOf(ListType::ofFloats())); - REQUIRE(op->schema().arguments[2].name == "tensors"); - REQUIRE(op->schema().arguments[2].type->isSubtypeOf(ListType::ofTensors())); + CATCH_REQUIRE(op->schema().arguments.size() == 3); + CATCH_REQUIRE(op->schema().arguments[0].name == "ints"); + CATCH_REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofInts())); + CATCH_REQUIRE(op->schema().arguments[1].name == "floats"); + CATCH_REQUIRE(op->schema().arguments[1].type->isSubtypeOf(ListType::ofFloats())); + CATCH_REQUIRE(op->schema().arguments[2].name == "tensors"); + CATCH_REQUIRE(op->schema().arguments[2].type->isSubtypeOf(ListType::ofTensors())); - REQUIRE(op->schema().returns.size() == 1); - REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofFloats())); + CATCH_REQUIRE(op->schema().returns.size() == 1); + CATCH_REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofFloats())); Stack stack; push(stack, std::vector{1, 2}); @@ -1023,9 +1023,9 @@ void testCustomOperators() { std::vector output; pop(stack, output); - REQUIRE(output.size() == 2); - REQUIRE(output[0] == 1.0); - REQUIRE(output[1] == 2.0); + CATCH_REQUIRE(output.size() == 2); + CATCH_REQUIRE(output[0] == 1.0); + CATCH_REQUIRE(output[1] == 2.0); } { RegisterOperators reg( @@ -1034,17 +1034,17 @@ void testCustomOperators() { auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - REQUIRE(ops.size() == 1); + CATCH_REQUIRE(ops.size() == 1); auto& op = ops.front(); - REQUIRE(op->schema().name == "foo::lists2"); + CATCH_REQUIRE(op->schema().name == "foo::lists2"); - REQUIRE(op->schema().arguments.size() == 1); - REQUIRE(op->schema().arguments[0].name == "tensors"); - REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofTensors())); + CATCH_REQUIRE(op->schema().arguments.size() == 1); + 
CATCH_REQUIRE(op->schema().arguments[0].name == "tensors"); + CATCH_REQUIRE(op->schema().arguments[0].type->isSubtypeOf(ListType::ofTensors())); - REQUIRE(op->schema().returns.size() == 1); - REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofTensors())); + CATCH_REQUIRE(op->schema().returns.size() == 1); + CATCH_REQUIRE(op->schema().returns[0].type->isSubtypeOf(ListType::ofTensors())); Stack stack; push(stack, std::vector{autograd::make_variable(at::ones(5))}); @@ -1052,31 +1052,31 @@ void testCustomOperators() { std::vector output; pop(stack, output); - REQUIRE(output.size() == 1); - REQUIRE(output[0].allclose(autograd::make_variable(at::ones(5)))); + CATCH_REQUIRE(output.size() == 1); + CATCH_REQUIRE(output[0].allclose(autograd::make_variable(at::ones(5)))); } { #ifdef USE_CATCH - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(Tensor a) -> Tensor", [](double a, at::Tensor b) { return a + b; }), StartsWith("Inferred 2 argument(s) for operator implementation, " "but the provided schema specified 1 argument(s).")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(Tensor a) -> Tensor", [](double a) { return a; }), StartsWith("Inferred type for argument #0 was float, " "but the provided schema specified type Dynamic " "for the argument in that position")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(float a) -> (float, float)", [](double a) { return a; }), StartsWith("Inferred 1 return value(s) for operator implementation, " "but the provided schema specified 2 return value(s).")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( createOperator( "foo::bar_with_bad_schema(float a) -> Tensor", [](double a) { return a; }), @@ -1109,7 +1109,7 @@ void testCustomOperators() { break; } } - REQUIRE(contains_traced_op); + CATCH_REQUIRE(contains_traced_op); } { #ifdef USE_CATCH @@ -1124,7 +1124,7 @@ void testCustomOperators() { Stack stack; push(stack, std::vector{1.0}); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( op.getOperation()(stack), StartsWith("Tracing float lists currently not supported!")); #endif @@ -1156,42 +1156,42 @@ TORCH_API std::string runJITCPPTests() { #ifdef USE_CATCH -TEST_CASE( "jit test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "jit test CPU", "[cpu]" ) { std::stringstream out; - SECTION( "control flow" ) + CATCH_SECTION( "control flow" ) testControlFlow(); - SECTION( "blocks" ) + CATCH_SECTION( "blocks" ) testBlocks(out); - SECTION( "create autodiff subgraphs" ) + CATCH_SECTION( "create autodiff subgraphs" ) testCreateAutodiffSubgraphs(out); - SECTION( "differentiate" ) + CATCH_SECTION( "differentiate" ) testDifferentiate(out); - SECTION( "differentiate with requires grad" ) + CATCH_SECTION( "differentiate with requires grad" ) testDifferentiateWithRequiresGrad(out); - SECTION( "AD formulas" ) + CATCH_SECTION( "AD formulas" ) testADFormulas(); - SECTION( "code template" ) + CATCH_SECTION( "code template" ) codeTemplateTest(); - SECTION( "attributes" ) + CATCH_SECTION( "attributes" ) attributesTest(); - SECTION( "interned strings" ) + CATCH_SECTION( "interned strings" ) internedStringsTests(); - SECTION( "custom operators" ) + CATCH_SECTION( "custom operators" ) testCustomOperators(); } -TEST_CASE( "jit test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "jit test CUDA", "[cuda]" ) { - SECTION( "graph executor" ) + CATCH_SECTION( "graph executor" ) testGraphExecutor(); - SECTION( "fusion" ) + CATCH_SECTION( "fusion" ) fusionTests(); - 
SECTION( "interp" ) + CATCH_SECTION( "interp" ) interpTest(); - SECTION( "interp stage" ) + CATCH_SECTION( "interp stage" ) interpStageTest(); - SECTION( "argument spec" ) + CATCH_SECTION( "argument spec" ) argumentSpecTest(); } From c8fbeb3aa2068bb509de40067aa4f69fe25f22f2 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 18 Sep 2018 10:14:40 -0700 Subject: [PATCH 222/237] Add empty tensor tests to test_sparse (#11228) Summary: This PR adds empty sparse tensor tests to `test_sparse.py`, and also fix various places in internal code to make the tests pass. **[NOTE] API CHANGE:** - `coalesce` on sparse tensor will always be performed out-of-place now (meaning the original tensor will never be affected) Pull Request resolved: https://github.com/pytorch/pytorch/pull/11228 Differential Revision: D9755189 Pulled By: yf225 fbshipit-source-id: e9d36f437db1a132c423d3a282ff405a084ae7cc --- aten/src/ATen/native/sparse/SparseTensor.cpp | 22 +- .../ATen/native/sparse/SparseTensorMath.cpp | 18 +- .../native/sparse/cuda/SparseCUDATensor.cpp | 3 + .../native/sparse/cuda/SparseCUDATensor.cu | 10 +- .../sparse/cuda/SparseCUDATensorMath.cu | 66 +- ...arse.test_add_dense_sparse_mismatch.expect | 1 - .../TestCudaSparse.test_log1p-backward.expect | 1 - ...stCudaSparse.test_log1p-uncoalesced.expect | 1 - ...arse.test_add_dense_sparse_mismatch.expect | 1 - ...coalescedSparse.test_log1p-backward.expect | 1 - ...lescedSparse.test_log1p-uncoalesced.expect | 1 - ...arse.test_add_dense_sparse_mismatch.expect | 1 - .../TestSparse.test_log1p-backward.expect | 1 - .../TestSparse.test_log1p-uncoalesced.expect | 1 - ...TestSparseOneOff.test_cuda_from_cpu.expect | 1 - ...eOff.test_cuda_sparse_cpu_dense_add.expect | 1 - ...arse.test_add_dense_sparse_mismatch.expect | 1 - ...coalescedSparse.test_log1p-backward.expect | 1 - ...lescedSparse.test_log1p-uncoalesced.expect | 1 - test/test_sparse.py | 1125 +++++++++++------ 20 files changed, 780 insertions(+), 478 deletions(-) delete mode 100644 test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestCudaSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestCudaSparse.test_log1p-uncoalesced.expect delete mode 100644 test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect delete mode 100644 test/expect/TestSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestSparse.test_log1p-uncoalesced.expect delete mode 100644 test/expect/TestSparseOneOff.test_cuda_from_cpu.expect delete mode 100644 test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect delete mode 100644 test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect delete mode 100644 test/expect/TestUncoalescedSparse.test_log1p-backward.expect delete mode 100644 test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 25fd4fc5df4326..49efed2a1e066b 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -257,12 +257,16 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { AT_ASSERT(!self.is_variable()); AT_ASSERT(self.is_sparse()); - if (self._nnz() < 2) { - 
_get_sparse_impl(self)->set_coalesced(true); - } if (self.is_coalesced()) { return self; } + // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, + // we should keep the original tensor intact and do coalesce on a copy of the tensor + if (self._nnz() < 2) { + SparseTensor dst = self.clone(); + _get_sparse_impl(dst)->set_coalesced(true); + return dst; + } LongTensor indices = self._indices(); Tensor values = self._values().contiguous(); @@ -306,13 +310,17 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { int64_t pos = indicesPermutationAccessor[j]; int64_t curr = indicesBufferAccessor[j]; if (curr == prev) { - THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } } else { ++i; for (int64_t d = 0; d < sparseDims; d++) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } - THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); + } } prev = curr; } @@ -345,6 +353,10 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); _get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements + return r; + } + // NB: Relies on mask._nnz() == 0 test above auto mask_indices_accessor = mask_indices.accessor(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index afd8001734a9a8..ec074b5a6c8a88 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -250,18 +250,22 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } - THBlas_axpy(blockSize, 1, - t_values_ptr + t_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); + if (t_values.numel() > 0) { // We add all elements from t_values to r_values only if t_values is not an empty tensor + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + } t_i++; } if (cmp <= 0) { for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; } - THBlas_axpy(blockSize, cast_value, - s_values_ptr + s_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); + if (s_values.numel() > 0) { // We add all elements from s_values to r_values only if s_values is not an empty tensor + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); + } s_i++; } r_i++; @@ -368,6 +372,7 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { + r.resize_as_(src_); return r.zero_(); } @@ -519,7 +524,6 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D 
tensor"); AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); - AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index fc53c41b56f727..107a30f51c2a97 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -25,6 +25,9 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); + if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements + return r; + } LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index ff4b0e0c57736c..ef2da61460eb99 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -27,12 +27,16 @@ namespace at { namespace native { SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { #ifndef __HIP_PLATFORM_HCC__ int64_t nnz = self._nnz(); - if (nnz < 2) { - _get_sparse_impl(self)->set_coalesced(true); - } if (self.is_coalesced()) { return self; } + // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, + // we should keep the original tensor intact and do coalesce on a copy of the tensor + if (nnz < 2) { + SparseTensor dst = self.clone(); + _get_sparse_impl(dst)->set_coalesced(true); + return dst; + } cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index d7701479e9b243..712c28817bf6b1 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -106,37 +106,38 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT r__.transpose_(0, 1); } - /* dense */ - Tensor dense_; - char transpose_dense; - if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { - transpose_dense = 'n'; - dense_ = dense; - } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { - transpose_dense = 't'; - dense_ = dense; - } else { - transpose_dense = 't'; - dense_ = dense.contiguous(); - } - - sparse::cuda::csrmm2( - 'n', - transpose_dense, - m, - n, - k, - nnz, - cast_alpha, - values.data(), - csr.data(), - colIndicesInt.data(), - dense_.data(), - (transpose_dense == 'n' ? 
dense_.stride(1) : dense_.stride(0)), - cast_beta, - r__.data(), - r__.stride(1)); + if (nnz > 0) { + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); + } + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + } }); r_.copy_(r__); @@ -281,6 +282,10 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR int64_t nDim = dense.dim(); int64_t nDimI = sparse._sparseDims(); + if (sparse._values().numel() == 0) { + return r_; + } + if (sparse.is_coalesced()) { // TODO benchmark to decide whether to remove this special case const dim3 block = cuda::getApplyBlock(); @@ -423,6 +428,7 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons SparseTensor src = src_.coalesce(); if (src_._nnz() == 0 || t_._nnz() == 0) { + r_.resize_as_(src_); return r_.zero_(); } diff --git a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-backward.expect b/test/expect/TestCudaSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestCudaSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-backward.expect b/test/expect/TestSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-uncoalesced.expect b/test/expect/TestSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect deleted file mode 100644 index fab1614da93d4a..00000000000000 --- a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect +++ /dev/null @@ -1 +0,0 @@ -backend of indices (CUDA) must match backend of values (CPU) \ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect deleted file mode 100644 index 77b0b500f3b692..00000000000000 --- a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'other' to be a CPU tensor, but got a CUDA tensor \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect deleted file mode 100644 index b6af4e9f4280b5..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect +++ /dev/null @@ -1 +0,0 @@ -add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect deleted file mode 100644 index 8e4e1fc8c1c18f..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect +++ /dev/null @@ -1 +0,0 @@ -log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect deleted file mode 100644 index b25c0d3db02b2e..00000000000000 --- a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect +++ /dev/null @@ -1 +0,0 @@ -log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/test_sparse.py b/test/test_sparse.py index 82de45d6d17384..760879485e2b6d 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -44,7 +44,7 @@ def setUp(self): self.SparseTensor = torch.sparse.DoubleTensor super(TestSparse, self).setUp() - def _gen_sparse(self, d, nnz, with_size): + def _gen_sparse(self, sparse_dims, nnz, with_size): # TODO: Consider implementing this in the CUDA case by directly # performing the operations on the GPU. 
You won't be able to # use torch.rand/torch.randn in this case because they are @@ -54,28 +54,30 @@ def _gen_sparse(self, d, nnz, with_size): # If you do this, be sure to update assert_uncoalesced too if isinstance(with_size, Number): - with_size = [with_size] * d + with_size = [with_size] * sparse_dims if self.is_uncoalesced: # We want to generate a tensor with a lot of uncoalesced # entries to stress test whether or not we handle this # (subtle) case correctly - v_size = [nnz * 2] + list(with_size[d:]) + v_size = [nnz * 2] + list(with_size[sparse_dims:]) v = torch.randn(*v_size) - r = torch.rand(d, nnz) + r = torch.rand(sparse_dims, nnz) # Repeat the indexes, so every position shows up twice - i = torch.cat([r, r], dim=1) * \ - torch.Tensor(with_size[:d]).repeat(nnz * 2, 1).transpose(0, 1) + i = torch.cat([r, r], dim=1) + if nnz > 0: + i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz * 2, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) self.assert_uncoalesced(x) else: - # Generate a sparse tensor with d sparse dimensions; the - # rest the dimensions with_size[d:] are dense. - v_size = [nnz] + list(with_size[d:]) + # Generate a sparse tensor with sparse_dims sparse dimensions; the + # rest the dimensions with_size[sparse_dims:] are dense. + v_size = [nnz] + list(with_size[sparse_dims:]) v = torch.randn(*v_size) - i = torch.rand(d, nnz) * \ - torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1) + i = torch.rand(sparse_dims, nnz) + if nnz > 0: + i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) @@ -90,15 +92,13 @@ def assert_uncoalesced(self, x): correctness of the uncoalesced tensor generation algorithm. """ assert not x.is_coalesced() - # Strategy: construct a new sparse tensor with the raw value - # field overwritten to a tensor of ones, coalesce it, and then - # check if any value entries are > 1 (which indicates that the - # original was uncoalesced.) 
- i = x._indices().clone() - v = x._values().clone().fill_(1) - y = torch.sparse.DoubleTensor(i, v, x.size()) - z = self.safeCoalesce(y) - assert (z._values() > 1).sum() > 0 + existing_indices = set() + for i in range(x._nnz()): + index = str(x._indices()[:, i]) + if index in existing_indices: + return True + else: + existing_indices.add(index) def randn(self, *args, **kwargs): """ @@ -164,18 +164,20 @@ def test_print(self): @skipIfRocm def test_basic(self): - x, i, v = self._gen_sparse(3, 10, 100) - - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - - x, i, v = self._gen_sparse(3, 10, [100, 100, 100]) - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - self.assertEqual(x.ndimension(), 3) - self.assertEqual(self.safeCoalesce(x)._nnz(), 10) - for i in range(3): - self.assertEqual(x.size(i), 100) + def test_shape(sparse_dims, nnz, with_size): + if isinstance(with_size, Number): + with_size = [with_size] * sparse_dims + x, i, v = self._gen_sparse(sparse_dims, nnz, with_size) + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + self.assertEqual(x.ndimension(), len(with_size)) + self.assertEqual(self.safeCoalesce(x)._nnz(), nnz) + self.assertEqual(list(x.size()), with_size) + + test_shape(3, 10, 100) + test_shape(3, 10, [100, 100, 100]) + test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) # Make sure that coalesce handles duplicate indices correctly i = self.IndexTensor([[9, 0, 0, 0, 8, 1, 1, 1, 2, 7, 2, 2, 3, 4, 6, 9]]) @@ -213,6 +215,13 @@ def test_ctor_size_checks(self): @skipIfRocm def test_to_dense(self): + def test_tensor(x, res): + x.to_dense() # Tests triple to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -234,12 +243,17 @@ def test_to_dense(self): [0, 0, 0, 0, 0], [0, 0, 0, 0, 4]], ]) + test_tensor(x, res) - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + res = self.ValueTensor(3, 4, 5, 0) + test_tensor(x, res) @skipIfRocm def test_shared(self): @@ -251,8 +265,21 @@ def test_shared(self): i[0][0] = 0 self.assertEqual(self.ValueTensor([6, 0, 0]), self.safeToDense(x)) + i = self.IndexTensor([[2]]) + v = self.ValueTensor(1, 0) + x = self.SparseTensor(i, v, torch.Size([3, 0])) + i[0][0] = 0 + self.assertEqual(self.ValueTensor(3, 0), self.safeToDense(x)) + @skipIfRocm def test_to_dense_hybrid(self): + def test_tensor(x, res): + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -273,15 +300,24 @@ def test_to_dense_hybrid(self): [0, 0], [4, 5]], ]) + test_tensor(x, res) - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) + i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + ]) + v = self.ValueTensor(4, 2, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 2, 0])) + res = self.ValueTensor(3, 4, 2, 0) + test_tensor(x, res) @skipIfRocm def test_contig(self): + 
def test_tensor(x, exp_i, exp_v): + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -293,9 +329,7 @@ def test_contig(self): [31, 92, 65, 50, 34, 62, 22, 56, 74, 89], ]) exp_v = self.ValueTensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7]) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + test_tensor(x, exp_i, exp_v) i = self.IndexTensor([ [2, 0, 2, 1], @@ -310,10 +344,22 @@ def test_contig(self): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([2, 1, 3, 4]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [2, 0, 2, 1], + [0, 0, 3, 0], + [1, 0, 4, 0], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + exp_i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + exp_v = self.ValueTensor(4, 0) + test_tensor(x, exp_i, exp_v) # Duplicate indices i = self.IndexTensor([ @@ -329,13 +375,30 @@ def test_contig(self): [0, 4], ]) exp_v = self.ValueTensor([6, 4]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [0, 0, 2, 0], + [0, 0, 3, 0], + [0, 0, 4, 0], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) + exp_i = self.IndexTensor([ + [0, 2], + [0, 3], + [0, 4], + ]) + exp_v = self.ValueTensor(2, 0) + test_tensor(x, exp_i, exp_v) @skipIfRocm def test_contig_hybrid(self): + def test_tensor(x, exp_i, exp_v): + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -353,9 +416,7 @@ def test_contig_hybrid(self): [2, 3], [1, 2], [6, 7], [4, 5], [10, 11], [3, 4], [5, 6], [9, 10], [8, 9], [7, 8], ]) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + test_tensor(x, exp_i, exp_v) i = self.IndexTensor([ [2, 0, 2, 1], @@ -370,10 +431,22 @@ def test_contig_hybrid(self): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [2, 0, 2, 1], + [0, 0, 3, 0], + [1, 0, 4, 0], + ]) + v = self.ValueTensor(4, 3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) + exp_i = self.IndexTensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + exp_v = self.ValueTensor(4, 3, 0) + test_tensor(x, exp_i, exp_v) # Duplicate indices i = self.IndexTensor([ @@ -389,51 +462,79 @@ def test_contig_hybrid(self): [0, 4], ]) exp_v = self.ValueTensor([[6, 4, 5], [4, 3, 4]]) + test_tensor(x, exp_i, exp_v) - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) + i = self.IndexTensor([ + [0, 0, 2, 0], + [0, 0, 3, 0], + [0, 0, 4, 0], + ]) + v = self.ValueTensor(4, 3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) + exp_i = self.IndexTensor([ + [0, 2], + [0, 3], + [0, 4], + ]) + exp_v = self.ValueTensor(2, 3, 0) + test_tensor(x, exp_i, exp_v) @skipIfRocm def test_clone(self): - x, _, _ = self._gen_sparse(4, 20, 5) - if 
self.is_uncoalesced: - self.assertFalse(x.is_coalesced()) + def test_shape(sparse_dims, nnz, with_size): + x = self._gen_sparse(sparse_dims, nnz, with_size)[0] + if self.is_uncoalesced: + self.assertFalse(x.is_coalesced()) + y = x.clone() + self.assertFalse(y.is_coalesced()) + x = x.coalesce() + self.assertTrue(x.is_coalesced()) y = x.clone() - self.assertFalse(y.is_coalesced()) - x = x.coalesce() - self.assertTrue(x.is_coalesced()) - y = x.clone() - self.assertTrue(y.is_coalesced()) + self.assertTrue(y.is_coalesced()) + + test_shape(4, 20, 5) + test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) @cuda_only def test_cuda_empty(self): + def test_tensor(x): + y = x.cuda(0) + self.assertEqual(x._sparseDims(), y._sparseDims()) + self.assertEqual(x._denseDims(), y._denseDims()) + x = y.cpu() + self.assertEqual(y._sparseDims(), x._sparseDims()) + self.assertEqual(y._denseDims(), x._denseDims()) + x = torch.sparse.FloatTensor(2, 3, 4) - y = x.cuda(0) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) - x = y.cpu() - self.assertEqual(y._sparseDims(), x._sparseDims()) - self.assertEqual(y._denseDims(), x._denseDims()) + test_tensor(x) + + x = torch.sparse.FloatTensor(2, 3, 4, 0) + test_tensor(x) @skipIfRocm def test_transpose(self): - x = self._gen_sparse(4, 20, 5)[0] - y = self.safeToDense(x) + def test_shape(sparse_dims, nnz, with_size): + x = self._gen_sparse(sparse_dims, nnz, with_size)[0] + y = self.safeToDense(x) + + for i, j in itertools.combinations(range(4), 2): + x = x.transpose_(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - for i, j in itertools.combinations(range(4), 2): - x = x.transpose_(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + x = x.transpose(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - x = x.transpose(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + test_shape(4, 20, 5) + test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) @cpu_only def test_coalesce_transpose_mm(self): - def test_shape(di, dj, dk): - x, _, _ = self._gen_sparse(2, 20, [dj, di]) + def test_shape(di, dj, dk, nnz): + x, _, _ = self._gen_sparse(2, nnz, [dj, di]) y = torch.randn(dj, dk) x_coalesced = x.coalesce() @@ -446,43 +547,58 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x_coalesced_t), y) self.assertEqual(res, expected) - test_shape(10, 20, 30) + test_shape(10, 20, 30, 20) + test_shape(0, 20, 30, 0) + test_shape(10, 0, 30, 0) + test_shape(10, 20, 0, 0) + test_shape(10, 20, 0, 20) def test_t_empty(self): - x = self.SparseTensor(2, 3) - x.t_() - self.assertEqual(torch.Size([3, 2]), x.size()) - self.assertEqual(0, x._indices().numel()) - self.assertEqual(0, x._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + def test_in_place(x): + shape_original = x.shape + x.t_() + self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), x.size()) + self.assertEqual(0, x._indices().numel()) + self.assertEqual(0, x._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) + + def test_not_in_place(x): + shape_original = x.shape + y = x.t() + self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), y.size()) + self.assertEqual(0, y._indices().numel()) + self.assertEqual(0, y._values().numel()) + self.assertEqual(x._sparseDims(), 2) + 
self.assertEqual(x._denseDims(), 0) x = self.SparseTensor(2, 3) - y = x.t() - self.assertEqual(torch.Size([3, 2]), y.size()) - self.assertEqual(0, y._indices().numel()) - self.assertEqual(0, y._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) + test_in_place(x) + test_not_in_place(x) + + x = self.SparseTensor(2, 0) + test_in_place(x) + test_not_in_place(x) @skipIfRocm def test_add_zeros(self): - def test_shape(sparse_dims, sizes): - x, _, _ = self._gen_sparse(sparse_dims, 20, sizes) + def test_shape(sparse_dims, nnz, sizes): + x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) zeros = torch.zeros(sizes, layout=torch.sparse_coo).to(x.device) r1 = zeros + x r2 = x + zeros self.assertEqual(r1, x) self.assertEqual(r2, x) - test_shape(1, [1]) - test_shape(4, [3, 17, 19, 5]) - test_shape(2, [3, 17, 19, 5]) + test_shape(1, 20, [1]) + test_shape(4, 20, [3, 17, 19, 5]) + test_shape(2, 20, [3, 17, 19, 5]) + test_shape(2, 20, [3, 17, 19, 0]) @cpu_only def test_mm(self): - def test_shape(di, dj, dk): - x, _, _ = self._gen_sparse(2, 20, [di, dj]) + def test_shape(di, dj, dk, nnz): + x, _, _ = self._gen_sparse(2, nnz, [di, dj]) t = torch.randn(di, dk) y = torch.randn(dj, dk) alpha = random.random() @@ -500,15 +616,19 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(10, 100, 100) - test_shape(100, 1000, 200) - test_shape(64, 10000, 300) + test_shape(10, 100, 100, 20) + test_shape(100, 1000, 200, 20) + test_shape(64, 10000, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(10, 0, 100, 0) + test_shape(10, 100, 0, 0) + test_shape(10, 100, 0, 20) @cpu_only def test_saddmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] - t = self._gen_sparse(2, 20, [di, dk])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] + t = self._gen_sparse(2, nnz, [di, dk])[0] y = torch.randn(dj, dk) alpha = random.random() beta = random.random() @@ -525,43 +645,52 @@ def test_shape(di, dj, dk): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(self.safeToDense(res), expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) @skipIfRocm def test_dsmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] y = self.randn(dj, dk) res = torch.dsmm(x, y) expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) + test_shape(1000, 100, 0, 20) @skipIfRocm def test_hsmm(self): - def test_shape(di, dj, dk): - x = self._gen_sparse(2, 20, [di, dj])[0] + def test_shape(di, dj, dk, nnz): + x = self._gen_sparse(2, nnz, [di, dj])[0] y = self.randn(dj, dk) res = torch.hsmm(x, y) - # TODO: use self.safeToDense(), but this triggers - # https://github.com/pytorch/pytorch/issues/3170 - expected = torch.mm(x.to_dense(), y) + expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res.to_dense(), expected) - test_shape(7, 5, 3) - test_shape(1000, 100, 100) - test_shape(3000, 64, 
300) + test_shape(7, 5, 3, 20) + test_shape(1000, 100, 100, 20) + test_shape(3000, 64, 300, 20) + test_shape(0, 100, 100, 0) + test_shape(1000, 0, 100, 0) + test_shape(1000, 100, 0, 0) + test_shape(1000, 100, 0, 20) - def _test_spadd_shape(self, shape_i, shape_v=None): + def _test_spadd_shape(self, nnz, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x, _, _ = self._gen_sparse(len(shape_i), 10, shape) + x, _, _ = self._gen_sparse(len(shape_i), nnz, shape) y = self.randn(*shape) r = random.random() @@ -583,7 +712,7 @@ def _test_spadd_shape(self, shape_i, shape_v=None): self.assertEqual(res, expected) - x, i, v = self._gen_sparse(len(shape_i), 10, shape) + x, i, v = self._gen_sparse(len(shape_i), nnz, shape) nnz = i.size(1) # Non contiguous sparse indices tensor @@ -606,28 +735,40 @@ def _test_spadd_shape(self, shape_i, shape_v=None): @skipIfRocm def test_spadd(self): - self._test_spadd_shape([5, 6]) - self._test_spadd_shape([10, 10, 10]) - self._test_spadd_shape([50, 30, 20]) - self._test_spadd_shape([5, 5, 5, 5, 5, 5]) + self._test_spadd_shape(10, [5, 6]) + self._test_spadd_shape(10, [10, 10, 10]) + self._test_spadd_shape(10, [50, 30, 20]) + self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5]) + self._test_spadd_shape(0, [0, 30, 20]) + self._test_spadd_shape(0, [50, 0, 20]) + self._test_spadd_shape(0, [50, 30, 0]) @skipIfRocm def test_spadd_hybrid(self): - self._test_spadd_shape([5, 6], [2, 3]) - self._test_spadd_shape([10, 10, 10], [3]) - self._test_spadd_shape([50, 30, 20], [2]) - self._test_spadd_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_spadd_shape(10, [5, 6], [2, 3]) + self._test_spadd_shape(10, [10, 10, 10], [3]) + self._test_spadd_shape(10, [50, 30, 20], [2]) + self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5], [2]) + self._test_spadd_shape(0, [0, 30, 20], [2, 0]) + self._test_spadd_shape(0, [50, 0, 20], [2, 0]) + self._test_spadd_shape(0, [50, 30, 0], [2, 0]) + self._test_spadd_shape(10, [50, 30, 20], [2, 0]) @skipIfRocm def test_norm(self): - x, _, _ = self._gen_sparse(3, 10, 100) - y = x.coalesce() - self.assertEqual(x.norm(), y._values().norm()) + def test_shape(sparse_dims, nnz, with_size): + x, _, _ = self._gen_sparse(sparse_dims, nnz, with_size) + y = x.coalesce() + self.assertEqual(x.norm(), y._values().norm()) + + test_shape(3, 10, 100) + test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) + test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) - def _test_basic_ops_shape(self, shape_i, shape_v=None): + def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) - x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) + x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) + x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) y1 = x1 + x2 y2 = x1.clone() @@ -689,30 +830,49 @@ def _test_basic_ops_shape(self, shape_i, shape_v=None): @skipIfRocm def test_basic_ops(self): - self._test_basic_ops_shape([5, 6]) - self._test_basic_ops_shape([10, 10, 10]) - self._test_basic_ops_shape([50, 30, 20]) - self._test_basic_ops_shape([5, 5, 5, 5, 5, 5]) + self._test_basic_ops_shape(9, 12, [5, 6]) + self._test_basic_ops_shape(9, 12, [10, 10, 10]) + self._test_basic_ops_shape(9, 12, [50, 30, 20]) + self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5]) + self._test_basic_ops_shape(0, 12, [10, 10, 10]) + self._test_basic_ops_shape(9, 0, [10, 10, 10]) + self._test_basic_ops_shape(0, 0, [10, 10, 10]) + self._test_basic_ops_shape(0, 0, [10, 10, 0]) @skipIfRocm def test_basic_ops_hybrid(self): - 
self._test_basic_ops_shape([5, 6], [2, 3]) - self._test_basic_ops_shape([10, 10, 10], [3]) - self._test_basic_ops_shape([50, 30, 20], [2]) - self._test_basic_ops_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_basic_ops_shape(9, 12, [5, 6], [2, 3]) + self._test_basic_ops_shape(9, 12, [10, 10, 10], [3]) + self._test_basic_ops_shape(9, 12, [50, 30, 20], [2]) + self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) + self._test_basic_ops_shape(0, 12, [10, 10, 10], [2]) + self._test_basic_ops_shape(9, 0, [10, 10, 10], [2]) + self._test_basic_ops_shape(0, 0, [10, 10, 10], [2]) + self._test_basic_ops_shape(9, 12, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 12, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(9, 0, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 0, [10, 10, 10], [2, 0]) + self._test_basic_ops_shape(0, 0, [10, 10, 0], [2, 0]) @skipIfRocm def test_add_dense_sparse_mismatch(self): - x = torch.zeros([3, 4], dtype=self.value_dtype, device=self.device) - sparse_y = self.SparseTensor(torch.zeros(1, 4, dtype=torch.int64, device=self.device), - torch.randn(4, 4, 4, dtype=self.value_dtype, device=self.device), - torch.Size([3, 4, 4])) - self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) - - def _test_sparse_mask_shape(self, shape_i, shape_v=None): + def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): + x = torch.zeros(dense_size, dtype=self.value_dtype, device=self.device) + sparse_y = self.SparseTensor(torch.zeros(sparse_dims_shape, dtype=torch.int64, device=self.device), + torch.randn(dense_dims_shape, dtype=self.value_dtype, device=self.device), + torch.Size(sparse_size)) + with self.assertRaisesRegex( + RuntimeError, + "add: expected 'self' and 'other' to have same size"): + x + sparse_y + + test_shape([3, 4], [1, 4], [4, 4, 4], [3, 4, 4]) + test_shape([3, 4, 0], [1, 4], [4, 4, 4, 0], [3, 4, 4, 0]) + + def _test_sparse_mask_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) - x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) + x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) + x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) y1 = x1 + x2 y2 = x1.clone() @@ -740,87 +900,30 @@ def _test_sparse_mask_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4])) self.assertEqual(res, expected) + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor(4, 0) + x = self.SparseTensor(i, v, torch.Size([5, 4, 0])).coalesce() + dense = self.ValueTensor(5, 4, 0) + exp_v = self.ValueTensor(4, 0) + res = dense.sparse_mask(x) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 0])) + self.assertEqual(res, expected) + @skipIfRocm def test_sparse_mask(self): self._test_sparse_mask_fixed() - self._test_sparse_mask_shape([5, 6]) - self._test_sparse_mask_shape([10, 10, 10]) - self._test_sparse_mask_shape([50, 30, 20]) - self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5]) - - def _test_zeros(self, shape, out_shape_i, out_shape_v=None): - out_shape = out_shape_i + (out_shape_v or []) - for nnz in [9, 12]: - out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) - torch.zeros(*shape, out=out) - self.assertEqual(tuple(out.size()), tuple(shape)) - self.assertTrue(out._indices().numel() == out._values().numel() == 0) - self.assertEqual(out._nnz(), 0) - self.assertEqual(out._sparseDims(), len(shape)) - self.assertEqual(out._denseDims(), 0) - - @skipIfRocm - def test_log1p(self): - if self.is_cuda: - input 
= torch.cuda.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0).cuda(), - torch.FloatTensor([3, 4, 5]).cuda(), - torch.Size([3])) - else: - input = torch.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([3, 4, 5]), - torch.Size([3])) - - expected_output = torch.tensor([3., 4., 5.]).log1p_() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - - # test in-place op on uncoalesced input - self.assertExpectedRaises(RuntimeError, lambda: input.log1p_(), subname="uncoalesced") - - input.requires_grad_() - self.assertTrue(input.requires_grad) - - # test autograd - x = input.clone() - y = input.log1p() - self.assertExpectedRaises(RuntimeError, lambda: y.backward(x), subname="backward") - - # test uncoalesced input - input_uncoalesced = torch.sparse.DoubleTensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3])) - self.assertEqual(expected_output, input_uncoalesced.log1p().to_dense()) - self.assertEqual(expected_output, input_uncoalesced.coalesce().log1p_().to_dense()) - - def test_zeros(self): - i_shapes = [2, 3, 4] - v_shapes = [3, 4, 5, 6] - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros([2, 3, 4], i_shapes[:i_dim], v_shapes[:v_dim]) - - def _test_zeros_like(self, template_shape_i, template_shape_v=None): - template_shape_v = template_shape_v or [] - template_shape = template_shape_i + template_shape_v - for nnz in [9, 12]: - t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) - res = torch.zeros_like(t) - self.assertEqual(tuple(res.size()), tuple(template_shape)) - self.assertTrue(res._indices().numel() == res._values().numel() == 0) - self.assertEqual(res._nnz(), 0) - self.assertEqual(res._sparseDims(), len(template_shape_i)) - self.assertEqual(res._denseDims(), len(template_shape_v)) - - def test_zeros_like(self): - i_shapes = [2, 3, 4] - v_shapes = [3, 4, 5, 6] - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros_like(i_shapes[:i_dim], v_shapes[:v_dim]) + self._test_sparse_mask_shape(9, 12, [5, 6]) + self._test_sparse_mask_shape(9, 12, [10, 10, 10]) + self._test_sparse_mask_shape(9, 12, [50, 30, 20]) + self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10]) + self._test_sparse_mask_shape(0, 0, [10, 10, 0]) def _test_sparse_mask_hybrid_fixed(self): i = self.IndexTensor([ @@ -844,110 +947,130 @@ def _test_sparse_mask_hybrid_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) self.assertEqual(res, expected) - @skipIfRocm - def test_sparse_variable_methods(self): - # TODO: delete when tensor/variable are merged - from torch.autograd import Variable - i = self.IndexTensor([[0, 1, 1], [2, 0, 2]]) - v = self.ValueTensor([3, 4, 5]) - sparse_mat = self.SparseTensor(i, v, torch.Size([2, 3])) - sparse_var = Variable(sparse_mat) - - to_test_one_arg = { - 'zeros_like': lambda x: torch.zeros_like(x), - 'transpose': lambda x: x.transpose(0, 1), - 'transpose_': lambda x: x.transpose_(0, 1), - 't': lambda x: x.t(), - 't_': lambda x: x.t_(), - 'div': lambda x: x.div(2), - 'div_': lambda x: x.div_(2), - 'pow': lambda x: x.pow(2), - '_nnz': lambda x: x._nnz(), - 'is_coalesced': lambda x: 
x.is_coalesced(), - 'coalesce': lambda x: x.coalesce(), - 'to_dense': lambda x: x.to_dense(), - '_sparseDims': lambda x: x._sparseDims(), - '_denseDims': lambda x: x._denseDims(), - 'norm': lambda x: x.norm(), - 'log1p': lambda x: x.log1p(), - } - - for test_name, test_fn in to_test_one_arg.items(): - var1 = sparse_var.clone() - tensor1 = sparse_mat.clone() - - out_var = test_fn(var1) - out_tensor = test_fn(tensor1) - - if isinstance(out_tensor, int) or isinstance(out_tensor, bool): - if not isinstance(out_var, int) and not isinstance(out_var, bool): - check_var = out_var.data[0] - else: - check_var = out_var - self.assertEqual(out_var, out_tensor) - continue - - # Assume output is variable / tensor - self.assertEqual(test_fn(var1).data, test_fn(tensor1), - test_name) - - i = self.IndexTensor([[0, 0, 1], [1, 2, 1]]) - v = self.ValueTensor([3, 3, 4]) - sparse_mat2 = self.SparseTensor(i, v, torch.Size([2, 3])) - sparse_var2 = Variable(sparse_mat2) - - to_test_two_arg = { - 'sub': lambda x, y: x.sub(y), - 'sub_': lambda x, y: x.sub_(y), - 'mul': lambda x, y: x.mul(y), - 'mul_': lambda x, y: x.mul_(y), - } - - for test_name, test_fn in to_test_two_arg.items(): - var1 = sparse_var.clone() - var2 = sparse_var2.clone() - tensor1 = sparse_mat.clone() - tensor2 = sparse_mat2.clone() - self.assertEqual(test_fn(var1, var2).data, - test_fn(tensor1, tensor2), test_name) - - to_test_mixed = [ - # test name, lambda expression, should_run_when_cuda - ('sspaddmm', lambda sp, de: sp.sspaddmm(sp, de), False), - ('sspaddmm_b', lambda sp, de: sp.sspaddmm(2, sp, de), False), - ('sspaddmm_b_a', lambda sp, de: sp.sspaddmm(3, 2, sp, de), False), - ('addmm', lambda sp, de: de.addmm(sp, de), True), - # TODO: This looks like a typo - ('addmm_', lambda sp, de: de.addmm(sp, de), True), - ('mm', lambda sp, de: torch.mm(sp, de), True), - ('mm_out', lambda sp, de: torch.mm(sp, de, out=de), True), - ] - - i = self.IndexTensor([[0, 0, 1, 2, 2], [1, 2, 1, 0, 1]]) - v = self.ValueTensor([3, 3, 4, 1, 2]) - sparse_mat = self.SparseTensor(i, v, torch.Size([3, 3])) - sparse_var = Variable(sparse_mat) - dense_mat = sparse_mat.to_dense().random_(0, 5) - dense_var = Variable(dense_mat) - - for test_name, test_fn, test_cuda in to_test_mixed: - if sparse_var.is_cuda and not test_cuda: - continue - sp_var = sparse_var.clone() - de_var = dense_var.clone() - sp_mat = sparse_mat.clone() - de_mat = dense_mat.clone() - self.assertEqual(test_fn(sp_var, de_var).data, - test_fn(sp_mat, de_mat), test_name) + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor(4, 2, 0) + x = self.SparseTensor(i, v, torch.Size([5, 4, 2, 0])).coalesce() + dense = self.ValueTensor(5, 4, 2, 0) + res = dense.sparse_mask(x) + exp_v = self.ValueTensor(4, 2, 0) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2, 0])) + self.assertEqual(res, expected) @skipIfRocm def test_sparse_mask_hybrid(self): self._test_sparse_mask_hybrid_fixed() - self._test_sparse_mask_shape([5, 6], [2, 3]) - self._test_sparse_mask_shape([10, 10, 10], [3]) - self._test_sparse_mask_shape([50, 30, 20], [2]) - self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5], [2]) + self._test_sparse_mask_shape(9, 12, [5, 6], [2, 3]) + self._test_sparse_mask_shape(9, 12, [10, 10, 10], [3]) + self._test_sparse_mask_shape(9, 12, [50, 30, 20], [2]) + self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2]) + 
self._test_sparse_mask_shape(9, 12, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2, 0]) + self._test_sparse_mask_shape(0, 0, [10, 10, 0], [2, 0]) + + def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): + out_shape = out_shape_i + (out_shape_v or []) + for nnz in nnzs: + out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) + torch.zeros(*shape, out=out) + self.assertEqual(tuple(out.size()), tuple(shape)) + self.assertTrue(out._indices().numel() == out._values().numel() == 0) + self.assertEqual(out._nnz(), 0) + self.assertEqual(out._sparseDims(), len(shape)) + self.assertEqual(out._denseDims(), 0) + + def test_zeros(self): + def test_shape(i_shapes, v_shapes, shape, nnzs): + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros(nnzs, shape, i_shapes[:i_dim], v_shapes[:v_dim]) + test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 4], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 4], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 4], [9, 12]) + test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 0], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 0], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 0], [9, 12]) + + def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): + template_shape_v = template_shape_v or [] + template_shape = template_shape_i + template_shape_v + for nnz in nnzs: + t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) + res = torch.zeros_like(t) + self.assertEqual(tuple(res.size()), tuple(template_shape)) + self.assertTrue(res._indices().numel() == res._values().numel() == 0) + self.assertEqual(res._nnz(), 0) + self.assertEqual(res._sparseDims(), len(template_shape_i)) + self.assertEqual(res._denseDims(), len(template_shape_v)) + + def test_zeros_like(self): + def test_shape(i_shapes, v_shapes, nnzs): + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros_like(nnzs, i_shapes[:i_dim], v_shapes[:v_dim]) + test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) + test_shape([0, 3, 4], [3, 4, 5, 6], [0]) + test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + + def _test_log1p_tensor(self, input, dense_tensor): + expected_output = torch.tensor(dense_tensor).log1p_() + self.assertEqual(expected_output, input.log1p().to_dense()) + self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): + input.log1p_() + + input.requires_grad_() + self.assertTrue(input.requires_grad) + + # test autograd + x = input.clone() + y = input.log1p() + with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): + y.backward(x) + + @skipIfRocm + def test_log1p(self): + input = torch.sparse_coo_tensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([3, 4, 5]), + torch.Size([3]), + device=self.device) + self._test_log1p_tensor(input, [3., 4., 5.]) + + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([2, 3, 4, 1, 1, 1]), + torch.Size([3]), + 
device=self.device) + self._test_log1p_tensor(input_uncoalesced, [3., 4., 5.]) + + input = torch.sparse_coo_tensor( + torch.zeros([2, 0]), + torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), + device=self.device) + self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) + + input = torch.sparse_coo_tensor( + torch.zeros([1, 5]), + torch.zeros([5, 6, 0]), + torch.Size([5, 6, 0]), + device=self.device) + self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) @skipIfRocm def test_sparse_add_coalesce(self): @@ -959,30 +1082,55 @@ def test_sparse_add_coalesce(self): self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) + i = self.IndexTensor([[1, 2, 1]]) + v = self.ValueTensor(3, 0) + x = self.SparseTensor(i, v, torch.Size([3, 0])) + y = self.SparseTensor(i, v, torch.Size([3, 0])) + z = x + y + + self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) + @cuda_only def test_storage_not_null(self): x = torch.cuda.sparse.FloatTensor(2) self.assertNotEqual(x.get_device(), -1) + x = torch.cuda.sparse.FloatTensor(2, 0) + self.assertNotEqual(x.get_device(), -1) + @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @skipIfRocm def test_same_gpu(self): + def check_device(x, device_id): + self.assertEqual(x.get_device(), device_id) + self.assertEqual(x._values().get_device(), device_id) + self.assertEqual(x._indices().get_device(), device_id) + i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) x = self.SparseTensor(i, v, torch.Size([3]), device=1) - self.assertEqual(x.get_device(), 1) - self.assertEqual(x._values().get_device(), 1) - self.assertEqual(x._indices().get_device(), 1) + check_device(x, 1) + + i = self.IndexTensor([[2]]).cuda(1) + v = self.ValueTensor(1, 0).cuda(1) + x = self.SparseTensor(i, v, torch.Size([3, 0]), device=1) + check_device(x, 1) x = self.SparseTensor(3, device=1) - self.assertEqual(x.get_device(), 1) - self.assertEqual(x._values().get_device(), 1) - self.assertEqual(x._indices().get_device(), 1) + check_device(x, 1) + + x = self.SparseTensor(3, 0, device=1) + check_device(x, 1) + i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(0) self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3]))) + i = self.IndexTensor([[2]]).cuda(1) + v = self.ValueTensor(1, 0).cuda(0) + self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3, 0]))) + def _test_new_device(self, size, device): with torch.cuda.device(device): x = torch.cuda.sparse.DoubleTensor(*size) @@ -997,6 +1145,7 @@ def test_new_device_single_gpu(self): self._test_new_device((), 0) self._test_new_device((30, 20), 0) self._test_new_device((30, 20, 10), 0) + self._test_new_device((30, 20, 10, 0), 0) @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @@ -1004,49 +1153,67 @@ def test_new_device_multi_gpu(self): self._test_new_device((), 1) self._test_new_device((30, 20), 1) self._test_new_device((30, 20, 10), 1) + self._test_new_device((30, 20, 10, 0), 1) @skipIfRocm def test_new(self): - x, indices, values = self._gen_sparse(3, 10, 100) - if not x.is_cuda: - # CUDA sparse tensors currently requires the size to be - # specified if nDimV > 0 - self.assertEqual(x.new(indices, values), x) - self.assertEqual(x.new(indices, values, x.size()), x) + def test_shape(sparse_dims, nnz, with_size): + x, indices, values = self._gen_sparse(sparse_dims, nnz, with_size) + if not x.is_cuda: + # CUDA sparse tensors currently requires the 
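(Aside, not part of the patch: what the `test_sparse_add_coalesce` assertions above are getting at, sketched with concrete values.)

```
import torch

# Adding two sparse tensors may simply concatenate their entries, so the result
# can be uncoalesced; the test only requires that such a result is not falsely
# flagged as coalesced.
i = torch.tensor([[1, 2, 1]])
v = torch.tensor([3., 4., 5.])
x = torch.sparse_coo_tensor(i, v, (3,))
y = torch.sparse_coo_tensor(i, v, (3,))

z = x + y
print(z.is_coalesced())               # may be False: entries are concatenated
print(z.coalesce().to_dense())        # tensor([ 0., 16.,  8.])
```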
size to be + # specified if nDimV > 0 + self.assertEqual(x.new(indices, values), x) + self.assertEqual(x.new(indices, values, x.size()), x) + + test_shape(3, 10, 100) + test_shape(3, 0, [100, 100, 0]) @cpu_only # not really, but we only really want to run this once @skipIfRocm def test_factory(self): - default_size = torch.Size([1, 3]) - size = torch.Size([3, 3]) - for include_size in [True, False]: - for use_tensor_idx in [True, False]: - for use_tensor_val in [True, False]: - for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - values = torch.tensor([1.], dtype=dtype) if use_tensor_val else 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + for test_empty_tensor in [True, False]: + if test_empty_tensor: + default_size = torch.Size([1, 3, 0]) + size = torch.Size([3, 3, 0]) + else: + default_size = torch.Size([1, 3]) + size = torch.Size([3, 3]) + for include_size in [True, False]: + for use_tensor_idx in [True, False]: + for use_tensor_val in [True, False]: + for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + dtype = torch.float64 + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else \ + torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + if test_empty_tensor: + values = self.ValueTensor(1, 0) + else: + if use_tensor_val: + values = torch.tensor([1.], dtype=dtype) + else: + values = 1. 
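(Aside, not part of the patch: where `default_size` in `test_factory` comes from. When no size is passed, the factory infers each sparse dimension as max(index) + 1 and appends the trailing dimensions of `values`.)

```
import torch

indices = torch.tensor([[0], [2]])                     # one entry at (0, 2)

t = torch.sparse_coo_tensor(indices, torch.tensor([1.]))
print(t.size())                                        # torch.Size([1, 3])

t = torch.sparse_coo_tensor(indices, torch.empty(1, 0))
print(t.size())                                        # torch.Size([1, 3, 0]), empty dense dim
```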
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) @skipIfRocm def test_factory_size_check(self): - indices = self.IndexTensor([[1, 2], [0, 2]]) + indices = self.IndexTensor([[1, 2], + [0, 2]]) values = self.ValueTensor([.5, .5]) sizes = torch.Size([2, 3]) with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): @@ -1056,12 +1223,34 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "found negative index"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], [0, 2]]) + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 1, 0) + sizes = torch.Size([2, 3, 1, 0]) + with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + torch.sparse_coo_tensor(indices, values, sizes) + + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 2, 2) + sizes = torch.Size([0, 0, 2, 2]) + with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): + torch.sparse_coo_tensor(indices, values, sizes) + + indices = self.IndexTensor([[1, 2], + [0, 2]]) values = self.ValueTensor([[1, 1, 1], [1, 1, 1]]) sizes = torch.Size([3, 3, 2]) with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) + indices = self.IndexTensor([[1, 2], + [0, 2]]) + values = self.ValueTensor(2, 1, 0) + sizes = torch.Size([3, 3, 2, 0]) + with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): + torch.sparse_coo_tensor(indices, values, sizes) + def test_factory_default(self): tensor = self.SparseTensor() expected_indices = self.IndexTensor(1, 0) @@ -1095,25 +1284,31 @@ def test_factory_nnz(self): with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): torch.sparse_coo_tensor(indices, values, sizes) - def _test_factory_tensor_shape(self, i_shape, v_shape, size, expected_size): - device = 'cuda' if self.is_cuda else 'cpu' - if size: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) - else: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) - expected_indices = torch.empty(i_shape, device=device) - expected_values = torch.empty(v_shape, device=device) - expected_size = torch.Size(expected_size) - self.assertEqual(t._indices(), expected_indices) - self.assertEqual(t._values(), expected_values) - self.assertEqual(t.size(), expected_size) + indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) + values = self.ValueTensor(2, 0) # (nnz, ...): (2, 0) + sizes = torch.Size([2, 0]) + with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): + torch.sparse_coo_tensor(indices, values, sizes) def test_factory_nnz_zero(self): - self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) - 
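(Aside, not part of the patch: the two error paths `test_factory_size_check` exercises above, in plain form.)

```
import torch

indices = torch.tensor([[1, 2], [0, 2]])

# size too small for the given indices
try:
    torch.sparse_coo_tensor(indices, torch.tensor([.5, .5]), (2, 3))
except RuntimeError as e:
    print(e)        # "sizes is inconsistent with indices": dim 0 has index 2 but size 2

# trailing (dense) shape of values does not match the requested size
try:
    torch.sparse_coo_tensor(indices, torch.ones(2, 3), (3, 3, 2))
except RuntimeError as e:
    print(e)        # "values has incorrect size"
```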
self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) - self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) + def test_shape(i_shape, v_shape, size, expected_size): + device = 'cuda' if self.is_cuda else 'cpu' + if size: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) + else: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) + expected_indices = torch.empty(i_shape, device=device) + expected_values = torch.empty(v_shape, device=device) + expected_size = torch.Size(expected_size) + self.assertEqual(t._indices(), expected_indices) + self.assertEqual(t._values(), expected_values) + self.assertEqual(t.size(), expected_size) + + test_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) + test_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) + test_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) @skipIfRocm def test_factory_dense_dims(self): @@ -1123,6 +1318,12 @@ def test_factory_dense_dims(self): with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) + indices = self.IndexTensor([[0]]) + values = self.ValueTensor(1, 2, 3, 0) + sizes = torch.Size([1, 3, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): + torch.sparse_coo_tensor(indices, values, sizes) + @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) @@ -1132,6 +1333,13 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) + self.assertEqual(torch.float32, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) + self.assertEqual(torch.float64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.LongTensor(1, 0)) + self.assertEqual(torch.int64, t.dtype) + @cuda_only @skipIfRocm def test_factory_device_type_inference(self): @@ -1140,41 +1348,66 @@ def test_factory_device_type_inference(self): for indices_device in ['cuda', 'cpu']: for values_device in ['cuda', 'cpu']: for sparse_device in ['cuda', 'cpu', None]: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - torch.tensor([1.], device=values_device), - (1, 3), device=sparse_device) - should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') - self.assertEqual(should_be_cuda, t.is_cuda) + for test_empty_tensor in [True, False]: + if test_empty_tensor: + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + self.ValueTensor(1, 0).to(values_device), + (1, 3, 0), device=sparse_device) + else: + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + torch.tensor([1.], device=values_device), + (1, 3), device=sparse_device) + should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') + self.assertEqual(should_be_cuda, t.is_cuda) @cpu_only def test_factory_copy(self): + def test_tensor(indices, values, indices_equal, 
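(Aside, not part of the patch: `test_factory_type_inference` above boils down to the dtype being taken from `values` when it is not passed explicitly, including for empty value tensors.)

```
import torch

i = torch.tensor([[0], [2]])
print(torch.sparse_coo_tensor(i, torch.tensor([1.], dtype=torch.float32)).dtype)  # torch.float32
print(torch.sparse_coo_tensor(i, torch.tensor([1])).dtype)                        # torch.int64
print(torch.sparse_coo_tensor(i, torch.empty(1, 0, dtype=torch.float64)).dtype)   # torch.float64
```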
values_equal): + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + if indices_equal: + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + else: + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + if values_equal: + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + else: + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + # both correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float64) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, True, True) + + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.DoubleTensor(1, 0) + test_tensor(indices, values, True, True) # only indices correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float32) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, True, False) + + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.FloatTensor(1, 0) + test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 # only values correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float64) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, False, True) + + indices = torch.tensor(([0], [2]), dtype=torch.int32) + values = torch.DoubleTensor(1, 0) + test_tensor(indices, values, False, True) # neither correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float32) - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) + test_tensor(indices, values, False, False) + + indices = torch.tensor(([0], [2]), dtype=torch.int32) + values = torch.FloatTensor(1, 0) + test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 @cpu_only # just run once, we test both cpu and cuda def test_constructor_device_legacy(self): @@ -1225,9 +1458,15 @@ def test_is_sparse(self): x = torch.randn(3, 3) self.assertFalse(x.is_sparse) + x = torch.randn(3, 3, 0) + self.assertFalse(x.is_sparse) + x = self.SparseTensor() self.assertTrue(x.is_sparse) + x = self.SparseTensor(1, 0) + self.assertTrue(x.is_sparse) + @skipIfRocm def test_resize_as(self): def do_test(t): @@ -1238,6 +1477,8 @@ def do_test(t): self.assertEqual(t, t + y) do_test(self.SparseTensor()) + do_test(self.SparseTensor(3, 0)) + do_test(self.SparseTensor(3, 3)) @skipIfRocm def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @@ -1264,10 +1505,13 @@ def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @skipIfRocm def test_resize(self): - # 1. 
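(Aside, not part of the patch: the sharing behavior `test_factory_copy` asserts on CPU, sketched directly. Whether storage is reused depends on the input dtypes already matching what the factory was asked for.)

```
import torch

indices = torch.tensor(([0], [2]), dtype=torch.int64)
values = torch.tensor([1.], dtype=torch.float64)

t = torch.sparse_coo_tensor(indices, values, dtype=torch.float64)
print(indices.data_ptr() == t._indices().data_ptr())   # True: int64 indices are reused
print(values.data_ptr() == t._values().data_ptr())     # True: float64 values are reused

values32 = torch.tensor([1.], dtype=torch.float32)
t = torch.sparse_coo_tensor(indices, values32, dtype=torch.float64)
print(values32.data_ptr() == t._values().data_ptr())   # False: converted, hence copied
```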
Increase the size of some dense dimensions [Supported] + # 1. Expand the size of some dense dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 4], [2, 2, 4]) + self._test_resize_shape([1, 1], [1, 2, 0], [2, 2, 0], + [1, 1], [1, 2, 4], [2, 2, 4]) + # 2. Expand the size of some sparse dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3], [4, 2, 3]) @@ -1276,11 +1520,18 @@ def test_resize(self): self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], [2, 0], [0, 2, 4, 5], [1, 1, 2, 4, 5]) + self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], + [2, 0], [0, 2, 4, 0], [1, 1, 2, 4, 0]) + # 4. Add dims to dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3, 4], [2, 2, 3, 4]) + with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): + self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], + [1, 1], [1, 2, 3, 0], [2, 2, 3, 0]) + # 5. Remove dims from dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], @@ -1301,6 +1552,10 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 2], [2, 2, 2]) + with self.assertRaisesRegex(RuntimeError, "shrinking the size of dense dimensions"): + self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], + [1, 1], [1, 2, 0], [2, 2, 0]) + def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) @@ -1308,6 +1563,8 @@ def test_is_nonzero(self): self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (0., 0.), (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero()) self.assertTrue(torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, []).is_nonzero()) # scalar sparse tensor + with self.assertRaisesRegex(RuntimeError, "bool value of Tensor with no values is ambiguous"): + torch.sparse_coo_tensor(([0, 1],), self.ValueTensor(2, 0), (4, 0)).is_nonzero() class TestUncoalescedSparse(TestSparse): @@ -1338,11 +1595,26 @@ class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm def test_cuda_from_cpu(self): - self.assertExpectedRaises( - RuntimeError, - lambda: torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4), - [3, 4, 4])) + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4), + [3, 4, 4]) + + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4, 0), + [3, 4, 4, 0]) + + with self.assertRaisesRegex( + RuntimeError, + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), + torch.randn(0, 4, 4, 0), + [0, 4, 4, 0]) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm @@ -1351,7 +1623,22 @@ def test_cuda_sparse_cpu_dense_add(self): sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4).cuda(), [3, 4, 4]) - 
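(Aside, not part of the patch: the `test_is_nonzero` assertions above amount to the following, consistent with duplicate entries being summed before the check.)

```
import torch

print(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero())            # True
print(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero())  # False: entries cancel
```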
self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y + + x = torch.zeros(3, 4, 4, 0) + sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4, 0).cuda(), + [3, 4, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y + + x = torch.zeros(0, 4, 4, 0) + sparse_y = torch.cuda.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), + torch.randn(0, 4, 4, 0).cuda(), + [0, 4, 4, 0]) + with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): + x + sparse_y if __name__ == '__main__': From 6073f3073e71d5e05bbe537a32b363917dcf5dfc Mon Sep 17 00:00:00 2001 From: Peter Goldsborough Date: Tue, 18 Sep 2018 10:15:51 -0700 Subject: [PATCH 223/237] Document torch::nn::init (#11778) Summary: Doc fixes and documentation for `torch::nn::init`. ebetica soumith ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/11778 Differential Revision: D9886648 Pulled By: goldsborough fbshipit-source-id: 22eb78add1dc32b92cc32253683ab3d746505a64 --- torch/csrc/api/include/torch/nn/init.h | 50 ++++++++++++++++++- torch/csrc/api/include/torch/nn/module.h | 8 +-- torch/csrc/api/include/torch/nn/modules/any.h | 6 +++ torch/csrc/api/src/nn/init.cpp | 6 +-- 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/torch/csrc/api/include/torch/nn/init.h b/torch/csrc/api/include/torch/nn/init.h index 9054e8446f3e0c..0fd68df5bbd0c8 100644 --- a/torch/csrc/api/include/torch/nn/init.h +++ b/torch/csrc/api/include/torch/nn/init.h @@ -6,16 +6,64 @@ namespace torch { namespace nn { namespace init { +/// Fills the given `tensor` with the provided `value` in-place, and returns it. +/// No gradient will be recorded for this operation. Tensor constant_(Tensor tensor, Scalar value); + +/// Fills the given `tensor` with the Dirac delta function in-place, and returns +/// it. No gradient will be recorded for this operation. Tensor dirac_(Tensor tensor); -Tensor eye_(Tensor tensor); + +/// Fills the given 2-dimensional `matrix` with an identity matrix. +/// No gradient will be recorded for this operation. +Tensor eye_(Tensor matrix); + +/// Fills the given 2-dimensional `matrix` with values drawn from a normal +/// distribution parameterized by `mean` and `std`. +/// No gradient will be recorded for this operation. Tensor normal_(Tensor tensor, double mean = 0, double std = 1); + +/// Fills the given `tensor` with ones. +/// No gradient will be recorded for this operation. Tensor ones_(Tensor tensor); + +/// Fills the input `Tensor` with a (semi) orthogonal matrix, as described in +/// "Exact solutions to the nonlinear dynamics of learning in deep linear neural +/// networks" - Saxe, A. et al. (2013). The input tensor must have at least 2 +/// dimensions, and for tensors with more than 2 dimensions the trailing +/// dimensions are flattened. +/// No gradient will be recorded for this operation. Tensor orthogonal_(Tensor tensor, double gain = 1.0); + +/// Fills the 2D input `Tensor` as a sparse matrix, where the +/// non-zero elements will be drawn from a centered normal distribution +/// with the given standard deviation `std`, as described in "Deep learning via +/// Hessian-free optimization" - Martens, J. (2010). 
The `sparsity` is a real +/// value between 0 and 1 that controls the fraction of elements in each column +/// to be set to zero. +/// No gradient will be recorded for this operation. Tensor sparse_(Tensor tensor, double sparsity, double std = 0.01); + +/// Fills the given 2-dimensional `matrix` with values drawn from a uniform +/// distribution parameterized by `low` and `high`. +/// No gradient will be recorded for this operation. Tensor uniform_(Tensor tensor, double low = 0, double high = 1); + +/// Fills the input `Tensor` with values according to the method +/// described in "Understanding the difficulty of training deep feedforward +/// neural networks" - Glorot, X. & Bengio, Y. (2010). Values are scaled by the +/// `gain` parameter. No gradient will be recorded for this operation. Tensor xavier_normal_(Tensor tensor, double gain = 1.0); + +/// Fills the input `Tensor` with values according to the method +/// described in "Understanding the difficulty of training deep feedforward +/// neural networks" - Glorot, X. & Bengio, Y. (2010), using a uniform +/// distribution. Values are scaled by the `gain` parameter +/// No gradient will be recorded for this operation. Tensor xavier_uniform_(Tensor tensor, double gain = 1.0); + +/// Fills the given `tensor` with zeros. +/// No gradient will be recorded for this operation. Tensor zeros_(Tensor tensor); } // namespace init diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index 2cb71e3470574e..3dd730fcca5c68 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -260,7 +260,7 @@ class Module { /// methods such as `parameters()`, `clone()` or `to().` /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// weight_ = register_parameter("weight", torch::randn({A, B})); /// } @@ -277,7 +277,7 @@ class Module { /// to methods such as `buffers()`, `clone()` or `to(). /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// mean_ = register_buffer("mean", torch::empty({num_features_})); /// } @@ -290,7 +290,7 @@ class Module { /// `clone()` or `to()`. /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); /// } @@ -308,7 +308,7 @@ class Module { /// `clone()` or `to()`. /// /// \rst - /// .. code-block: cpp + /// .. code-block:: cpp /// MyModule::MyModule() { /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); /// } diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 512e6d2cc1c5af..1544ce31a835f9 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -184,11 +184,13 @@ class AnyModule { bool is_empty() const noexcept; private: + /// \internal /// The static type of the object we store in the `AnyModule`, which erases /// the actual type, but allows us to call `forward()` on the underlying /// module. struct Placeholder; + /// \internal /// The dynamic type of the object stored in the `AnyModule`. It contains the /// concrete instance to which all calls are forwarded. 
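(Aside, not part of the patch: the functions documented above are the C++ counterparts of Python's `torch.nn.init`; for readers who know the Python API, the same initializers look like this.)

```
import torch
from torch import nn

w = torch.empty(3, 5)
nn.init.xavier_uniform_(w, gain=1.0)        # Glorot & Bengio (2010), uniform variant
nn.init.orthogonal_(torch.empty(4, 4))      # Saxe et al. (2013)
nn.init.sparse_(torch.empty(10, 10), sparsity=0.9, std=0.01)
nn.init.constant_(torch.empty(4), 0.5)
nn.init.eye_(torch.empty(3, 3))             # 2-D tensors only
```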
It is parameterized /// over the concrete type of the module, and the types of the arguments the @@ -289,6 +291,7 @@ class AnyModule::Value { explicit Value(autograd::Variable variable) : Value(Tensor(std::move(variable))) {} + /// \internal /// The static type of the object we store in the `Value`, which erases the /// actual object's type, allowing us only to check the `type_info` of the /// type stored in the dynamic type. @@ -299,6 +302,7 @@ class AnyModule::Value { const std::type_info& type_info; }; + /// \internal /// The dynamic type of the object we store in the `Value`, which hides the /// actual object we have erased in this `Value`. template @@ -337,6 +341,7 @@ struct AnyModule::Placeholder : public AnyModule::Value::Placeholder { template struct AnyModule::Holder : public AnyModule::Placeholder { + /// \internal struct CheckedGetter { template decay_t&& operator()(size_t index) { @@ -356,6 +361,7 @@ struct AnyModule::Holder : public AnyModule::Placeholder { std::vector& arguments_; }; + /// \internal struct InvokeForward { template Value operator()(Ts&&... ts) { diff --git a/torch/csrc/api/src/nn/init.cpp b/torch/csrc/api/src/nn/init.cpp index 1afc1858b2aa99..275e9639abf152 100644 --- a/torch/csrc/api/src/nn/init.cpp +++ b/torch/csrc/api/src/nn/init.cpp @@ -69,11 +69,11 @@ Tensor dirac_(Tensor tensor) { return tensor; } -Tensor eye_(Tensor tensor) { +Tensor eye_(Tensor matrix) { NoGradGuard guard; AT_CHECK( - tensor.ndimension() == 2, "Only tensors with 2 dimensions are supported"); - return torch::eye_out(tensor, tensor.size(0), tensor.size(1)); + matrix.ndimension() == 2, "Only tensors with 2 dimensions are supported"); + return torch::eye_out(matrix, matrix.size(0), matrix.size(1)); } Tensor normal_(Tensor tensor, double mean, double std) { From 98aebed88ea85bfb76e97cc0948a326f2a32ccba Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Tue, 18 Sep 2018 10:40:45 -0700 Subject: [PATCH 224/237] Refactor tests part 1 (#11350) Summary: Followup to [the serialized test framework](https://github.com/pytorch/pytorch/pull/10594) Round 1 for refactoring tests, starting alphabetically. I added some functionality, so I wanted to send out some of these initial changes sooner. I'm skipping all tests that don't explicitly call assertReferenceChecks. Some tests directly call np.allclose, and others are simply TestCase (rather than HypothesisTestCase). 1. Start alphabetically producing serialized outputs for test functions, annotating those we want to include with `serialized_test_util.given`. So far I've only added one test per operator, but this already does seem to add quite a few tests. 2. Add functionality to allow us to generate outputs using pytest by adding pytest argument options. This allows us to skip adding a `__main__` function to quite a few tests. 3. Catch any exceptions generating the gradient operator and skip serializing/reading it, since certain operators don't have gradients. 4. Add functionality to better handle jagged array inputs, which numpy doesn't handle very well. We simply explicitly do the conversion to dtype=object. 5. Make only one file per test function, rather than 4, to reduce the number of files in the github repo. I also noticed that there is some hypothesis handling that makes `serialized_test_util.given` not compatible with adding more hypothesis decorators on top. For example, there are tests that do ``` settings(...) given(...) def test_my_stuff(...) 
``` But there is a hypothesis handler that explicitly checks that `given` is called below `settings`, so we cannot refactor this to `serialized_test_util.given`. I've just avoided decorating these kinds of tests for now, I hope that's alright. Pull Request resolved: https://github.com/pytorch/pytorch/pull/11350 Reviewed By: houseroad Differential Revision: D9693857 Pulled By: ajyu fbshipit-source-id: a9b4279afbe51c90cf2025c5ac6b2db2111f4af7 --- .../operator_test/activation_ops_test.py | 11 +- caffe2/python/operator_test/adadelta_test.py | 7 +- caffe2/python/operator_test/adagrad_test.py | 10 +- .../operator_test/affine_channel_op_test.py | 16 +- caffe2/python/operator_test/arg_ops_test.py | 13 +- .../operator_test/batch_box_cox_test.py | 5 +- .../operator_test/batch_bucketize_op_test.py | 7 +- .../operator_test/batch_moments_op_test.py | 15 +- .../batch_sparse_to_dense_op_test.py | 10 +- .../operator_test/bbox_transform_test.py | 6 +- .../python/operator_test/boolean_mask_test.py | 14 +- .../operator_test/boolean_unmask_test.py | 11 +- .../box_with_nms_limit_op_test.py | 14 +- caffe2/python/operator_test/ceil_op_test.py | 7 +- .../channel_backprop_stats_op_test.py | 10 +- .../operator_test/channel_shuffle_test.py | 9 +- .../operator_test/channel_stats_op_test.py | 10 +- caffe2/python/operator_test/clip_op_test.py | 5 +- .../operator_test/clip_tensor_op_test.py | 6 +- ...nd_distribute_fpn_rpn_proposals_op_test.py | 27 +-- .../operator_test/concat_split_op_test.py | 17 +- .../python/operator_test/conditional_test.py | 10 +- caffe2/python/operator_test/conftest.py | 39 ++++ caffe2/python/operator_test/conv_test.py | 21 ++- .../cosine_embedding_criterion_op_test.py | 5 +- .../ctc_beam_search_decoder_op_test.py | 6 +- .../ctc_greedy_decoder_op_test.py | 7 +- .../python/operator_test/distance_op_test.py | 13 +- .../python/operator_test/dropout_op_test.py | 5 +- .../elementwise_linear_op_test.py | 5 +- .../elementwise_logical_ops_test.py | 12 +- .../elementwise_op_broadcast_test.py | 5 +- caffe2/python/operator_test/expand_op_test.py | 5 +- .../python/operator_test/fc_operator_test.py | 5 +- .../python/operator_test/filler_ops_test.py | 18 +- caffe2/python/operator_test/find_op_test.py | 14 +- .../operator_test/flexible_top_k_test.py | 13 +- caffe2/python/operator_test/floor_op_test.py | 9 +- .../python/operator_test/gather_ops_test.py | 9 +- .../operator_test/gather_ranges_op_test.py | 9 +- caffe2/python/operator_test/glu_op_test.py | 7 +- .../operator_test/group_norm_op_test.py | 19 +- caffe2/python/operator_test/gru_test.py | 5 +- .../operator_test/hyperbolic_ops_test.py | 9 +- .../operator_test/index_hash_ops_test.py | 6 +- .../operator_test/instance_norm_test.py | 6 +- .../operator_test/integral_image_ops_test.py | 9 +- caffe2/python/operator_test/jsd_ops_test.py | 6 +- .../operator_test/layer_norm_op_test.py | 11 +- .../learning_rate_adaption_op_test.py | 9 +- .../operator_test/learning_rate_op_test.py | 5 +- .../operator_test/length_split_op_test.py | 5 +- .../operator_test/lengths_pad_op_test.py | 5 +- .../operator_test/lengths_tile_op_test.py | 5 +- .../operator_test/lengths_top_k_ops_test.py | 6 +- .../listwise_l2r_operator_test.py | 1 + .../locally_connected_op_test.py | 5 +- caffe2/python/operator_test/loss_ops_test.py | 5 +- .../margin_ranking_criterion_op_test.py | 11 +- caffe2/python/operator_test/math_ops_test.py | 5 +- caffe2/python/operator_test/matmul_op_test.py | 9 +- caffe2/python/operator_test/mean_op_test.py | 9 +- .../operator_test/merge_id_lists_op_test.py | 13 +- 
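(Aside, not part of the patch: what "conversion to dtype=object" for jagged inputs means in practice; the actual helper in `serialized_test_util` may differ in details.)

```
import numpy as np

# Rows of different lengths cannot form a rectangular ndarray, so they are
# stored as a 1-D object array whose elements are the individual rows.
rows = [[1, 2, 3], [4, 5]]
jagged = np.empty(len(rows), dtype=object)
for k, row in enumerate(rows):
    jagged[k] = np.asarray(row)
print(jagged.shape, jagged.dtype)            # (2,) object
```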
.../python/operator_test/moments_op_test.py | 7 +- .../python/operator_test/momentum_sgd_test.py | 7 +- .../operator_test/negate_gradient_op_test.py | 13 +- .../operator_test/numpy_tile_op_test.py | 5 +- .../python/operator_test/one_hot_ops_test.py | 11 +- .../python/operator_test/onnx_while_test.py | 15 +- caffe2/python/operator_test/pack_ops_test.py | 5 +- .../pack_rnn_sequence_op_test.py | 7 +- caffe2/python/operator_test/pad_test.py | 14 +- .../piecewise_linear_transform_test.py | 9 +- .../python/operator_test/weighted_sum_test.py | 2 +- caffe2/python/serialized_test/README.md | 14 +- .../activation_ops_test.test_elu.zip | Bin 0 -> 1192 bytes .../activation_ops_test.test_leaky_relu.zip | Bin 0 -> 1200 bytes .../activation_ops_test.test_relu.zip | Bin 0 -> 1166 bytes .../activation_ops_test.test_relu_n.zip | Bin 0 -> 1184 bytes .../adadelta_test.test_adadelta.zip | Bin 0 -> 1167 bytes ...adelta_test.test_sparse_adadelta_empty.zip | Bin 0 -> 1209 bytes .../adagrad_test.test_adagrad.zip | Bin 0 -> 1121 bytes ...est.test_row_wise_sparse_adagrad_empty.zip | Bin 0 -> 1161 bytes ...adagrad_test.test_sparse_adagrad_empty.zip | Bin 0 -> 1154 bytes ...channel_op_test.test_affine_channel_2d.zip | Bin 0 -> 1299 bytes .../arg_ops_test.test_argmax.zip | Bin 0 -> 1061 bytes .../arg_ops_test.test_argmin.zip | Bin 0 -> 1061 bytes .../batch_box_cox_test.test_batch_box_cox.zip | Bin 0 -> 1116 bytes ...e_op_test.test_batch_bucketize_example.zip | Bin 0 -> 1265 bytes ..._moments_op_test.test_batch_moments_2d.zip | Bin 0 -> 1246 bytes ...nse_op_test.test_batch_sparse_to_dense.zip | Bin 0 -> 1501 bytes ...box_transform_test.test_bbox_transform.zip | Bin 0 -> 1254 bytes .../boolean_mask_test.test_boolean_mask.zip | Bin 0 -> 1077 bytes ...k_test.test_sequence_mask_with_lengths.zip | Bin 0 -> 1337 bytes .../boolean_unmask_test.test.zip | Bin 0 -> 1092 bytes ...box_with_nms_limit_op_test.test_simple.zip | Bin 0 -> 1238 bytes .../operator_test/ceil_op_test.test_ceil.zip | Bin 0 -> 1029 bytes ...stats_op_test.testChannelBackpropStats.zip | Bin 0 -> 1568 bytes ...nnel_shuffle_test.test_channel_shuffle.zip | Bin 0 -> 1241 bytes ...channel_stats_op_test.testChannelStats.zip | Bin 0 -> 1275 bytes .../operator_test/clip_op_test.test_clip.zip | Bin 0 -> 1208 bytes ...or_op_test.test_clip_tensor_by_scaling.zip | Bin 0 -> 1174 bytes ...roposals_op_test.test_collect_and_dist.zip | Bin 0 -> 43168 bytes .../concat_split_op_test.test_concat.zip | Bin 0 -> 1246 bytes .../concat_split_op_test.test_split.zip | Bin 0 -> 1226 bytes ...at_split_op_test.test_split_by_lengths.zip | Bin 0 -> 1233 bytes .../conditional_test.test_conditional.zip | Bin 0 -> 3409 bytes .../operator_test/conv_test.test_1x1_conv.zip | Bin 0 -> 1632 bytes ...p_test.test_cosine_embedding_criterion.zip | Bin 0 -> 1355 bytes ...r_op_test.test_ctc_beam_search_decoder.zip | Bin 0 -> 1101 bytes ...ecoder_op_test.test_ctc_greedy_decoder.zip | Bin 0 -> 1133 bytes .../distance_op_test.test_dot_product.zip | Bin 0 -> 1199 bytes .../dropout_op_test.test_dropout_is_test.zip | Bin 0 -> 1215 bytes .../elementwise_linear_op_test.test.zip | Bin 0 -> 1280 bytes ...ise_logical_ops_test.test_is_member_of.zip | Bin 0 -> 1050 bytes ...lementwise_logical_ops_test.test_where.zip | Bin 0 -> 1072 bytes ..._op_broadcast_test.test_broadcast_powt.zip | Bin 0 -> 3427 bytes .../expand_op_test.test_expand_rand_shape.zip | Bin 0 -> 1191 bytes .../fc_operator_test.test_fc.zip | Bin 0 -> 1219 bytes ...r_ops_test.test_diagonal_fill_op_float.zip | Bin 0 -> 1057 bytes 
...iller_ops_test.test_lengths_range_fill.zip | Bin 0 -> 1054 bytes .../operator_test/find_op_test.test_find.zip | Bin 0 -> 1051 bytes ...lexible_top_k_test.test_flexible_top_k.zip | Bin 0 -> 1258 bytes .../floor_op_test.test_floor.zip | Bin 0 -> 1030 bytes .../gather_ops_test.test_batch_gather_ops.zip | Bin 0 -> 1234 bytes .../gather_ops_test.test_gather_ops.zip | Bin 0 -> 1853 bytes ...ther_ranges_op_test.test_gather_ranges.zip | Bin 0 -> 1103 bytes ...anges_op_test.test_gather_ranges_split.zip | Bin 0 -> 1106 bytes .../glu_op_test.test_glu_old.zip | Bin 0 -> 1080 bytes .../group_norm_op_test.test_group_norm_2d.zip | Bin 0 -> 1433 bytes .../gru_test.test_gru_unit_op.zip | Bin 0 -> 1628 bytes .../hyperbolic_ops_test.test_cosh.zip | Bin 0 -> 1163 bytes .../hyperbolic_ops_test.test_sinh.zip | Bin 0 -> 1162 bytes .../hyperbolic_ops_test.test_tanh.zip | Bin 0 -> 1162 bytes ...ndex_hash_ops_test.test_index_hash_ops.zip | Bin 0 -> 1075 bytes ...est.test_instance_norm_reference_check.zip | Bin 0 -> 3032 bytes ..._test.test_integral_image_gradient_ops.zip | Bin 0 -> 1747 bytes ...image_ops_test.test_integral_image_ops.zip | Bin 0 -> 1608 bytes .../jsd_ops_test.test_bernoulli_jsd.zip | Bin 0 -> 1314 bytes ...r_norm_op_test.test_layer_norm_grad_op.zip | Bin 0 -> 1128 bytes ...earning_rate_adaption_op_normalization.zip | Bin 0 -> 1107 bytes ...te_op_test.test_alter_learning_rate_op.zip | Bin 0 -> 1135 bytes ...h_split_op_test.test_length_split_edge.zip | Bin 0 -> 1093 bytes .../lengths_pad_op_test.test_lengths_pad.zip | Bin 0 -> 1114 bytes ...lengths_tile_op_test.test_lengths_tile.zip | Bin 0 -> 1207 bytes ...s_top_k_ops_test.test_lengths_top_k_op.zip | Bin 0 -> 1240 bytes .../locally_connected_op_test.test_lc_2d.zip | Bin 0 -> 1277 bytes .../loss_ops_test.test_averaged_loss.zip | Bin 0 -> 1179 bytes ..._op_test.test_margin_ranking_criterion.zip | Bin 0 -> 1398 bytes .../operator_test/math_ops_test.test_sign.zip | Bin 0 -> 1027 bytes .../matmul_op_test.test_batch_matmul.zip | Bin 0 -> 1393 bytes .../matmul_op_test.test_matmul.zip | Bin 0 -> 1396 bytes ...matmul_op_test.test_numpy_batch_matmul.zip | Bin 0 -> 1118 bytes .../operator_test/mean_op_test.test_mean.zip | Bin 0 -> 1165 bytes ...d_lists_op_test.test_merge_id_lists_op.zip | Bin 0 -> 1122 bytes .../moments_op_test.test_moments.zip | Bin 0 -> 1268 bytes .../momentum_sgd_test.test_momentum_sgd.zip | Bin 0 -> 1187 bytes ...ntum_sgd_test.test_sparse_momentum_sgd.zip | Bin 0 -> 1182 bytes .../negate_gradient_op_test.test_forward.zip | Bin 0 -> 1161 bytes .../numpy_tile_op_test.test_numpy_tile.zip | Bin 0 -> 1081 bytes ...ops_test.test_batch_bucketized_one_hot.zip | Bin 0 -> 1099 bytes .../one_hot_ops_test.test_batch_one_hot.zip | Bin 0 -> 1071 bytes .../one_hot_ops_test.test_one_hot.zip | Bin 0 -> 1070 bytes .../one_hot_ops_test.test_segment_one_hot.zip | Bin 0 -> 1090 bytes .../onnx_while_test.test_onnx_while_fibb.zip | Bin 0 -> 1472 bytes ...ops_test.test_pack_with_max_length_ops.zip | Bin 0 -> 1341 bytes ...sequence_op_test.test_pack_rnn_seqence.zip | Bin 0 -> 1218 bytes ...quence_op_test.test_unpack_rnn_seqence.zip | Bin 0 -> 1218 bytes .../data/operator_test/pad_test.test_crop.zip | Bin 0 -> 3245 bytes ...test_multi_predictions_params_from_arg.zip | Bin 0 -> 1139 bytes .../weighted_sum_test.test_weighted_sum.zip | Bin 0 -> 1212 bytes .../gradient_0.pb | Bin 67 -> 0 bytes .../inputs.npz | Bin 390 -> 0 bytes .../operator_0.pb | Bin 43 -> 0 bytes .../outputs.npz | Bin 234 -> 0 bytes .../serialized_test/serialized_test_util.py | 172 
+++++++++++------- 176 files changed, 523 insertions(+), 369 deletions(-) create mode 100644 caffe2/python/operator_test/conftest.py create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip create mode 100644 
caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip create mode 100644 
caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip 
create mode 100644 caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip create mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb delete mode 100644 caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 0d35110740825b..5be8b689f115cb 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -11,12 +11,13 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest -class TestActivations(hu.HypothesisTestCase): - @given(X=hu.tensor(), in_place=st.booleans(), +class TestActivations(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): if gc == mu.mkl_do: @@ -74,7 +75,7 @@ def relu_grad_ref(g_out, outputs, fwd_inputs): output_to_grad="X" if in_place else "Y", grad_reference=relu_grad_ref) - @given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), + @serial.given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), n=st.floats(min_value=0.5, max_value=2.0), in_place=st.booleans(), **hu.gcs) def test_relu_n(self, X, n, in_place, gc, dc): @@ -100,7 +101,7 @@ def relu_n_ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.005) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) @@ -169,7 +170,7 @@ def prelu_ref(X, W): # Gradient check wrt W self.assertGradientChecks(gc, op, [X, W], 1, [0], stepsize=1e-2) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), inplace=st.booleans(), **hu.gcs) diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 9112d50f38df86..2976b06108ff51 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -12,9 +12,10 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestAdadelta(hu.HypothesisTestCase): +class TestAdadelta(serial.SerializedTestCase): @staticmethod def ref_adadelta(param_in, mom_in, @@ -44,7 +45,7 @@ def ref_adadelta(param_in, return (param_out.astype(np.float32), mom_out.astype(np.float32), mom_delta_out.astype(np.float32)) - @given(inputs=hu.tensors(n=4), + @serial.given(inputs=hu.tensors(n=4), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -136,7 +137,7 @@ def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay, ref_using_fp16 ], ref_sparse) - 
@given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index e4101e92cf01cb..69aead865d1c3f 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -15,11 +15,13 @@ from caffe2.python.operator_test.adagrad_test_helper import ( ref_adagrad, adagrad_sparse_test_helper ) +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestAdagrad(hu.HypothesisTestCase): + +class TestAdagrad(serial.SerializedTestCase): @staticmethod def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): mom_out = mom_in + np.mean(np.square(grad)) @@ -27,7 +29,7 @@ def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): param_out = param_in + grad_adj return (param_out, mom_out) - @given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -112,7 +114,7 @@ def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc): return adagrad_sparse_test_helper(self, inputs, lr, epsilon, None, ref_adagrad, gc, dc) - @given(inputs=hu.tensors(n=2), + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -223,7 +225,7 @@ def ref_row_wise_sparse(param, momentum, indices, grad, lr): [param, momentum, indices, grad, lr], ref_row_wise_sparse) - @given(inputs=hu.tensors(n=1), + @serial.given(inputs=hu.tensors(n=1), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 6e56da29b7f6a9..bcce4efc8ec529 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestAffineChannelOp(hu.HypothesisTestCase): +class TestAffineChannelOp(serial.SerializedTestCase): def affine_channel_nchw_ref(self, X, scale, bias): dims = X.shape N = dims[0] @@ -30,9 +29,10 @@ def affine_channel_nhwc_ref(self, X, scale, bias): Y = X * scale + bias return [Y.reshape(dims)] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - is_learnable=st.booleans(), in_place=st.booleans(), **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), + in_place=st.booleans(), **hu.gcs) def test_affine_channel_2d( self, N, C, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index 
6492189808a72c..9bdea7ecf5cdbf 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -9,11 +9,13 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestArgOps(hu.HypothesisTestCase): - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) +class TestArgOps(serial.SerializedTestCase): + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmax(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) @@ -32,8 +34,9 @@ def argmax_ref(X): self.assertReferenceChecks(gc, op, [X], argmax_ref) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmin(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index f8bc77ba9e5a7a..7252499352ee84 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -7,6 +7,7 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -51,8 +52,8 @@ def _inputs(draw): ) -class TestBatchBoxCox(hu.HypothesisTestCase): - @given( +class TestBatchBoxCox(serial.SerializedTestCase): + @serial.given( inputs=_inputs(), **hu.gcs_cpu_only ) diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index 711240de9cfdfc..301941afb590c3 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -6,13 +6,14 @@ import numpy as np from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -class TestBatchBucketize(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestBatchBucketize(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_batch_bucketize_example(self, gc, dc): op = core.CreateOperator('BatchBucketize', ["FEATURE", "INDICES", "BOUNDARIES", "LENGTHS"], diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index 2db25e73892563..91d49b76ee4119 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestBatchMomentsOp(hu.HypothesisTestCase): +class TestBatchMomentsOp(serial.SerializedTestCase): def batch_moments_nchw_ref(self, X): 
dims = X.shape N = dims[0] @@ -29,9 +28,9 @@ def batch_moments_nhwc_ref(self, X): var = np.mean(np.square(X), axis=0) return [mu, var] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): op = core.CreateOperator( "BatchMoments", diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index a8e04fb2e14750..a47cc44e593c8b 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np - from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu +import numpy as np -class TestBatchSparseToDense(hu.HypothesisTestCase): +class TestBatchSparseToDense(serial.SerializedTestCase): - @given( + @serial.given( batch_size=st.integers(5, 10), dense_last_dim=st.integers(5, 10), default_value=st.floats(min_value=2.0, max_value=3.0), diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index b54a4435513be7..f76891bae4dc89 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -201,8 +203,8 @@ def generate_rois_rotated(roi_counts, im_dims): return rotated_rois -class TestBBoxTransformOp(hu.HypothesisTestCase): - @given( +class TestBBoxTransformOp(serial.SerializedTestCase): + @serial.given( num_rois=st.integers(1, 10), num_classes=st.integers(1, 10), im_dim=st.integers(100, 600), diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 638248d60bafe5..8811f5667503b8 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -2,18 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import assume, given -import hypothesis.strategies as st - from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import assume, given +import hypothesis.strategies as st +import numpy as np -class TestBooleanMaskOp(hu.HypothesisTestCase): +class TestBooleanMaskOp(serial.SerializedTestCase): - @given(x=hu.tensor(min_dim=1, + @serial.given(x=hu.tensor(min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), **hu.gcs) @@ -54,7 +54,7 @@ def _dtype_conversion(x, dtype, gc, dc): x = x.astype(dtype) return x, dc - 
@given(x=hu.tensor(min_dim=2, + @serial.given(x=hu.tensor(min_dim=2, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), dtype=st.sampled_from([np.float32, np.float16]), diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index 86b2fedb49a245..e3bc9f248d3a26 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -3,16 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestUnmaskOp(hu.HypothesisTestCase): - @given(N=st.integers(min_value=2, max_value=20), +class TestUnmaskOp(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=2, max_value=20), dtype=st.sampled_from([ np.bool_, np.int8, diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index 8cd9acbd6a5d3c..52155c0a5d7649 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import unittest +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st - -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core +import unittest +import numpy as np def get_op(input_len, output_len, args): @@ -64,8 +64,8 @@ def gen_multiple_boxes(centers, scores, count, num_classes): return ret_box, ret_scores -class TestBoxWithNMSLimitOp(hu.HypothesisTestCase): - @given(**HU_CONFIG) +class TestBoxWithNMSLimitOp(serial.SerializedTestCase): + @serial.given(**HU_CONFIG) def test_simple(self, gc): in_centers = [(0, 0), (20, 20), (50, 50)] in_scores = [0.9, 0.8, 0.6] diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 79293fc6453232..130364261ea166 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestCeil(hu.HypothesisTestCase): +class TestCeil(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_ceil(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 5c59b8d6f05c1b..e516288b436c38 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import 
caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelBackpropStats(hu.HypothesisTestCase): - @given( +class TestChannelBackpropStats(serial.SerializedTestCase): + @serial.given( size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index e17d7a736063c4..34417fd1847337 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -3,14 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class ChannelShuffleOpsTest(hu.HypothesisTestCase): +class ChannelShuffleOpsTest(serial.SerializedTestCase): def _channel_shuffle_nchw_ref(self, X, group): dims = X.shape N = dims[0] @@ -31,7 +32,7 @@ def _channel_shuffle_nhwc_ref(self, X, group): Y = np.transpose(X, axes=(0, 1, 3, 2)) return [Y.reshape(dims)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), + @serial.given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), H=st.integers(1, 5), W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index 2a238e67542f5a..f1daddee7721dd 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelStats(hu.HypothesisTestCase): - @given( +class TestChannelStats(serial.SerializedTestCase): + @serial.given( size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index 38499a69eb1d90..46163d30dedc2f 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -10,10 +10,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestClip(hu.HypothesisTestCase): - @given(X=hu.tensor(), +class TestClip(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), min_=st.floats(min_value=-2, max_value=0), max_=st.floats(min_value=0, max_value=2), inplace=st.booleans(), diff --git 
a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index bea2133d5182ee..042b4ef2a8326c 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -4,15 +4,15 @@ from __future__ import unicode_literals from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestClipTensorByScalingOp(hu.HypothesisTestCase): +class TestClipTensorByScalingOp(serial.SerializedTestCase): - @given(n=st.integers(5, 8), d=st.integers(2, 4), + @serial.given(n=st.integers(5, 8), d=st.integers(2, 4), threshold=st.floats(0.1, 10), additional_threshold=st.floats(0.1, 10), use_additional_threshold=st.booleans(), diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index facb675a4944ec..e37738801745c2 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals import numpy as np -import unittest import os +import unittest from hypothesis import given, settings import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 +from caffe2.python import core, utils +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # # Should match original Detectron code at @@ -129,17 +130,17 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): return outputs -class TestCollectAndDistributeFpnRpnProposals(hu.HypothesisTestCase): +class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) + @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) def test_collect_and_dist( self, proposal_count, diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index af8d5486e6e6ab..3d2c4ae31946d9 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,13 +3,14 @@ from __future__ import division from __future__ import print_function -import 
numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.proto import caffe2_pb2 from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest @st.composite @@ -44,8 +45,8 @@ def _tensor_splits(draw, add_axis=False): ) -class TestConcatSplitOps(hu.HypothesisTestCase): - @given(tensor_splits=_tensor_splits(), +class TestConcatSplitOps(serial.SerializedTestCase): + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs) def test_concat(self, tensor_splits, gc, dc): axis, _, splits = tensor_splits @@ -92,7 +93,7 @@ def test_concat_add_axis(self, tensor_splits, gc, dc): for i in range(len(splits)): self.assertGradientChecks(gc, op, splits, i, [0]) - @given(tensor_splits=_tensor_splits(), + @serial.given(tensor_splits=_tensor_splits(), split_as_arg=st.booleans(), **hu.gcs) def test_split(self, tensor_splits, split_as_arg, gc, dc): @@ -127,7 +128,7 @@ def split_ref(input, split=split_info): self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) self.assertGradientChecks(gc, op, input_tensors, 0, outputs_with_grad) - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index b96b530d687751..88d8fd8b7a27a3 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -2,15 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestConditionalOp(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) +class TestConditionalOp(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) def test_conditional(self, rows_num, gc, dc): op = core.CreateOperator( "Conditional", ["condition", "data_t", "data_f"], "output" diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py new file mode 100644 index 00000000000000..54a57dfd51928d --- /dev/null +++ b/caffe2/python/operator_test/conftest.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import caffe2.python.serialized_test.serialized_test_util as serial + + +def pytest_addoption(parser): + parser.addoption( + '-G', + '--generate-serialized', + action='store_true', + dest='generate', + help='generate output files (default=false, compares to current files)', + ) + parser.addoption( + '-O', + '--output', + default=serial.DATA_DIR, + dest='output', + help='output directory (default: %(default)s)' + ) + parser.addoption( + '-D', + '--disable-serialized-check', + action='store_true', + dest='disable', + help='disable checking serialized tests' + ) + + +def pytest_configure(config): + generate = config.getoption('generate', default=False) + output = config.getoption('output', default=serial.DATA_DIR) + disable = config.getoption('disable', default=False) + 
serial._output_context.__setattr__('should_generate_output', generate) + serial._output_context.__setattr__('output_dir', output) + serial._output_context.__setattr__('disable_serialized_check', disable) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 8e65a9324535a6..d29d724b89c29d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -13,11 +13,13 @@ from caffe2.python import brew, core, workspace import caffe2.python.hypothesis_test_util as hu from caffe2.python.model_helper import ModelHelper +import caffe2.python.serialized_test.serialized_test_util as serial import caffe2.python._import_c_extension as C import unittest import os + def _cudnn_supports( dilation=False, nhwc=False, @@ -54,7 +56,7 @@ def _cudnn_convolution_algo_count(direction): return st.sampled_from([-1]) -class TestConvolution(hu.HypothesisTestCase): +class TestConvolution(serial.SerializedTestCase): # CUDNN does NOT support different padding values and we skip it @given(op_type=st.sampled_from(["Conv", "Conv2D"]), stride_h=st.integers(1, 3), @@ -636,14 +638,15 @@ def test_use_cudnn_engine_interactions(self): self.assertEqual(model.Proto().op[-1].engine, expected_engine) - @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), - G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), - H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs) + @serial.given( + op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), + G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), + H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), + order=st.sampled_from(["NCHW", "NHWC"]), + force_algo_fwd=_cudnn_convolution_algo_count("fwd"), + force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), + force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), + **hu.gcs) def test_1x1_conv(self, op_type, N, G, DX, DY, H, W, use_bias, order, force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc): diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index d67df5fd3e1f32..1124df94e67ae7 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -9,10 +9,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestCosineEmbeddingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestCosineEmbeddingCriterion(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 51d2bbc6f484ab..4deef35c5bb506 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -2,10 +2,12 @@ from __future__ import division from __future__ import print_function from __future__ 
import unicode_literals + from caffe2.python import core from collections import defaultdict, Counter from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -15,9 +17,9 @@ DEFAULT_PRUNE_THRESHOLD = 0.001 -class TestCTCBeamSearchDecoderOp(hu.HypothesisTestCase): +class TestCTCBeamSearchDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([1, 2, 4]), max_time=st.sampled_from([1, 8, 64]), alphabet_size=st.sampled_from([1, 2, 32, 128, 512]), diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0e638e8155e9e7..98079d2b026ae0 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -2,18 +2,19 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np - import unittest -class TestCTCGreedyDecoderOp(hu.HypothesisTestCase): +class TestCTCGreedyDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([2, 4, 128, 256]), max_time=st.sampled_from([2, 10, 30, 50]), num_classes=st.sampled_from([2, 10, 26, 40]), diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 0a5f9a38a4a7c6..753b94d20f1f54 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class DistanceTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 3), +class DistanceTest(serial.SerializedTestCase): + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_cosine_similarity(self, n, dim, gc, dc): @@ -32,7 +33,7 @@ def test_cosine_similarity(self, n, dim, gc, dc): self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(inputs=hu.tensors(n=2, + @serial.given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32), @@ -57,7 +58,7 @@ def dot_ref(X, Y): # Gradient check wrt Y self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L1_distance(self, n, dim, gc, dc): @@ -88,7 +89,7 @@ def test_L1_distance(self, n, dim, gc, dc): self.assertGradientChecks(gc, op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L2_distance(self, n, dim, gc, dc): diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index 89d63b7e13286b..2bbd9ba4efe114 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -10,11 +10,12 @@ from caffe2.proto import caffe2_pb2 
from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestDropout(hu.HypothesisTestCase): +class TestDropout(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), in_place=st.booleans(), ratio=st.floats(0, 0.999), engine=st.sampled_from(["", "CUDNN"]), diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index c67a84921a4ad0..8c7df5f33625b2 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestElementwiseLinearOp(hu.HypothesisTestCase): +class TestElementwiseLinearOp(serial.SerializedTestCase): - @given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) + @serial.given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) # @given(n=st.integers(2, 50), d=st.integers(2, 50), **hu.gcs_cpu_only) def test(self, n, d, gc, dc): X = np.random.rand(n, d).astype(np.float32) diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index 7279dd5e1fb99c..8f665a06cd9e9a 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu - import numpy as np import unittest @@ -21,7 +21,7 @@ def rowmux(select_vec, left, right): return mux(select, left, right) -class TestWhere(hu.HypothesisTestCase): +class TestWhere(serial.SerializedTestCase): def test_reference(self): self.assertTrue(( @@ -35,7 +35,7 @@ def test_reference(self): [[3], [4]])[0] ).all()) - @given(N=st.integers(min_value=1, max_value=10), + @serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_where(self, N, gc, dc, engine): @@ -107,9 +107,9 @@ def test_rowwhere_dim2(self, N, gc, dc, engine): self.assertReferenceChecks(gc, op, [C, X, Y], rowmux) -class TestIsMemberOf(hu.HypothesisTestCase): +class TestIsMemberOf(serial.SerializedTestCase): - @given(N=st.integers(min_value=1, max_value=10), + @serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_is_member_of(self, N, gc, dc, engine): diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index e767a0db161a9f..161f5fc0724b14 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -11,10 +11,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # TODO(jiayq): make them hypothesis tests for better coverage. 
-class TestElementwiseBroadcast(hu.HypothesisTestCase): +class TestElementwiseBroadcast(serial.SerializedTestCase): @given(**hu.gcs) def test_broadcast_Add(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. @@ -168,7 +169,7 @@ def test_broadcast_Sub(self, gc, dc): self.assertDeviceChecks(dc, op, [X, Y], [0]) self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - @given(**hu.gcs) + @serial.given(**hu.gcs) def test_broadcast_powt(self, gc, dc): np.random.seed(101) diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index efd056c8f1654d..4b608a4418dddc 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -7,11 +7,12 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestExpandOp(hu.HypothesisTestCase): +class TestExpandOp(serial.SerializedTestCase): def _rand_shape(self, X_shape, max_length): length = np.random.randint(max_length) shape = np.ones(length, dtype=np.int64) @@ -39,7 +40,7 @@ def ref(X, shape): self.assertDeviceChecks(dc, op, [X, shape], [0]) self.assertGradientChecks(gc, op, [X, shape], 0, [0]) - @given(X=hu.tensor(max_dim=5, dtype=np.float32), + @serial.given(X=hu.tensor(max_dim=5, dtype=np.float32), **hu.gcs) def test_expand_rand_shape(self, X, gc, dc): shape = self._rand_shape(X.shape, 5) diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index 1c444da6a8b80e..d10e19e4932ade 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -7,11 +7,12 @@ from caffe2.python import core from hypothesis import assume, given, settings, HealthCheck import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestFcOperator(hu.HypothesisTestCase): +class TestFcOperator(serial.SerializedTestCase): def _run_test(self, n, m, k, transposed, multi_dim, dtype, engine, gc, dc): if dtype == np.float16: # fp16 only supported with CUDA @@ -76,7 +77,7 @@ def fc_tranposed_op(X, W, b): threshold=threshold, stepsize=stepsize) @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) - @given(n=st.integers(1, 5), + @serial.given(n=st.integers(1, 5), m=st.integers(0, 5), k=st.integers(1, 5), multi_dim=st.sampled_from([True, False]), diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index df13cba4a3b961..df7cd1dc4e7960 100644 --- a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import hypothesis.strategies as st - -from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 -from hypothesis import given +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st import numpy as np @@ -19,7 +19,7 @@ def _fill_diagonal(shape, value): return (result,) -class TestFillerOperator(hu.HypothesisTestCase): +class TestFillerOperator(serial.SerializedTestCase): @given(**hu.gcs) def 
test_shape_error(self, gc, dc): @@ -127,7 +127,7 @@ def test_uniform_fill_using_arg(self, gc, dc): self.assertNotEqual(min_data, max_data) - @given( + @serial.given( shape=st.sampled_from( [ [3, 3], @@ -168,9 +168,9 @@ def test_diagonal_fill_op_int(self, gc, dc): # Check against numpy reference self.assertReferenceChecks(gc, op, [shape, value], _fill_diagonal) - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), + @serial.given(lengths=st.lists(st.integers(min_value=0, max_value=10), + min_size=0, + max_size=10), **hu.gcs) def test_lengths_range_fill(self, lengths, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index febf6efeef76ec..153724a5f49d29 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -4,20 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core -import hypothesis.strategies as st -from hypothesis import given - - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st import numpy as np -class TestFindOperator(hu.HypothesisTestCase): +class TestFindOperator(serial.SerializedTestCase): - @given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), - idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), - **hu.gcs) + @serial.given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), + idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), + **hu.gcs) def test_find(self, n, idxsize, gc, dc): maxval = 10 diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 08f079b5f8cfdd..fcd20278c6caba 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -from collections import OrderedDict -import numpy as np - from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from collections import OrderedDict +from hypothesis import given +import numpy as np -class TestFlexibleTopK(hu.HypothesisTestCase): + +class TestFlexibleTopK(serial.SerializedTestCase): def flexible_top_k_ref(self, X, k): X_flat = X.reshape((-1, X.shape[-1])) indices_ref = np.ndarray(shape=sum(k), dtype=np.int32) @@ -38,7 +39,7 @@ def flexible_top_k_ref(self, X, k): return (values_ref, indices_ref) - @given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) + @serial.given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) def test_flexible_top_k(self, X, gc, dc): X = X.astype(dtype=np.float32) k_shape = (int(X.size / X.shape[-1]), ) diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index aac1e81efa6570..4cbd269620673d 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestFloor(hu.HypothesisTestCase): +class TestFloor(serial.SerializedTestCase): - 
@given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_floor(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 2c2bc33910c560..d5ab8e58cec0f2 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -7,12 +7,13 @@ from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import hypothesis.extra.numpy as hnp -class TestGatherOps(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), +class TestGatherOps(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), index_num=st.integers(0, 5000), **hu.gcs) def test_gather_ops(self, rows_num, index_num, gc, dc): @@ -52,8 +53,8 @@ def _inputs(draw): ) -class TestBatchGatherOps(hu.HypothesisTestCase): - @given(inputs=_inputs(), +class TestBatchGatherOps(serial.SerializedTestCase): + @serial.given(inputs=_inputs(), **hu.gcs) def test_batch_gather_ops(self, inputs, gc, dc): data, ind = inputs diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index d653dd3297bcd2..a16b92ba7d6992 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -6,8 +6,8 @@ from caffe2.python import core, workspace from hypothesis import given from hypothesis import strategies as st - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np @@ -121,8 +121,9 @@ def gather_ranges_to_dense_with_key(data, ranges, key, lengths): return outputs -class TestGatherRanges(hu.HypothesisTestCase): - @given(boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) +class TestGatherRanges(serial.SerializedTestCase): + @serial.given( + boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) def test_gather_ranges(self, boarders_and_data, gc, dc): boarders, data = boarders_and_data @@ -142,7 +143,7 @@ def boarders_to_range(boarders): reference=gather_ranges, ) - @given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) def test_gather_ranges_split(self, tensor_splits, gc, dc): data, ranges, lengths, _ = tensor_splits diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index 98ebc9bed4012d..56902bb444efde 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -4,20 +4,21 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestGlu(hu.HypothesisTestCase): +class TestGlu(serial.SerializedTestCase): # Suppress filter_too_much health check. # Reproduce by commenting @settings and uncommenting @seed. 
# @seed(302934307671667531413257853548643485645) @settings(suppress_health_check=[HealthCheck.filter_too_much]) - @given( + @serial.given( X=hu.tensor(), axis=st.integers(min_value=0, max_value=3), **hu.gcs diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index caa9121e924a0c..febf05136e4ded 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -2,16 +2,16 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestGroupNormOp(hu.HypothesisTestCase): +class TestGroupNormOp(serial.SerializedTestCase): def group_norm_nchw_ref(self, X, gamma, beta, group, epsilon): dims = X.shape N = dims[0] @@ -40,10 +40,11 @@ def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): Y = gamma * (X - mu) / std + beta return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), - H=st.integers(2, 5), W=st.integers(2, 5), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + @serial.given( + N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), + H=st.integers(2, 5), W=st.integers(2, 5), + epsilon=st.floats(min_value=1e-5, max_value=1e-4), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_group_norm_2d( self, N, G, D, H, W, epsilon, order, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 1292d843d0a827..ed8945b7927e90 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -7,6 +7,7 @@ from caffe2.python.model_helper import ModelHelper from caffe2.python.rnn.rnn_cell_test_util import sigmoid, tanh, _prepare_rnn import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from caffe2.proto import caffe2_pb2 from functools import partial @@ -246,11 +247,11 @@ def generate_input_state(n, d): return hidden_t, model.net -class GRUCellTest(hu.HypothesisTestCase): +class GRUCellTest(serial.SerializedTestCase): # Test just for GRUUnitOp @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given( + @serial.given( seed=st.integers(0, 2**32 - 1), input_tensor=gru_unit_op_input(), fwd_only=st.booleans(), diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index dbb308f680021b..252855b39e4f25 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -6,11 +6,12 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestHyperbolicOps(hu.HypothesisTestCase): +class TestHyperbolicOps(serial.SerializedTestCase): def _test_hyperbolic_op(self, op_name, np_ref, X, in_place, engine, gc, dc): op = core.CreateOperator( op_name, @@ -30,15 +31,15 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0]) 
self.assertGradientChecks(gc, op, [X], 0, [0]) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_sinh(self, X, gc, dc): self._test_hyperbolic_op("Sinh", np.sinh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_cosh(self, X, gc, dc): self._test_hyperbolic_op("Cosh", np.cosh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_tanh(self, X, in_place, engine, gc, dc): self._test_hyperbolic_op("Tanh", np.tanh, X, in_place, engine, gc, dc) diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index 1f8b7344b74b9a..6a3678abdbe3fb 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIndexHashOps(hu.HypothesisTestCase): - @given( +class TestIndexHashOps(serial.SerializedTestCase): + @serial.given( indices=st.sampled_from([ np.int32, np.int64 ]).flatmap(lambda dtype: hu.tensor(min_dim=1, max_dim=1, dtype=dtype)), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index a91154a4e45f6d..1d072a230ae3a7 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -8,11 +8,13 @@ from caffe2.python import core, model_helper, brew import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestInstanceNorm(hu.HypothesisTestCase): + +class TestInstanceNorm(serial.SerializedTestCase): def _get_inputs(self, N, C, H, W, order): if order == 'NCHW': @@ -131,7 +133,7 @@ def test_instance_norm_layout(self, gc, dc, N, C, H, W, store_mean, atol=1e-4, rtol=1e-4) - @given(gc=hu.gcs['gc'], + @serial.given(gc=hu.gcs['gc'], dc=hu.gcs['dc'], N=st.integers(2, 10), C=st.integers(3, 10), diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 011720d109d6d2..6f9e5d90572ab1 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -2,15 +2,16 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIntegralImageOps(hu.HypothesisTestCase): - @given(batch_size=st.integers(1, 3), +class TestIntegralImageOps(serial.SerializedTestCase): + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), @@ -45,7 +46,7 @@ def integral_image(im): self.assertDeviceChecks(dc, op, [im], [0]) 
self.assertReferenceChecks(gc, op, [im], integral_image) - @given(batch_size=st.integers(1, 3), + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 97a1fbaeec2222..51faa14b9029fe 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -24,8 +26,8 @@ def jsd_grad(go, o, pq_list): return [np.log(p * (1 - m) / (1 - p) / m) / 2. * go, None] -class TestJSDOps(hu.HypothesisTestCase): - @given(n=st.integers(10, 100), **hu.gcs_cpu_only) +class TestJSDOps(serial.SerializedTestCase): + @serial.given(n=st.integers(10, 100), **hu.gcs_cpu_only) def test_bernoulli_jsd(self, n, gc, dc): p = np.random.rand(n).astype(np.float32) q = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index fa0958afd1f99a..59203fd960c88b 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import brew, core +from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu -import unittest -import os +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np +import os +import unittest -from caffe2.python.model_helper import ModelHelper -class TestLayerNormOp(hu.HypothesisTestCase): +class TestLayerNormOp(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(X=hu.tensors(n=1), **hu.gcs) + @serial.given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 2284fdbba0785c..84e1307568f22d 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st import numpy as np -class TestLearningRateAdaption(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=2), +class TestLearningRateAdaption(serial.SerializedTestCase): + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr_alpha=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 73710e520f9495..3677239817d7e7 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ 
b/caffe2/python/operator_test/learning_rate_op_test.py @@ -5,6 +5,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st @@ -15,8 +16,8 @@ import numpy as np -class TestLearningRate(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestLearningRate(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_alter_learning_rate_op(self, gc, dc): iter = np.random.randint(low=1, high=1e5, size=1) active_period = int(np.random.randint(low=1, high=1e3, size=1)) diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py index 08ec2638886291..20ac2b25ba1103 100644 --- a/caffe2/python/operator_test/length_split_op_test.py +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -6,11 +6,12 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthSplitOperator(hu.HypothesisTestCase): +class TestLengthSplitOperator(serial.SerializedTestCase): def _length_split_op_ref(self, input_lengths, n_split_array): output = [] @@ -26,7 +27,7 @@ def _length_split_op_ref(self, input_lengths, n_split_array): output.append(val - 1) return [np.array(output).astype(np.int32)] - @given(**hu.gcs_cpu_only) + @serial.given(**hu.gcs_cpu_only) def test_length_split_edge(self, gc, dc): input_lengths = np.array([3, 4, 5]).astype(np.int32) n_split_ = np.array([5]).astype(np.int32) diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index f879b702cd5092..d9cd2b2446045d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsPadOp(hu.HypothesisTestCase): +class TestLengthsPadOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index 42ef87ac364f70..4a9a6b0ff1a9da 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTileOp(hu.HypothesisTestCase): +class TestLengthsTileOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=st.integers(min_value=1, max_value=20).flatmap( lambda size: st.tuples( hu.arrays([size], dtype=np.float32), diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 6ffb5fc4fa843f..8bc27c31144f48 100644 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import 
print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTopKOps(hu.HypothesisTestCase): - @given(N=st.integers(min_value=0, max_value=10), +class TestLengthsTopKOps(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=0, max_value=10), K=st.integers(min_value=1, max_value=10), **hu.gcs_cpu_only) def test_lengths_top_k_op(self, N, K, gc, dc): diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index c690b3aed3891e..b98100168df022 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index 29ff72cd1e72db..49051442350e28 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -8,10 +8,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestLocallyConnectedOp(hu.HypothesisTestCase): - @given(N=st.integers(1, 3), +class TestLocallyConnectedOp(serial.SerializedTestCase): + @serial.given(N=st.integers(1, 3), C=st.integers(1, 3), H=st.integers(1, 5), W=st.integers(1, 5), diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index a6ea88e55737e5..e57bdb7a1d41df 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLossOps(hu.HypothesisTestCase): +class TestLossOps(serial.SerializedTestCase): - @given(n=st.integers(1, 8), **hu.gcs) + @serial.given(n=st.integers(1, 8), **hu.gcs) def test_averaged_loss(self, n, gc, dc): X = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 28f8e0a20f0a90..9e0168eacf9354 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - -class TestMarginRankingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestMarginRankingCriterion(serial.SerializedTestCase): + 
@serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index 4661c7715ec518..0772aee5c9b285 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -7,12 +7,13 @@ from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import unittest -class TestMathOps(hu.HypothesisTestCase): +class TestMathOps(serial.SerializedTestCase): @given(X=hu.tensor(), exponent=st.floats(min_value=2.0, max_value=3.0), @@ -31,7 +32,7 @@ def powf_grad(g_out, outputs, fwd_inputs): output_to_grad="Y", grad_reference=powf_grad), - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), exponent=st.floats(min_value=-3.0, max_value=3.0), **hu.gcs) def test_sign(self, X, exponent, gc, dc): diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index 67fdf2cf5ffe11..1872a129e569c8 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -13,10 +13,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestMatMul(hu.HypothesisTestCase): - @given( +class TestMatMul(serial.SerializedTestCase): + @serial.given( M=st.integers(min_value=1, max_value=10), K=st.integers(min_value=1, max_value=10), N=st.integers(min_value=1, max_value=10), @@ -125,7 +126,7 @@ def matmul_ref(X, Y, axis_a, axis_b, trans_a, trans_b): self.assertGradientChecks(gc, op, [X, Y], 1, [0]) -class TestBatchMatMul(hu.HypothesisTestCase): +class TestBatchMatMul(serial.SerializedTestCase): @settings(max_examples=30) @given( C=st.integers(min_value=0, max_value=3), # number of batch dims @@ -214,7 +215,7 @@ def matmul_ref(X, Y, trans_a, trans_b, dtype): # Check over multiple devices self.assertDeviceChecks(dc, op, [X, Y], [0]) - @given( + @serial.given( C_1=st.integers(min_value=0, max_value=3), # number of batch dims C_2=st.integers(min_value=0, max_value=3), M=st.integers(min_value=1, max_value=10), diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index cbb1adc954784d..77c6b82625b139 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -4,16 +4,17 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestMean(hu.HypothesisTestCase): - @given( +class TestMean(serial.SerializedTestCase): + @serial.given( k=st.integers(1, 5), n=st.integers(1, 10), m=st.integers(1, 10), diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 1b4322e0624f70..9f3302c6e75a3f 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -3,15 +3,13 @@ from __future__ import print_function from 
__future__ import unicode_literals -import numpy as np - -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.extra.numpy as hnp +import hypothesis.strategies as st +import numpy as np @st.composite @@ -53,7 +51,7 @@ def merge_arrays(vs, offs, j): return merged_lengths, merged_values -class TestMergeIdListsOp(hu.HypothesisTestCase): +class TestMergeIdListsOp(serial.SerializedTestCase): def test_merge_id_lists_ref(self): # Verify that the reference implementation is correct! lengths_0 = np.array([3, 0, 4], dtype=np.int32) @@ -69,8 +67,7 @@ def test_merge_id_lists_ref(self): np.testing.assert_array_equal(merged_lengths, expected_lengths) np.testing.assert_array_equal(merged_values, expected_values) - @given(inputs=id_list_batch(), - **hu.gcs_cpu_only) + @serial.given(inputs=id_list_batch(), **hu.gcs_cpu_only) def test_merge_id_lists_op(self, inputs, gc, dc): num_inputs = int(len(inputs) / 2) op = core.CreateOperator( diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index fa456c8382f64b..ae9d9158f5062b 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -7,12 +7,13 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st -import numpy as np import itertools as it +import numpy as np -class TestMomentsOp(hu.HypothesisTestCase): +class TestMomentsOp(serial.SerializedTestCase): def run_moments_test(self, X, axes, keepdims, gc, dc): if axes is None: op = core.CreateOperator( @@ -41,7 +42,7 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertGradientChecks(gc, op, [X], 0, [0, 1]) - @given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), num_axes=st.integers(1, 4), **hu.gcs) def test_moments(self, X, keepdims, num_axes, gc, dc): self.run_moments_test(X, None, keepdims, gc, dc) diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index 7bfceb61121ca2..39e358f30d386e 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis from hypothesis import given @@ -13,8 +14,8 @@ import unittest -class TestMomentumSGD(hu.HypothesisTestCase): - @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) +class TestMomentumSGD(serial.SerializedTestCase): + @serial.given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) def test_momentum_sgd(self, n, nesterov, gc, dc): param = np.random.rand(n).astype(np.float32) grad = np.random.rand(n).astype(np.float32) @@ -69,7 +70,7 @@ def momentum_sgd(grad, param_momentum, lr, param=None): reference=momentum_sgd ) - @given( + @serial.given( inputs=hu.tensors(n=3), momentum=st.floats(min_value=0.1, max_value=0.9), nesterov=st.booleans(), diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index d37955ac1b4d2e..309236a281a492 100644 --- 
a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -3,18 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np +from caffe2.python import workspace, core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import workspace, core +import numpy as np -class TestNegateGradient(hu.HypothesisTestCase): +class TestNegateGradient(serial.SerializedTestCase): - @given(X=hu.tensor(), - inplace=st.booleans(), - **hu.gcs) + @serial.given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) def test_forward(self, X, inplace, gc, dc): def neg_grad_ref(X): return (X,) diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index c1d02de1d09690..42fde4c9452251 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -11,10 +11,11 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestNumpyTile(hu.HypothesisTestCase): - @given(ndim=st.integers(min_value=1, max_value=4), +class TestNumpyTile(serial.SerializedTestCase): + @serial.given(ndim=st.integers(min_value=1, max_value=4), seed=st.integers(min_value=0, max_value=65536), **hu.gcs_cpu_only) def test_numpy_tile(self, ndim, seed, gc, dc): diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index da1c11fbc2cc2e..19e6ee10e3ddf9 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -7,6 +7,7 @@ from caffe2.proto import caffe2_pb2 from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -25,8 +26,8 @@ def _one_hots(): max_size=sum(x[1])))) -class TestOneHotOps(hu.HypothesisTestCase): - @given( +class TestOneHotOps(serial.SerializedTestCase): + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.int32, elements=st.integers(min_value=0, max_value=10)), @@ -56,7 +57,7 @@ def ref(x, lens, vals): op = core.CreateOperator('BatchOneHot', ["X", "LENS", "VALS"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, vals], ref) - @given( + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.float32, elements=st.integers(min_value=-5, max_value=5)), @@ -108,7 +109,7 @@ def ref(x, lens, boundaries): ["X", "LENS", "BOUNDARIES"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, boundaries], ref) - @given( + @serial.given( hot_indices=hu.tensor( min_dim=1, max_dim=1, dtype=np.int64, elements=st.integers(min_value=0, max_value=42)), @@ -134,7 +135,7 @@ def one_hot_ref(hot_indices, size): one_hot_ref, input_device_options={'size': core.DeviceOption(caffe2_pb2.CPU)}) - @given(hot_indices=_one_hots()) + @serial.given(hot_indices=_one_hots()) def test_segment_one_hot(self, hot_indices): index_size, lengths, indices = hot_indices diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 0cba8053d53e05..eaf0ef58ba30e6 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ 
b/caffe2/python/operator_test/onnx_while_test.py @@ -2,17 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu +from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given -from caffe2.proto import caffe2_pb2 +import hypothesis.strategies as st +import numpy as np +import unittest -class TestONNXWhile(hu.HypothesisTestCase): - @given( +class TestONNXWhile(serial.SerializedTestCase): + @serial.given( condition=st.booleans(), max_trip_count=st.integers(0, 100), save_scopes=st.booleans(), diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 3935ca8c8f17ab..f6674ed625bd45 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given from hypothesis import strategies as st @@ -12,7 +13,7 @@ import time -class TestTensorPackOps(hu.HypothesisTestCase): +class TestTensorPackOps(serial.SerializedTestCase): def pack_segments_ref(self, return_presence_mask=False, max_length=None): def pack_segments_ref(lengths, data, max_length=max_length): @@ -53,7 +54,7 @@ def pack_segments_ref(lengths, data, max_length=max_length): return pack_segments_ref - @given( + @serial.given( num_seq=st.integers(10, 100), cell_size=st.integers(1, 10), **hu.gcs diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index a5a3d6de537d01..6bf2315ca0c52e 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestPackRNNSequenceOperator(hu.HypothesisTestCase): +class TestPackRNNSequenceOperator(serial.SerializedTestCase): - @given(n=st.integers(0, 10), k=st.integers(1, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_pack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 @@ -47,7 +48,7 @@ def pack_op(values, lengths): # Gradient check self.assertGradientChecks(gc, op, [values, lengths], 0, [0]) - @given(n=st.integers(0, 10), k=st.integers(2, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(2, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_unpack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index ee5e001a91e2ce..43cd10c231887e 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -2,16 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.python 
import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest -class TestPad(hu.HypothesisTestCase): - @given(pad_t=st.integers(-5, 0), +class TestPad(serial.SerializedTestCase): + @serial.given(pad_t=st.integers(-5, 0), pad_l=st.integers(-5, 0), pad_b=st.integers(-5, 0), pad_r=st.integers(-5, 0), diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index f09dcdcde2b6d5..83d67f4beea966 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestPiecewiseLinearTransform(hu.HypothesisTestCase): +class TestPiecewiseLinearTransform(serial.SerializedTestCase): def constrain(self, v, min_val, max_val): def constrain_internal(x): return min(max(x, min_val), max_val) @@ -31,7 +32,7 @@ def transform(self, x, bounds, slopes, intercepts): y = slopes[index] * x_ + intercepts[index] return y - @given(n=st.integers(1, 100), **hu.gcs) + @serial.given(n=st.integers(1, 100), **hu.gcs) def test_multi_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) intercepts = np.random.uniform(-1, 1, (2, n)).astype(np.float32) diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 9cdeafaae50bc9..5882f7aef8346d 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -14,7 +14,7 @@ class TestWeightedSumOp(serial.SerializedTestCase): - @serial.given_and_seeded( + @serial.given( n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), seed=st.integers(min_value=0, max_value=65535), diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md index 00d104d309f652..2885ed290cab8e 100644 --- a/caffe2/python/serialized_test/README.md +++ b/caffe2/python/serialized_test/README.md @@ -4,9 +4,15 @@ Major functionality lives in `serialized_test_util.py` ## How to use 1. Extend the test case class from `SerializedTestCase` -2. Change the `@given` decorator to `@given_and_seeded`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. -3. Change a call to `unittest.main()` in `__main__` to `testWithArgs`. -4. Run your test `python caffe2/python/operator_test/my_test.py -g` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one folder per test function -5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. 
If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. +2. Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. +3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. +4. Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one npz file per test function. Use `-O` to change the output directory. +5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. + +## Additional Notes +If we'd like to extend the test framework beyond that for operator tests, we can create a new subfolder for them inside `caffe2/python/serialized_test/data`. + +Note that we currently don't support using other hypothesis decorators on top of `serialized_test_util.given`. Hypothesis has some handling to explicitly check that `@given` is at the bottom of the decorator stack. + +If there are multiple calls to assertReferenceChecks in a test function, we'll serialize and write the last one. The actual input checked may then differ if we refactor a test function that calls this multiple times, though the serialized test should still pass, since we then use the serialized input to generate a dynamic output.
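For orientation, the conversion pattern that the operator-test diffs above apply (and that the README steps describe) looks roughly like the sketch below. This is illustrative commentary rather than part of the patch: the operator name ("Negative") and its reference function are made up for the example, and the `serial.testWithArgs()` call simply follows the README's optional step 3; only the `serial.SerializedTestCase` base class, the `@serial.given` decorator, and `assertReferenceChecks` are taken from the diffs themselves.

```python
# Minimal sketch of a converted operator test, assuming the serialized-test
# utilities behave as described in the README above. The "Negative" operator
# and its reference function are illustrative only.
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import hypothesis.strategies as st

from caffe2.python import core
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial


class TestNegativeOp(serial.SerializedTestCase):  # step 1: subclass SerializedTestCase
    # step 2: use serial.given instead of the plain hypothesis @given
    @serial.given(n=st.integers(1, 8), **hu.gcs_cpu_only)
    def test_negative(self, n, gc, dc):
        X = np.random.rand(n).astype(np.float32)
        op = core.CreateOperator("Negative", ["X"], ["Y"])

        def negative_ref(X):
            return (-X,)

        # the (last) assertReferenceChecks call is what gets serialized
        self.assertReferenceChecks(gc, op, [X], negative_ref)


if __name__ == "__main__":
    # step 3 (optional): enables the -G / -O flags for (re)generating outputs
    serial.testWithArgs()
```

Generating the serialized artifacts for such a test would then just be a matter of running it once with the `-G` flag, as described in step 4 above.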
diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip new file mode 100644 index 0000000000000000000000000000000000000000..415a47d71c3166a785e06108d24af457f02a9002 GIT binary patch literal 1192 zcmWIWW@Zs#0D%Jyy1pAV)iPv(Y!GGz;`{==f+Q}+2qDHuDF!8G*PK!<9xm3zoPvx* z(X(gH*jh0Jpy*$te@IoInSlX>If42!^YTke^zsU-V9Gea)@&1EPPhV;c4cH>-~uZv zC@m=ls;iv+M*Nkuz{wK<#{$;Qn-n!CW=Y=C1#yqpuUoue-n96TpeZxN=f_W(#3fYb zeMZxdwM#r*MQjPvPpLMmSyo(Ixu!6zV_WuQ&!sy}UVESXSroGO&ZRk%&Ky~^=!n<4 zAB(O$`LpNIq(zsW2(5E+>(E-aN2DW1ak9B{wnlPLb%5gLHuJ^i$=V@~UY@CrTH%V` z8k6_8nm3y(2QaQ~GcVRi?wvn>{`GnV2H|_(R6XZzY|WAEk>r?sLs2c2!F2gd2h-vP z)7uNO(ipP%QXlSGmlHi{O2nik?C>Ch_}yxwX__n0&mBO_55y^{WtqvT@g;~yXsE!xmEqlY;{a~?3wBgH^~H!B-Rk{Jls0qJZe5D%7JAi46| zGltzJKv56|<*M|e#FTggaL$b6VvCFi@rBqTz%($Mas#ueJ5VSyHLnCcyLvDpT#7x* y1I-76CE(xz7Xav5&~rILivy5}l;>dqirpIsZPS3o1>94}A+ivt<|9y&fdK&AiE7dS literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip new file mode 100644 index 0000000000000000000000000000000000000000..e4584245ab112cc07ded499bf6d2d0d771345f9d GIT binary patch literal 1200 zcmWIWW@Zs#0D%Jyy1uVp{@_;xvO$;?i1Q2d3X-@OBZL?ur5KbreNq#%D}z#VO0{^n zSQB#!G7?44o;hP1fMUiP{X?qy%nS@5%n39jGcUiiL@%$P3Z{$$Y}Ga)=7cLiX;(%D z1}?C&g3^*=pt{Q0Z^U0o3!FR=a4cZ$yh%}WVwU7BT@d$p{kp{q=1q$a37RrPe180t zNnAo@-e)xZSi8j2Rm7Gs{gi66nq|ecm1_#aI<{p`_FTHtCBNu zi;j4$`?2WClRtYNO=;fK} zs1>g0tuc9jt9i4zascD%HuGYQ)Rp~{EDe(s2+!@Km78wuX3$aCjX<$|rglAQEphRYBUI}WJ^Luztr!AOw8U%f z6V+#CU;trGpq9+M{L&J=yn-s2G7hlGH|F?Hy#kbWWn^IB0xK&hEhz@7tDOBt{FSu8 z$rAy`0@lu(6g4MiN#4>0agW!pTfAW2wD^#qDKo_9$4{BWB~<2pM$?bAOFUggYzfm( zsWz)wR$N=TrZB8yTlQqnr8`Ytd!PJS6tefur8$$%99gvJh}XIwi>^HRv**#IMVFok zt#fkg&|0@gq$5Xhvbl4%MsiSffa2yh^Tp=L+98f!o~e#n;fmfGllQlpH=8R5Fs^Ph zFV;xzoj-s6^?C&c;d|dyJ?Cz0&5`VpJ!1on1cN2u5CLa%bS>x^ k8==J*$VAHKurS2#a)h={UD2XY`R1gg0Xlw@E40OUYqG5`Po literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip new file mode 100644 index 0000000000000000000000000000000000000000..07e439a921cfcf7799227f9c8eeb49605f8d3ab5 GIT binary patch literal 1184 zcmWIWW@Zs#0D*%Iy1q*&xgc#Q6n!1xZ|t5kicSQVdG0L8&>Vep(z{jCrC! zN&5g4y&@A!KkG9yFn};8P&zX&zqCXzub>L1j00?iAWv@76`-^$BLf2$SXn`7Nik4e zSM=7Hyua1F z*<3k*adn${u|{(5{Q2{**DEjx-}|QOId@}gj%1G{$K)G|YN-sS%V#>67B`sQUXYc> zkj0mpxMSaqWh$B*RXq6_Q2lPT(KO8!=;sa~<_F@G)UwRv)cBIhf>fLVkT*GVAw~$4 zqlN(3|00ZfY#Iz`HX?^MH_%34lnn3SW@Hj!2Bl~0i47n$g1?p*atj=8<9`z?+o~B*_ef>wt7N6Nm>(GLRgpQh(jl04NH= zpj?$+l$a840M3<>Tx^l?AifY=1egY9QC@Hsbq5M&rskC(vZ@Co!j;&wI?!w|SON|j qaK=a1f}X<>T5Nz!r2GyGOzd7jXqyBqD&QVM4vmFCH7|jZ3=9B2lwn8! 
literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip new file mode 100644 index 0000000000000000000000000000000000000000..2bdb95bdaf798c9f6fa4cb0d2d03cd7c92361ae2 GIT binary patch literal 1167 zcmWIWW@Zs#0D+)JU0>OkO~C~~HVCrP$(~Dhn!NTt`Lifw@109?CY?F5Xwea` zbw3tedGcq^qe+V{JrP>x&X_AVFu*E{Pf`sYM4(Xrmji1HtIqUVVO}?H%S)P;R;9&;| z6swJwPuKGjGcY6=nM9aDnF)K61WJHGLnDaF%!36C zx<2$+gXm&lXlN7#hALVdqH9Kv0EFgtpk}1#2=HcQ14%Li;W{8akqN{@7>B(;0I30i lC15{*iwATq=ouEG0in8L-BQzQgYF9cODgrZ)GL5VLIXkKxuV@e{}1T8);_SAyn z%$)o@QLgLCHd;JftSPC0~9y6 znJ+f4)(&y>@=SHq3Rm>jn7qH$yxCkifN^!3d9g-v@BI1muh%Ou2;cjr>N$5~YmSs; zk0i(98;)*<+tL_Hm(O%4Ep8~iz2H?E!z;ejM24nGDk8!b3)Kn|rawERf0lPr8pG}R zoW-{Me!nl4es(G?PVoAuc=L2=?9F8=C-3fC`fA~CtBJ3!Oj5Ch2P-7JtTvjaxdOwf z1Bm&7I3=|V7^(3kl?ACd11)cI=0c26D@P4AaF~fO>al4spxOB6uJ7rGK$}{CvBV9e zfze256dj{h6cy7diYEB!dR}4%h6E#%2s0?#VNb0<2{34A1W}oJuz*3=haPJXT?`Bj zjRwF_MT0agW!pTfAW2wD^#qDKo_9$4{BWB~<2pM$?bAOFUggYzfm(DK)EER$N=T zrZB8yTlQqnr8`Ytd!PJS6tefur8$$%99gvJh}XIwi>^HRv**#IMVFokt#fkg&|0@g zq$5Xhvbl4%MsiSffa2yh^Tp=g+98f!o~e#n;fmfGllQlpH=8R5Fs^PhFV;xzoj-s6 z^?C&c;d|dyJ?Cz0&5@Gqk>pr>!_m!fTN*>@@|g~$#SNvm7raVic*U2R$j~%NMMT(Q zp;|%0^k;|k&-SXEEsedoOht36iYGrj;2al4spxL-@gQW6ZpiQa3IN%1-z=)zW65Oa23Bk08gab>1 zy(fJEh5{p#2s0>qU{8xc2{34A1W}oJus}iAhaO`PT?`BjjY7Z>MT5Reo8H~nq|ecm1_#aI<{p`_FTHtCBNui;j4$ z`?2WClRtYNOg0 ztuc9jt9i4zascD%HuGYQ8^D-!IP#$OsRgM}` z;BXRQ)ML|NK(le*21(_+K$}v5vBC|cfl)_kw75|#T7qd2EeDncdr$fT3N58dZQHiWYz9n$ZIvp?MlmGg4Fpc(byBB$G1mYo#!(J4C)PTSeupht$1G*OUybI9)($WNEBIRU={Y(r@3|T-4P8N_t004`2 A#{d8T literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip new file mode 100644 index 0000000000000000000000000000000000000000..8ddd81fbf44a9f09e9f420c3aacdc5b616c88735 GIT binary patch literal 1154 zcmWIWW@Zs#0D;IxUEhwK-zG)?*&xgc#Q6n!1xZ}21&Kw8xm+B%`MIfiC8fDs?3sBf znaQceTrBBDi78x6IYmMcH9~MTQVdGG!9ab*sg5a$AXQp?T>GxPFGOZ4&zs$j}Ez%HB?{Y>@`Q2H(-0|OUWSwU$@F;HFQ z>^I`Cqy50%fC$|o*b$diQaug?>Zld2=&dn%f2(=3 zxpDyG>NfLYjpW|>^XFf$S6~pn_f6Gv?#9*}Dajs5j>R_|-3+&-F_bQ!=}=nSPx*5L3wBu=EUlRM=IO3=Il+^r zL`_=44i8F57+GyJO>+f?O$QM3193`f88EiuODYReaRyZ0g z#b*aIGcbTKC(xeEy!_Gnig>Hx*fZRU&3i?u@>y*yJLwZav>H74(GHE%Xo4q#l} zW?rn3+&h2%{Ok1!48r%msd~=c*qS3H*(1rZ_(r2!qYv}pNjwV_Gt~l|Ee^EZSRi$X z!9-=sQky^j_f1j}X|_1HY)e`J!_rps>2KU8KXaN?C#uIk#ZTAs5;Fs8&{%CWO#{VJ z2N3fEaY|}gW^!tLNo7GQ&hW{boVgGqfXYz=2pl{jjCyPu3}`l{#CW#c1lr^Sj0|od z4U92LW222)v5`zlY=9zT@{LBdMqcKFz{rpUMuxG)fwUV7W*uZOQJ#`IdFzXPla?Hr zq+$yU0Y)YfW>EgXo*;n|V9?MAqB8Sf0fMd%J+>ga7#JEFrGcS|7I)~H(E}c#xfiGz zDH;O2S=m66%s{vfNcS^=c(8N>$>BCPsu_cUq96>)dFe%oDe(rNT+S9558^ue2xY!GGz;`{==f+Q}+2qAXQyp+u3)M62?i|=E|$cKOd!jMAppfl&2?Efw3!(gK$sI~WM*D|X^CE5K^06H z2iWEn{9HRP0i|6T85p?0$_h$Lih=4XXTK4DB`t9BM8L6tweu!L&52o(w{$_=_)ZBG={Bdhqe?oY`wK0E0rOO_h~`{BCr{mM3_NI9D70oN`OH_BZ$h(gM~zZH>y7L zXoBcsU}$Lk1!ST{8oFlm5Jzav0mcDR_y>5ivVkO-fp8s=PGa`&}L>}0AWs`k(qhiC}GsNe|PnpCeROWp~(~q@FJY7X>3DZxhHmg}yTwA%OFsx%+_GHhcJ564D zpZr-AviHuVIg`#DS+wYg*Sa5zt~~j(=h37^m!1f%b8_p@TDM1}BS&$vxpTHga!_@E z;^sE<#pcP{A&y?2sg7FViryNN_qUoin=1z}u5L3g)=2K1KY#x9dIbjId*4(&=WcAx zk?fJ=n0!M~EtSD^`Ai4X;s(>(3$oG}viMRH8JZ?7IWkGb79KzlpIdD-O>+hMxC4m! 
zfjA|#3>eb!C6xuKIK7`YIddU~|I1PR5B9wXqaK?E1DcK2;+y`1!rC1e*4#iE7$HMB zup8Af(-^j<9okaVu=Uo0tW<_9-lqu-h`?rK5@7}15C9x!ti#;bXHz_625XLYPV#_Zr zDJU(GVo>6CN-RmvaLTW6&acoC;^NEA%!^OT$xqIXFV3t=m0-}~nig>Hx*fZRU&3i?u@>y*yJLwZav>H74(GHE%Xo4q#l} zW?rn3+&h2%{Ok1!48r%msd~=c*qS3H*(1rZ_=ck!qv_UEhOf(KI(#i|_+f?MF$Y`193`f zS!Qx-d`V?ND$an(o1D22BV@`^Lk1i!B8+-$8VqPQp6%ghx(Kw%85j%PKpGPL6h%WD zwW1-JlxP6O!sHu{YK&jEq%v#;#=_R(hOM_3WTi7?0b`+4)w2rdb4Df+W>C(+o&tdq zV9?MAqB8Sf=@DHYdMrV7F)%bViU30qEzZz2qX#%ba~n`IQt$_Ov$BCCnSpQ}kZxcC w@esyg&)pz3Ag~1N2XJ0T*MgouAsRqh3V}?d+zPRuiGhhB3n+01n4K9I05p7Ii2wiq literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip new file mode 100644 index 0000000000000000000000000000000000000000..73717a440d9581a254f5b9c6572bde552a149eb2 GIT binary patch literal 1265 zcmWIWW@Zs#0D-(lUEikF*Y#|GY!GGz;`{==f+Q|>H&@4y&>&YXc27SSPiNO)E-okk zP(K&PAWtBh-N)6>J;Wneh|yn)L5a^Pu_QUesWdq|wIs7DH2}qUw(RQ+ubCOZm=kDx zW?p`2iC$ho6-*fi*cDPQI40IJGBEfsF)(m}l@*kh6a&>&&VD2QN?PFLiGX7PYv)ah zniI1mZ|Q=#$LrTEUNCQ3d`Qrg8RGNfr%d7!D)T<0>Brh7o~|Ocgz2Z0n$;{TuB}{C z7}l{Zd$Q-!ohGlnPyQ?l*?Z^GoJnVnELwELYu%4USDyUY^JvndOHYK>Ik|Oct=l8g zk)t@-+&Nn#IjA~7adVsbVsmfp5JxZ1R7b6FMQ@GC`&-SM&6NWfSGSoLYb5v1pFjV4 zy#j;qy>F_Xb2ql;NJ;icaxA{#=(e$H^R`rmughmTd@XMHdV9gEbcR>_zAx>!T>Y0Q z+I`rZSH697dc_;=+}$!TaU3Veu)Wcto z_{%agbHDv<-*+pd`-bI-?Ox1<<^g+mH>dU31=O`WM@7y0JKKS`ea1iMGF4BFyeBL> zC+-w|a>!=}_s)V{6WSR$BpPS1c3kv%#5VKDwOI!*N@_7q*_xh^!SIOLVarSWtBNN-18Ou`Z8S}D1x8N?5c30ZN@`hVa%y}@WkD*= zxXPQHxez0=%26W=991HWdTbgDXf_HQ>dJWvvl1WJ!poB5` zhNIfXD&y^`3|rG@IczO%*m`?GRyso#|JKB$goK2aq!cp_Wo9)M(=!1ZCoMTLNyQcz z8jMUL%%JRwJ);06z@VWKL}li|0ta0mddxv|F)%bVE(eAzT0Ej_Mvnx9W=UXvMT(LD zZ&o&tBr_1M1JdfioDYgom>;kgIUqG4umtP}Z~=s_1-%%7XaH%62QrZg6^Q*z3``7J KK#4*YkU{|Cudg-$ literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip new file mode 100644 index 0000000000000000000000000000000000000000..bb95ce7149a6962fe7c564be633bf1731a4e6993 GIT binary patch literal 1246 zcmWIWW@Zs#0D+Q5U0>bZn+mjnY!GGz;`{==f+Q}+2qC82QX%HD#3CsMB_5~5lH?5E z{M^*Ml431hF4p{_l++?67C&c?@BkFk=0}~-abad)0AWs`X_WMJR|D=R21DF&*moc%`pm9)Ug69LBp*3O$0H790C-qHnekJqnTykOq6 z_>iC}GsNe|PnpCeROWp~(~q@FJY7X>3DZxhHmg}yTwA%OFsx%+_GHhcJ564DpZr-A zviHuVIg`#DS+wYg*Sa5zt~~j(=h37^m!1f%b8_p@TDM1}BS&$vxpTHga!_@E;^sE< z#pcP{A&y?2sg7FViryNN_qUoin=1z}u5L3g)=2K1KY#x9dIbjId*4(&=WcAxk?fJ= zn0%vAt&x}c;3S>}l9_4&#uf+CZY-E}kikTGit3NYn!6`0IWkGbmH{;YtTvjaxdJ`j z0mS@3oRV6WnVcG5Qdy9SGZgYBXD-ADhH}(k00)8yqaK?E1DcHmqCab%0BuSI201s7 z2F44eVeUq)Fb^gr%%wnqzW9cto8h)JhSKFT9ZHKEN^dWCmB#RjFO~CO#i?~FoJU#w zR&dHVyo}`+ZMd{j)w2p17>rCJ%%EhCJp}?Kz@VWKL}li|!UkO*dc;9=F)%bVG6Dk@ zEgI1^qsIY4b1_geQj7$6v$BCCnSpQ}kZxcC@nA^^l5s|`dqDxT5)77rLkwJO npld-d4G>!VfJ~&a0TwsdeTLAs6IiUk{fQh%3xR6XSwIQ_28(|g literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip new file mode 100644 index 0000000000000000000000000000000000000000..eaddc47759a9c0fabfa64e0a71d16108d8ee6781 GIT binary patch literal 1501 zcmWIWW@Zs#0D-DTUEd>JEF9KAHVCriC}GsNe|PnpCeROWp~(~q@FJY7X>3DZw0HLF=xTwA%O 
zFsx%+_GHhcJ564DpZr-AviHuVIg`#DS+wYg*Sa5zt~~j(=h37^m!1f%b8_p@TDM1} zBS&$vxpTHga!_@E;^sE<#pd4HA&y?2sg7FViryNN_qUoin=1z}u5L3g)=2K1KY#x9 zdIbjId*4(&=WcAxk&^6@cGErri1N~-rLTzXYYOc zJv`)wbm`WcovU@W-A+A{zu)q6=i9SSYIdu0>%5)jyDLFFE58{G1?r`+;Zxxe1F|MK~{x1THK=3BR4zL&?<{+H1` z?)RV7X5Tb_=iU67|7CmquBc;m?;q!vd7Y{HHeYPlZSL+Y>!TeR$W@jCFHDRr&*- literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip new file mode 100644 index 0000000000000000000000000000000000000000..f51ee2ee182b35363f73e1677511f2e2b8f03e0d GIT binary patch literal 1254 zcmWIWW@Zs#0D*=^UEkckX=<~9Y!GGz;`{==f+Q}MqWsKaF1D1^oRY+1F80ja_{_Yt zd?EIv{EGPe(h?~KC0-||{ECpG#Ju9P{Gwbf87_(B{Gy`N_+Q zHy3+Reo10UYKjDdmJk5Reo7s)nq|ecm1_#aI<{p`_FTHtCBNui;j4$`?2WClRtYNOg0tuc9jt9i4zascD%HuGYQ#<9&=kTKlpY+>2&Nu9?x^_^?Ho#hhX8?5Igg*cniR z&uXJ-nkz8aI)IoTh*MI_GLuu|ODYReafV^uk)|Tw2>e;F4Sp^IWMkWzvP#(pee1Q^R(9j5?GV@?TgRT!f-XOX_ z`Q9EFu4u7{t{FWF5SljvH6ukxfHx}}NRk-{*8%ChOduY@IPB#NNDT-q0s8@5>Y!^u d&)g6VAT9fVOr)$1v7d>7i6ILpk;Vd22mnJ(q{RRL literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip new file mode 100644 index 0000000000000000000000000000000000000000..668efa6e1643b587b301ee6303bc589b22495296 GIT binary patch literal 1077 zcmWIWW@Zs#0D;ysDdfu0Nc3TL$&B7P`Z_o zfq@IGte~`{7^tpt_8ak6(gG(>1RM)kJ8x3doR}qfOBcjFUcYYff_c;8LxQHv5T74E zWfGTAnfDn@Kh`esbQQ5BOh2XEtY%qpZRMK6u#RonlRcO2G>wYY{^5oB+N0Sy^dLp#W$*n_c-5!yS9L34z&eD6n+I!$IC^=e zI%;Be{3}{Q1}G6&Qr?eN*+EyRkJ#O0q|iWAP0~H^Xgd z45iCwI+PYSl-^$ODvjY4Uut4P0)td@5|jF&78U0eoG~}vu+KhlB4W~%s7XuM8Bhbn zYNKhID=;8BfS4bMQ&P(^lT+hMDhpC^hDzS#%!L@iQjQud;6M>!)MEq2Evk*ew-ldW z2HNBbi~?>T4U8vBqaZLG(P9l!F6QTj6#T&>(%B&FknHZQDvVam> Jfw`H10RV1=Sb+cl literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip new file mode 100644 index 0000000000000000000000000000000000000000..126920673705348236aeaa9196b31413c2171484 GIT binary patch literal 1337 zcmWIWW@Zs#0D-nfUEg=M$NxkE*&xgc#Q6n!1xZ{iDTyVCT)V$9gSz?Z8d_kvymH-z^ZhlIt5=SvqnHC2Z zOJYT4u>@lPirofxn+|v~GcbTKC(!Q9y!_GXO1jdbi`}j zk40CW{Mqwp(xOXGgw{E^b!e^IBhrzhIN97eTO&EBIzVxAoB3k%VC@h`FV9p*t#Cze zjmi64&6~}Y0~lAgnHOs$_s*X`|9ZUwgYdm?s-ANA6#P(O<^ z#GXB3KkuqKwgvwJru}I0Yo4ScVr+3RZA+8SiI$lMT5q$kDQI|l>3UvbW3!q4teB+>6rSY~@hONtII&3X&*m`?GRyso#e`=@x3lm_7 z>p()Bkx7IZlvA)LL!bm0G&F*!%sg0d1bCzBLys$nE(V5%MqyxBqQxA#X7pf3Xl?~+ zMv8<0Z&o&tBr_1M1JbQbARa8$K=S$pzs!yKKv56|<+t>r#FTggP<|Ic%HZ)JVMN~N z1Z8@#7)3cBR0eN`o7n6Pnp2 zJ*7@2WY0Dc?GoN2a`UErL~!_r>v`4f`Zf2f=YOB~arW(M2mco{pY00ki*qTlUF_*1 z@BMtP#lzY6s%O?Qp$3Z8M$nig z>Hx*fZRU&3leI$}y*yJLwZav>H74(GHE%Xo4q#l}W?rn3+&h2%{Ok1!48r%msd~=c z*qS5RBgrxOhNGI{)-(oFbx9}F;)Kg*7_P{iI>NC;$a~TkpuZWJM3_N20DA%iN`OH_ zBZ$h(gC$3Feduun(Z#^f&?pKFL$nw}*Nh(A2+i$4%}9YC;LXYgl4J(LbwIkF3B*Gf uhdmdA)PTSeuphvA8eI!|_Jn8vX^8?dkuod9ekKMchAg1O5nyg+U;qGP^<|L& literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip new file mode 100644 index 0000000000000000000000000000000000000000..ccdd2257ffc71befc65ec1c36d7e5032b92ef518 GIT binary patch literal 1238 
zcmWIWW@Zs#0D-BEy1pBiFbXCD*&xgc#Q6n!1xZ|-nR)TW$@xX8#atXfW>S7dYOxR- zL_~-c!evj+Nh~f-EtXj&fG{UeIx{c7v_vnjpbDmp1MJl5dq=m4F)}cSurV-j zft3}MmJ|ckRnC4R{z_Wl*lo{glBRQx#Kyh=M`C{{6?GQ&V&s0aP za7Ay8$@^Q)o6VI27+1HM7i%Q<&YwU3dc6XJ@V#%Uo^v<0=158QNOCN`;pld<(s*ks z!`J0A9ljPfe7(KkRXW2f{=XBZO-u9lt@+l&ANq04%t;|fRXsHx9_8fJ@7u=HcWwgn z86JlX4;VzO9X@KzSE*}MW|`mcD9|pmMp&JruMx=7{4;6El}ReL45*=Jwb3-q6&PwA zK+F%sDXC?d$*J)rl?ACdgD`J$=0c1xEJqDPa0rSp>al4spxKyy^+_ooFuo1}F*ndg zV5CwSVaKQyVZ~%bn4()MLn$z_N{bsxfsvKQ@QUxLr?>ZJ`|4xI>#q5DE-2JcD{S)p zl$Lq=!3!SKgoKC!hAvr!9R~!P{2exS3bPz`zSFXV?+BxctkNWi7@_A@asF=PQH!dO5G E0bfI&7ytkO literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip new file mode 100644 index 0000000000000000000000000000000000000000..928a74f90cec5d452b12b372cc85a909dba4e5b2 GIT binary patch literal 1029 zcmWIWW@Zs#0D+l}y1wSeGin5YY!GGz;`{==f+Q}+2qDHuDF!7L=hVy`D~13RErIi! zo3xo37(kd4s3kKmzqCXzub>L1j00@)3VyDgmw?i)j0_B1U}XiRCB;B>m9yW7zmgU> zc_QFgz}k6}qUOXb$y>T0?(zC{ixH*%*#v7aY{_iE-1<`2rfx1DGorf z&#QdJQcj>nU<|Y`GY@F2US2^JOc@8*S)P9;sm^6$U@&B6VBi8P16o}SR9CrlLbgY6 zp~SKCGpDZ=O-vLF+T?P3isCG`YuolKdz^L-KJ0ig_1!I&yHDTftgk$r7~HwYgJZL$ zj<2`LiY4dn@r2E2Yii4!*(Wt|+N{Wo)Egg}md01x?fGDL{_y+5#hz;RbBcJ5pG){z z(|c`^+Rxt=f0|PEO*npC(z7c>O2jQ_aqJf1DKo`%A4h-jnmp|%>&8O`itlya*B{YM zTgiJ(Cpd4(Esrea&1+_cv`tc(x-*7#WvDc-Sl59$zZlj_*8kr z-y9rsK5ShpViK~>^R(&jQ;{bNS==7@9Q1eiapEJ>OrL364rII(_&DWM-K97B7asrG zuVL04wd5ho?1bPdwng2sjRn&b9$gjaX*_Xvq2u3m*KWsoF8E=4>+?^;mcB~{5tDhp z+rMqmUnnY+emQu`_f_(>-^-jr4)esC?dFuc^I7B6dv;kDlZBS^`5)wcSoGzxe?ZK) z{2ez{o0hOD&TvbaBzDx~$~!xjulnr*=?jFOw(huiAdbCcZao9z#gFU$t`~Ncs(Cu` zVYswb_B^J0XG))3Xuq&oaPk35QC`^+A&=MhIzwj_u8^Fpl;gcm!~OH+*1*n1>o0$w zu6?YL)WS-jt&jiw{;2^H><0J?c$QjUlqHmsYIjf+9J2-VFLF(9Phe54v4nwW-{tq zd{X@RgTxBHJ9n?L&zsZjuzl(QvvjRbOSE71{Y-nH8Y8fMdf2X0Tw;rzJ$ ziCy^L&3Ptv*SagD`*xY_mrXLe6twDuJ!&Gh+Gv{Q3QW8mK+F%sDXC?d$*J)rl?ADw zlsx;5_$z6FlP3a>1+1MnDQZs4lDwr0@+N04T(@|^ylL?vK~rXk&ySxniA$*5`;4X^ zYnOPsir5mSpHj1|xVCbMFzT^sFfgFn$i|#e`xIzXDkB2}H;{&8S#Z9=YFpgn^%$0w zp<3o<1-5Pq!#cKQPxf58)8w`H$)80bd+%JDGwIBcMT?Gjt^2X)%9B5P9!*+w>50%f zC$|o*b$diQaug?>Zld2=&dn%f2(=3xpDyG>NfLY zjpW|>^XFf$S6~pn_f6Gv?#9*}Dajs5j>R_|-3+&-F_bQ!=}=nSPuD*+7!aK)4P_yRv|I2;;Dq&>%G+umtP} jaCwcc1-+1kXaH#`05Xw^T!{Tl3``7JK#2vQ3V;CsurW)y literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip new file mode 100644 index 0000000000000000000000000000000000000000..1c9f5abc9bd0c552b83a644a28fce27f054dcd1f GIT binary patch literal 1241 zcmWIWW@Zs#0D)zVy1u7giY+k$vO$;?i1Q2d3X-@OBZL?ur5Kd>oHG*h@=|kxGfLCa za#FQ8xmeSS@=FUO7`1r0So4cgQj3&W{G2_)15gYt>3PX&$;`k2!kj=uGxPFGOZ4&z zs$j}Ez;<^Wy)gMEP}+x)fq@IGte~`{7^tpt_8ak6(gG(>1RM)kJ8x3doR}qfOBcjF zUcYYff_c;8LxQHv5T74EWfGTAnfDn@Kh`esbQQ5BOh2XCtY%qpZRMK6u#RonlRcO2 zG>wYY{^5oB+N0Sy^dLp#W$*n_c-5!yS9L34z&eD6n;Be{3}{Q1}G6&Qr?eN*+E zyRkJ#vPY6*@{LBdMqcKFlXw_DCAAfT!;}2<*2~`4g?WKJvI#nG#inFoEvB( zFkVJLm@_hoFoTjj_GAZ?0E31`5S5t+3y=VBRDI~N1<}R8(9p;TOu%Syhprht;1QaO zftrz`A;6oJ4J647gzJEGDHDhXOE-{AJ84}Dvl~zpghBZ$y(lpy-T;(w*&^dX93i#{ zFb&MYVyIcz9VnZbnpc85D|;{^+>X7B0NMx!OTYmJE-}!xpqBs$EularQh5LiV(cD6 VXxj=bO5ol^4x@!YHA*ZXg#e_TfExe+ literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip new 
file mode 100644 index 0000000000000000000000000000000000000000..f0e22405a92f76c30182684b9df4bffee2113f39 GIT binary patch literal 1275 zcmWIWW@Zs#0D-lQy1vDmxU}SfY!GGz;`{==f+Q}+2qEU;(p(`{AW>W>#h}FFoROH9 zmzon?l2}q4fMSCE@25UOEDQ`F%n39hGcUiiL@%$P3Z{$$Y*Fo>pVyZ&GBEHkF)(m} zl@*kh6a&>&&VD2QN?PFLiGX7PYv)ahniI1mZ|Q=#$LrTEUNCQ3d`Qrg8RGNfr%d7! zD)T<0>Brh7o~|Ocgz2YLo7F5UuB}{C7}l{Zd$Q-!ohGlnPyQ?l*?Z^GoJnVnELwEL zYu%4USDyUY^JvndOHYK>Ik|Oct=l8gk)t@-+&Nn#IjA~7adVsbV)JC}5JxZ1R7b6F zMQ@GC`&-SM&6NWfSGSoLYb5v1pFjV4y#j;qy>F_Xb2ql;NcKo_Ouo^m*2w!g?R%R3 zNrqVy)0j+~nRD8BS4r|X7(4aPP*s2bdPl+iYYY6Z=NB zxdL&ypH&}zv1{8UU9A_BwfXC=@8|R8--_SeH!ZtNmVNoH*i$$6=kI#ZTI=N0|5jwx z{hO*jGq$*^mmj+qetYV-RF(Wq!au~*tY5w|-5p=F`rFHyZ|;`o-;CYOdC6*<(zof~ zxjN2li4EFvz)yM6c|C{FlkYBH{r|g*Z|}zFFVkw{J%f@wC&@9OMyJ(A(==CLG5eL!5z|hcm9vHA_ z(TJ`YJq{3>9f8>zDMkXkS=m66%s{vfNc%B^cnIUL7d{|0Ag~1N2XIk@t_8icfoK3} ZaRM@t${vXQObkp6SwM+uVBy5T002})%jp0B literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip new file mode 100644 index 0000000000000000000000000000000000000000..d19477ff24f9401ae1fd0046e1b4425c286f02c5 GIT binary patch literal 1208 zcmWIWW@Zs#0D(=7y1r4J*Y2tT*&xgc#Q6n!1xZ|t5kicSQVdEg&N-O{T3lSrxrr5` zKv@PbJ2MZ=4nQ#|e66*kE;9oI2y+5eX6EIWmgwabRKb*SfbHXd!`gQRDDBF~z`zAo zR!~|}3{+P+`;GW3X@Qd`0*(c&oi{0JPRx?Lr3>O7uV1%#!MthlAwg4Sh|iCoGKov5 z%=?U{A8VI*x{BBmrk_%6RP$(~Dhn!NTt`Lifw@109?CY?F5Xwea` zbw3tedGcq^qe+V{JrP>xVhUT0(y zVFo2_>`4tM0R|0?ASyEt782ewH}NJCu1)HfEIzl5^y+y3j}m6=$Rd%#S_Rx%KESv!R{Y~wzGE`9)mJWr;aLjFD0dO2W=L znFS$WQK!n_&dkd% zEz!#>sDdfu06QU4eA!EWMh1pXAm#!qD<~}~2CA!^{YLziw7|&|0mlN?&YKi9CuT|B z(gksk*RNZ=VBWO&kf13u#OKFPnZzYj=6y!fkF`rYT}5mO(@!Zkt65fDTe+q%tYcgD zWY48LOSbW3LZDW=3)>MYC%V#=#EpGUFd%>%8hFAPo6Y4GQ>V4>KuGLvT zZSVc1D!1eA36~wZnN{}r^y4?LMJK=IJ}0~3@kh=L+MZ4u+l4dw)7ae8HBP^5FK^WI z)b+f?%zzqfRvS&zT!F#W0mS@3oRV6WnVcG5Qdy9SGwkvvXD-ADymHjQ0|%W5qaK?E z1DcJ@*{9e41=^Gij3{m(4UA1nW2%i>F_lb8Oo1Y5@(o8dNJMP~M%32ghOM_3WTi7? z@n5~M=6g=;8Ejcnt z#TFP!j7%cTpge^=K?5bgprH{&W#+*G4P764j6!rVFf=rp14A4we$h3fM-D>sN}y(> zs0#3AWdlhv1K~O#eVz%#Ll}p>TmY#7fhAx+fJ+HEc KfD&RXAcX)VtC6$- literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip new file mode 100644 index 0000000000000000000000000000000000000000..a1768f9b2d700f7c9eb916fe9d639f6114ad7cf8 GIT binary patch literal 43168 zcmZU)18go%@HSf8wr$(CZQFKFZTr;jX;1A_^R11ywr#if_x~<(zk4^C&F(YLBr{L4 z*=#baA`cFM0rEeiD}+%|B`RP85)=gFf7ySSvkQ}p862XUi<7aNv#qHFkqrv_^Bs7%&wY0VHF?O?b_i%Qz!~-EC7k74Wur&7&b+VAKb@y+sc341Rq!{tYehhayHMB9^1bbZ3>{VhTk}BK2O@Hd*C&8kud;g21e#?1KOf zTX=FRsaPsPPSnnu&}{RHvtY3BM^f|3+DYJb^K8$~i>F~dVXN2TZ%_TVH-Xp0>9(zl zHh|}|+0mNOaq|h?Y@556r}x&C#nq)<*GsSx^TyR_TbB3sbkNH{Pxp%*pZoF&oj?6g zAHjaF`7;k8pUZ9ZF_HCZ&u?fX-hcd;UY=Wl5}WIuH$>Ld+}^9={+lC)uRkwzuGAYp zUwO10gD_D=AYF@SO9~0Z(6;s z)-!&506%`ZmitcbZc%X!zXkigevIY9we6iL`RyntO?*e7K%j_K7>hOQyl^Sk<%l*@H4&+y$t%=Z7

G+k33l*R=flE?utiC*lj0pvhxHN6Q#k`GY%xY$b)sH5ux_Me4 z+}k+CTOL6X4o+B_bEP|l4zMvA63MSl13SvpluNmS&m`6u%9wh_1seiF<7#^5?1BpS zwdLHC$6-q(zIt8)w%k4U-F-@RUJ;m&m6f;ZkAbd0|yx+XUf|-_P6_M z;`mkN?SNd)r?H{v#v})hXATQ0r8QobTJ&P@m(7L}?z7lEChW*b$vQb^*qJ{H&n;|> zePF{hJwok=r|CWlXigv4rF1D@ZgD}6i?Z?}eBF<0s%ej)?z^zt~MVp_AY=;8uaR!si-=uVcF7{YluO z`g>qD-_obU4DYqTGHoQw=G>p>muz*$Yprc;|K4QTg=g4%dk;nJXxgdEx9Fex6<+n8f3cBeARm%W(L-Ia4_#Wxxp12`Q5O$t4x-jvC z1J!}twqHzVh-wp)l-(QXGl%5i%gg+jK|oSED&1?c0N%40dap%ZoMy51r{T z0e(8QwAu&I1(*v}I^#o-ZegFyh#7UwZFl)-uestW5&c=HYkJ`cZQzz{K7k*nnzkY0 zRWOm>@Lo5bRnN_8;fMAjiDqj0koPQk8MVUZ>d+Sa3#!mc!9YE}Q~_eS{ZaP_AgB_g z7B^?fnf<|n%lTV9XL)6fA>fCkrzbOxW_d`)n@J>6g*+e}S!FvBOJTzFWyR_dUwq@GU$h`P1#uM<4MOw%CTI46KOoR|#8I6(8v zCa+gZmRy*7;g7SgfwY-Wq?nUCr}}{@@00NT^XKFhTXp%?#f+%8COqy%&E92YKF;q! zKoHTPdGi|ix>ZjI8@9orSk_b+U`KdquMYY<4+Vxk$}~}vdC}2SHprL^-enQ@6M8pdt`7im~m$;ihFuj z;zEw4;28T`7^M2wT<3P=qGn-SRQ1NKn<2RtGqPxk2XSq7dnObBirmqG`*PCk^dx~z z1KM#U`7ccu4tt_($LUZ7J=gf6cYon{(vW$0Z zzdb{5^n0IX4t8D0Aztuo{CU_#+lr3&ZcO;#%{`|3I_)c#7%M zSU748N074c)q2CL1-~xfQAElQqJUGA<6u%Yd`Zg{D)j)EEP_}j?7bLb!HZy=6CT<; zYx*%q7y?jQ*rVrDNqkp`-3pl6kD!e0%^qNAfa2fpp1<`c?fTr&s1W)fh=(4!-3=hG zyVkVc69zQx)-dw~4vF5|xt3lX3^w}RLLOQE+g)lIw@GD+SoA^VGH=AVxHu#2%1X@c zs>LX*Yd1fFr#)^h&P|+N46|-z`_xw!6}n~nu6t9e^-H^bc<~dc3y6%#^1<`z13Myu zOzsK{SSd3}qN4k=(YGA{uiiikOukjx}5@O$gjE#OL#6``~u2AZf42VY?((d$Wg zs9`NnDv~z{9X$jdL;#^dn?T_ZtFz2S_wtGdq9eKNQU1Jc(* zq{aDr$}hG#^hwK_=j4su-{?iL>`bLyp@>H#M5R;c$B0TSfI+P#5P?@ zRse5h>zPt))Hg0H{N_Ot^0_#Fy-wgwK*j@tU0Hg&JL(ouTpYUazD1y*z+~23{~&K! z=MBd<(e%*oC4*smjiJU4$T;$FV6|b%)L93)K>I3n&EoH@xhL1bQlGji;*lpEyG6hLUfGP(}U-DsK)`ehsg)YIm^?A=yxRaWN7gM zb0h$C-=VAr63%zgM1u?atQ~nzAp$p)DbX5le2%cWIU6-h zhJNidBX}9#R54GYy=0LWx=_STfS|edF-#NSNf)krVm9-!K~<(RJG46EufrgOs=jG{ zrg(v!w(MEi=-~~zQkxzADrM@rJk#5J zW95pEFQ>U3fin z(RX)1_~B;1K_R}6<8dtQ0rq@zRWpvFP*k*W+S)7>`sbHXV%~s6-OrM=j$l$m!kq&_ z3tahbn)~Tl2N%v%3&8&c?Q$|^xIGBdMn_O|dst4OClJ;5TJ@FygT*$Nw}28s+oaim zr*5Q>3W)yc|E&_^jO0LAV>-52E?mJpcE9O7B}XHw>j$2f$h#1eoQv_V1srUewZpRq zL+zt<++&a`+E_PXII2?z*Z{)rgKIf7#uL4TW<`|W9k3gWH5qA~c2O1xb8|VU3eWm{ z-G(D~JX-2$gmFzo8}&DnO+Q!+UkMfDM~e`_duL>Z+mNT-gSVs&TTJX5M~>~A8aa4khP` z(&R)p^rhp$S$bLmH$dYIX2}+tl+#hs*VpYenDMjQVDK*7oTKD$d9*^Q zWoo`o7L5^589KqP(ttU->_H1P1nQ7$Bq2Jj=<_g`R1z6{;8Rbt?m28qGqvZn!zZvo zI!w0gJH`1YyzLL+0Otnep~DwUHR(fx;R$kiG}>_>f7~7$Jpqo2RaBppnBECTjJKxw zqXJB-5o%3OhK#E*_g(iH;#Y4PVq%8KToV|mxw`9v9t2ZIbko1XoXclv_0u+t;uk5b z67heHR5Cf8Mtvq~!uiBVYuYg6IaRFo;VC&}F@eTm@{-F{G)a+uuO3__eju3M0gC&K z@r`Xq2y)vwLoc4JTyb?_OK9p&D7F;+~Sycei(hQ#!jzCur{T z*QZI4y1orBL5W!Af>_7ig$&zR*?;dT(+&J&sGiJ$fmqlpWliHvJqE6bS z0=QQwN@M+(7=iJ(U6wX+zlYyBDh3=$h<=lH*_LEd!EXm+vZ+PQH@s7L`_ItEF&NxTy_mymfLK z`f^opb0lJr@Fv-&L;fR;Z{phl3X5OSsOfs;L%bYoiNzGX91t7Mc~kthBV`_QHh$Pw zMTFamk9y9P5R)!YYL41E$0!_EcJR|6qERp+J6$S%gXwxG4GVc86uxH~)UmB4w)0Mn z;o(M!L`gVuqh5IHe55hMv4S5EMJL11f{@GUBx-6Wn27sDN?oIHdEmOhkP~ysI=e$g zmZ(yZ4+Uq>N4nUG+Pq+t8(^>xLoKMt(}Qh|4lSYc9f^!#&Ivh1JjUicN(O3c6~u$U zn6IA9;F9F`0^fL(v6#%m1r}#XcNf5eVowaq{&fyU@Wzgc{8!oh8DE&@{-rN;TrVIeuS(rgNf2MmKtb?lABO9vBEKQqK$T!g%J}5`@?+FAfkhmy)7*AlC zKkT*xYwTa^u7?c1$MG|nVxo?+7kePfV$^rz!|4_(45Phh8wdaIZK-LhkApbbpJq@A zAvDu=Td{r>(oV^aj;z6Lrp;u@D3>TSJA+kJR+B(YM`M~>)Na0j30UHj^ zAxVo>Ym`Dv)|UF6my#eg(;RAbqhVty8v@e$;N>60qYCjX`ch8&jU$|#Fml-Jdv*=& z@M~g{C6VAXPfIU|>Dj^EV-HmvjH5KDvzHS$(9JshAf>Vi((;eGj(h+@(?psHCWjc~ z%p$M})abdHCA8)iR-uDpLh-g2r1nEtwqP}^_hE&?k|O%`QRluDY54mrfX7rbE{m)a z1sBr-JqUY$bdaG#88-YylUr>%e`IvrfLa|Omd@W@3dzptdSTA3B=Jm2-Gs+@U66dM zS~jEJ#4yx`!nm>r^PWC?=0`tpAr3oU5ilA~TZxjP6RJQ2uWyh)B4?qH^N=f^o#Xdw zHMf7q+qd*O(TQ{i?GCW-axp2%2R)*ndf@l 
zHD;w2RK1>n;-6_<3ein=Q?z6+FBZ0l1e4`Ks?i=>5LI&{ng3JgjcmtPt@-w-^SVJy z58RB%s-~sTC5#_Q3UCe9V*CAja?X08pYU{+>LWFw%!Y$-N4ZjbgDGb6ohvL9j$3{C zdCeu89a+ts|MWE7-6qCaIvt<*!xZG1K^&Hy$u2pr4^FQ_Q998bci{>g<+%_PL84|J zn{mo+LQ`j29cp9&vCLf_7V&+JWt}jzXsfz~`JL@~ODN&!;z)PEL6?eMF!#hbJSvjX z=UYE?ZwaIcVyl705xnWdQ8eQ~;y6DwtVkl410 zTJ3Ft3!<+B5wnq>iqtxwxRj6B+X7R^#F!xzKHdkf1**idONQJ6qn;zTVY55cT`fTV zqkEGl!CjbzAqYnT}a3c6V!J(S4>TKLqkg~5h>L?+;2ZF?{1QjUL8 zuwAyK%eg}Q&Xq0G$rqY*NcWYUir^xc;@;<*DyVna3TY@vWsWvvt}Z`%1mT%fJ-K%F zA3`3lOx&t&>9jv_r1eCDeYKqjXAxwq+X#(r5ONKhM2ksfcKLn&UW6El~15l z`I8%uqi}>>A>k577l!02aw(Tr_AnUS++H7XR=n_D!H>_%H#ubO+g0{eN*wVdfbFeb zv7R#}niC?Ax^sz#K^(Hz)EWi&$oGqBr0_s#e_ zKEchF1<5&d{2!Xl&Y$5u%{PKSj{`zN$wG%C>Qa$BL{xtz5{%d}?0Sn0PSF_n^amuN z_6y^L@KZC)PzR02CChP8k;QqG>Tz$-Z z?GsIN#Ttyk{u&j7e$-KylAS|i_e$p&t~Djut=o-##PK7-QOw&lqr3_+4F75({l@r*IoRWr_3#70YPhCkS;-gm%p_h#$V>>H5h?Urw56z~0iJq=$Loo4zj z?AVpjPh|}H%L7s3HG1gL8&3%^ks7zC<}o!99+C)R&($3-cN5mZogMLP=&6y3-r;r_p)di*Q780X zNvMr|4%v*HR6~s&`$X$OG+lTYbTV072gvUFU$&?kv7MDVjQA zBe9dr>sQ~pdzi`kI@%6qDEe=+ul8lnwFH4)_^zpC1r=wu9ssbFyG)q_TOqrF>ZC;& zt@EX_YbO%_25PoYmYdJXU=U4iR@uJdA~DcNMg@{b_F?X^Zp7?MPOIGfOL%7%rhU4w z^U)0~OGq?haNeV5x!B<-ox~MIKQRj^WYDMb!O0pNEMlI%$Qq!<;}M8nPZY!0>H8%K zRe|Gk4)rpn9xcwzwsJNX(H!98+FqFIc~JCqfu0I6ld9LjwiGX5;w_%tx&-KFIEPb< zd$YB{jB0Pe9Y+_td(7eqeeiHK?nar6q*&93cCDb%2-i~Q4lkEAc~8M;=^PgGe)ogH z4|g}sM0&Ix!>-|;6uEQZdfylosk7qxsuV=`Yx8EQolIWX@=i0B0adU}PoATnZPTwX zpH|nteE9>|hX->gKLoeAX^6CY$KCc;b4dgEKBpb(&c-WrZ=!}RlGmW#1wwppT$i4G z(VmWI^nWKpbCh|m-G6{7^xU#~Zww+luh);sQKL2j0vaFJnJ3)Wpu1vGHXo-VWJU2R zY*td z$hABzL~hi2WIwS5yZ!M%l4y@<`~zVCg{vkpbbyZ3Kb6SX*9@a0n4BzM_bK{K10bn-xE^CJtZZLiB0e!hN0)K^ z(S~?NCs}3#D|kHTQvCs|EaP$U*3Loq0WHQD@`#pug{(vYOpZmXeYXc5G`P{&=C74p zt3iHj`i~;`vgr`=r7+R*F40fumiFGQML~*quXzcqS+!B?k zMW^*kZ%b^1SF~-5wk|*w5q3oG1xqRJziHm^JQox6zz?(YA|fm~#UwPco4%Z!o?>9m zcc?>`mL5<}B2_8KV`STqPeUc}SCbxce$^m(O$I0S9_UB~8l{4uN^G zwrp=G*iHDpt_9*r0~F0)qLc?BD!Iu=#VL9mxi%U}P!2fLU7lnhEr;UEah6n8- zJ6_Xc_dV3V=WnNr>he>Y+(`Jn6YMY43&7VU!k`RP6^$T8Vfy8@^;B&hmM=K8;{Z|OK(?ydLabSI-Xs4AR7VH&Zqcp)~O5KE2adC~gCPSvt zc3r-$rEMkC_F*G6*#bmp=2F}S5oS%Cch*rf$6<34@B+uFOP5(qE=WymTx*OqB1~Z zE>X4z@BCuJ%69*UD0NT{oToWFdj87^d;CVJ6M zv$@a{+C55AhTMDn>K96S!WbfT1{=NR2Rh=FfD&{M1M9iBQ{f+=3 zTraI^%DcUzQ2e3oCK4r|@_a#V@W5^M(ozf4N0@k}d?R`qoY+n0=0Ho$L`8<`4 z#!nP;EJOkN>pQ%=!V)lNJ@?^;n}i5nYhLV+e?Z=1zeYh(vaVnboA#Z^o}S+cSRG5B z%DwE4qY+PK0FehgExQp;z%O|JU8-JCy;>P_-uW{@5%=0aR9cv3+Q4VZRt+)wBgFgn z9xUaODex17*3(hc-+<wlRD`xakjOZ z%YMP~UNQca-HUr&p}jb4WSWPs2H3(9ZR4H6qC-uYo6C{gsDB~}Fv6Fjiksuk6|r04 ziA`IJ(P=g((`W0%>wI`tNt<=#+w6r#gz`RJ>#f}Ue#<)B6Gb|MS2S5?8XRPLD7B~p zQ5gp+9}`)iTKR zG&X8UPQ=xumK%2@CC>D-f4nLi;WR=I@us+gP|DJH-*OG^X25JXiZ(G=fF(U&E|Z zDed)09tGlG`RpjY&Iy()=Gl(~!8mrr!_DybDzFR^pDHv)H(6%R>Kw^qnBtbBh!?6~<-`=9c2r14H-n*vGLL~Up*3QpwlzvtY$wpJEJI*l z%3pKsQwV4$)hxl*WTEy!3uPrbd`{|6eb-8-&e)cfP*nRf`F3MS1{AnlF*zRsq&@YG zErJ7(-1*?e13yGr$BI_^aqB9}c<}sGu>|Xzf@V zCq5z_B{<}e^sIk+5sk$lb2eieQIs48(vHzK#}}*zLw9`jJcv>&YJnbR;Cd5O4S)O@ z_ZWiU4w#OHZuRMF=?RBZ(s2~VaCc`2aFjin?B<%&hwH&}RP2``knVh*T@f8T3PR~u z%yqE>9z8DUmgu2X%ydybwwQ*_EBLlAWk(W^*~ec0j9US|hm*i*X;e=Db<6v&x)>ob zxijQ8dkob|+|rxykbaM1skj2S75CPBz;&Q<$SShBzAiRV5>_Y}q0JtK^X7x{U^;{> z)_W~P-k>ajjt^9r5~NPx3A_LoT$5_C6oF1!yepuV)MJ;I=`zD&#QkZmSuPSQU>OqM zssFDBQRSgAYf6%56tZ(+zCT}LL|y(2mLPM~bOy&^7c<~iRTk^hA481Zt++~cBq4E> z-1bb4rvP{MaT~C248TgK@Lz1wZ*}3SFNW-)p4ZTiQcfLr0ZuHYGMhOhEL;D6Eu<`v zOj3DR48|dEf=_bdDhgC4ahiN~b2SyPS~(R1_M)9U>yz**>rUyY&ufGy4<%I`pxpAT67Ed#a7<;#CC5_D>&+(4$ ztQL{Nkg>%-Q%KTSSO#YR4-K*!5MI7cEZv(?b$8UJ?-OwxkYNoCG;p!J!t<3JSx-iW zNRzM|GN9CZ1}G6Fj6b^#miQBVHG3(onU~0hfRHlm#~Wq#j2Zr4wL4p^qO9nmSUuUU 
zN#fa!OrcDbCLS$dVe7FD%~~jg|Exe0x^PcFUGUuM-*hCa@&ECajyd5o%foD!hFx$1 zA28+nZ>SKKv*N~kn2@&dnRaIdc=0fBC?g`)T%PK}w&wX8s_x5Mv@NR?10?qeljdAw zl6)OMq*z%e&$R!5a+T|fmrLh4)~+i^^HC>j#35uT+C?3}I1BX>mqn05C)0I8QX8br zaeq<99!fNxsS(2dsgFFwftUG^-hT&}IdJocy$BSlI;BwtWnPM$t`X4b5g`8~0hv5U z^ZBF;ztm*MW496z^GY|_JZ^qHp#Zr3TQzCQz`W`>fbNUr5^zh3G@s_(#6*s`tq{|B zq#_-0N68Q@E)tckyH%pN{-Pf(CBHFM7euae416>t?~`52m8AJv1{)WT$nI%~6I$Wk zb#3Hc$@w15KDa*YDMkqOs(p2HPX{!eI}CD|o_-f7VwYF$C-{P(O|9%Zc<}`V8$@dK z*|DJq#bZ)Cl}M?9oM`6{F6X%K2J2-&atHQT%(=p3Z#a`3VuJ`3g0Tezff~1In$8w_ zG5CHY_s-%Dbqmc7%}E3@@&>p{otIV%qTTDm2FwPN zUH(J`F*Em*-Y{yjdNrXv zU?YSr1u)`FDvM1sv3lhL7y)MByz|o&JS?QJu08oW1GbHr;^!Qz-2`^U3pm(JGhPHr zQnQgG9wQA2EBDu)PNU>adTQ~EnTws}(hsDZmmw5Mxygzs*6UgcJl}0_k;j-o&oU_f z(eXw?d*_Abf~}6-*Q|TDS(6xJhJt9up}yH#l}Vlf$%(t>5y7lo7PMcTx$Jj#1|$fn z8QI>WAVSDxX-LL+L^L;|J0gq$nX!!>^iOjF3`h?fVH|wkNOhyj2Ln%j20ZNY0R!K&W5<=aT0>8ug4f z?hZ0Q?bwv@c|;WLBtvbNlm``z*H9-|fjEo2K=syFyyK(dKtdWasMnw$p^5R1$sR9@ zM;>#FaSw*@SzOr?XFsV<#C}A<$!Sp7gDV4RP%P1wH9cE+BInFTG{&v+7 zpTgj6{#G#@@yw%$)(Q)Xm{7bw171ZV)l^()5%{((0!KeJ{1I+9^Y@d=ZAzSF+qlrl z!E1o&qMicZ6&_wOQH1{!SNq>6LJb{W(eX~yOGe+FhpV8GVP0r^uPMyTO+WITF2ji- zuYeQp0<MCqcPL#%QV(=45`*YU4sjP5?Mc*{ELl9mC^CV&4)-!p&PZ3HP)3r+g4PXr4wCve* z$;b`f=m)Jbc+mLev})4z1nc&!vtY{vhg-fz!{d)F%g3&BLg5{Mq0nEMoO8R7J!KW7 z3{VSkZ3&3M#7W{lWUAOTjAU8NTIt+&V8T9cL>xtN)rW&?-ClG(B(P+BF5KHS6 z0;8ZlhzY@`Vn!(X7w1Ov5HGrrt{jcV40zi*X>hb+!KyL{-LuBTVsrxsWBV+7qQ!0G z46<0=h!jM=`Sl~k3AORPfOA5?An=LQo1k`?+>A)%+!PjlD(%P37T=<*Pby5Am(FQz z>hWxABhVF5uhy&!loQ6G117&98?znRo{wS6PR)Bn+A48{Z!nRP+~C_KP_q=3T>hx<`Q?5aiHewIM*kN)5XIXRtM?Y zZ`#ysF)`USk~I%WQ(W1G!N0K$%Q|3Y&*+W=y{;W{-RitsU>4XkzF@=0DbrXId}?nG z&F8REa_0rghRr|tq)k4oSJ8wuu$rt=6E&j3@e*_Z#$;H5m)zKx)pdiRRQsgi7mT#UDmz$DMZ45f zk1a`w;rTEF^Ecg3c)cv0yLAwkF~K?(L^u6x%*bx*L(ihXR5RE20sfx#i4?Kf^jS8c(E6_IWF zXh{`Hdl$Rx(O^y7H@6lK-loKbVKYxwcs>r9_0sthyv69V^kO8KW1QNL690XC5VgHA zlW*&OGBWdO}1N<#4%1NR)j{IbM#MZ32fh$bHm1ihg>TaoNVw_VqX9-0g7LQ8}&#a zzcuG={HkJ-K$~5a6o~!2$&zUdK~5Gh9E{P!;PKf*7JOi4zeGc|6Xsse*p!2P<1@ZM z(FGhDJi9*C)fP&eWC5KmW}%eGbjWTNHc6gg-} z<;1wum%zE$q|G^j^^|;%9J$ni?p+p6YR=mT@bLjg5lDJSm;jbar?$!$ePOt3%Ur>-2FqrmHd1+y`8V;k;vZ~oHGCyGY@GvJaE$g7+ z{m6TYH}6-A?letGOHq`SZ^xD)($g$FPOz%LrZVjQPK?~U_XkJ%8%LwhEdt)EMTtoS z@kD4{X)7;8)A@7N?*T)u7~T0BzyAI-bd-U`kV2=Rf1o^uu1>mF1KEY~JJH?uZskG@9~we#$kEk! zf}v_?(UohORTjJ9QBfjAtAvy=@}p8WoKA&rOE#ECWIZ_&@w#dnsjHLG*iQD-wBUXd zq@PJHD^9^oOEmTovw3=p2W-hUIb)dWG6|0txxjngQV%st;BhMn{C7(cTYc{V2*g!k znBl{AvHX{uu75?}L1tHGn(xGm15*OQbBz&p3Fx8LX~k44bZ z+*I=`@XdcL-FUGxav_P0YQMEF6dzle)Q+OmPu=pvOL79HIC+w0jF&>vwMnbJ^M>Y* zY2%&q|LYG@rZPf(sKl+v7Bp4%a7Z8Rl@KxW6|;GQ|y~56nfM&pEvv9vBd^~ zZ&cBzyx}5f3?oYnwLA5K29GW2V*YjGz6Cbqp<|R(&2`(H8z+{sWtm7auTgbPEavLU z{>qHKNKND{*i9GTObW)pcE)f`G`oL{R50j^D$kHhGZXLb+r2FS4k>qc_~F5ohjn5$ zr<f-dh>5~zUKqaOrD&$TT=JV{xeV8tTX5t8*ZbVv0AH41;?cK)%=%uA&i z%2ok))dqQ~JIChJH8pZoP0Cl*0!24k2kZqPcJ4Lu*qX+WLHbtsQ6ezb5HyfgL2S;D zy|c6-A5Y?&UMEKeop4nvqYkgtk0kHKNmzcO62u!sDr?>~=#foro@-`Hh4Q0GBJY_d zAz8(I?5n#Y9D8l;dQ{qxK4>z%Lpkpy*9)Mx3QK>Y=6e8V3-7mI_Sfs)=7_f;oAtZ? 
zc)zgy-XBTUD~%gd0fo>b9$voaqrJ_gNgK}%i3$6p@kN11>ts5uR}3F*tuuI_a0$bO2WzMM3kBbWtl1mT*F}+EnM8HS=9!nu@gav`7f04 z#Cpl=P4%&Y0Pwg3>SkhN3*YUEp)dZjkm4kGS$c9Pak5&NrzwC(g5LSm7u?k@iA?K= zn4&Mcx7-B__Uko?<~6LC#tFk`&+j-_s;`z-^=QWI#YmqVM;ZRts8Fho#SZf~7aKq6 zww~qF*CxodA=4JW31^zt=ad3TfoH^XQ+9^`8hzEWPlco>?udS0YeC20v2#Ewo3Nfk z7qK-XRm`VUJ#$5=+Yx;hMwPsb8~hDqj8L>l*bCFMY4jXD9atV~T|MZPT5@$3R)~7` zNX}~m`@F=n?fT|rJM{G{p3wdcBnSO7`@oXR41dSiEp+lC9&}FPYt%4svVApd?FP9X z)QFK!2EBnGIPrHAQgiv?b)mlh5(7QDo;R(4xF%wG1{$YH!@_*p?Y5jWH*cJ%H_7x-WxiFU}VUi5b0=Vj$=5XdC*_ivsH}!DvFk zkX~Y4m2Zl%JR#h@>~DmQJlZhDaTqxJC*3khE(IqD*BID3nwzA-?4w{+jQMGEe?*5~ zukfyIXQW;w>wpitqzYr&N6$_qtrMuyk3^vx;k8uB5!A~Vo1v1FJ|3@PWKVjw-4q^d z>}^=pp#}Z1^^dU4NzIh3=d5({z?3@*D08n-6Mq}fk?&y3JYNp&yVaWnTzNJsO9ecN zGK8JxmzPU*8ga66wlv>@SQIh3KXVePROzW0Xb}q}(uZHvs_ns|CeQ`H!JDu-Los#d%c z!B@$O_~#e0BJ~~|^rtS>|{=X^GWXuwD9Y#*X5UN}3ZhO>KgT#UpP0DZk?KBDr)hg+o{mo{<;TaTJ$N(XvN=LV6aJ2W&Ty(MgO@t| zpo|)IW~3fZ5o*#$ndRDSu5?TIxI}*v#RF8Ru8$meSegn+YC<&=lFVrGW-ON1w0Ud< z5&idlGJ2HzZ%J=ksfPs5uSj9oR2K`uuHbcXwh7jfW7iRi<)9oQ>gi(o#rTfqq>@u{ z1yn{wJBlHGFOG{_YrM_Qn7@SfW0%DwXFE+r{2$MA1JQhJgk4EgF~&vz++~;SMqNfw zrER_Hr~~sma`b+z)H{!Tl&*IL-4KNp-#n2n)1#EvoY@D*Z5L?S*!(9mZ+N8D@s1ZN z_wgw>KA1z)&;!>fWXBEbI8NSt-S}xn1HVwl@gC&IcTAxb_cW8=&_S8V=c0)->_x+} zhHR)`La8w5-L<558nunH)E*~0)5}YTBVvClwA;N^?uN!#e$a^4erfcM$4ABky-ZPffm`q2)d_c4*|wlTku ze0^e5>KWvbCYxvv#VIako~h zU2Q^4lf&MSW<>8}&l`M|rK`1Lp)H#>9PR${J1=)itME$O|YV!r744Vhwl zh^i)A?IwMeoQIV~U@~Rrmrt=rti^$0J5~E05T)gPP5!WJwf^Qoy%wypH#&7u;&`9) z35P29{vw^bv>c>rV95jNI6=<-gk(-diL^6NF05(ABKGTsKc*B2CqB$24&zkk`PUNjftaVltkHUi zT54g|5HVBoq`{MD{|5k9K&Zdc8)r;oL@n8{;3$flo=d)Kg(+&IHs8iSiQvY0{(9O) z@nT8rAy~tXvENIdJQ~<`U35ufJ%yEOa?&58__~rOXhb=dU8Z2j zJ^6q%+unvejwtbm9_*x7-y2xxTy1W{>NGOyf59r$-DS%U8`I+Kha~EwPiwP#SeG3a zX=TSF+WdMV=bm_whK%^^jnE8IsWza4BVt_VvPKG+u%FCKy{K!qGG}=0Hnn`|W&Ooc z`EB8wC@(LE4P5e)+E=C%OJ}I#ViYys>LJ;RPSP3W1~GO&YZ#V8G7m3N)rX&Su=xbl zwpOtY$}8yQwO*?A_)a74#dNH^lC4)=LpIT?=;iWnWHvfW#~H?xzwBDFFc9Y&O2%>v zg|71Z%Nyuxt|oO?_R^_GBAiRx{?VwXOE3~fflE@Tub`D!4|VFjy_>>L&m!$UMb66Q z3&o)dC5)Lv4vXyRSBL?{PuotxA!Rgm&R)uW@r@t7ONvW8pvp#{a-wkk6t?;Faq^`C zI+jpFM*L*T3O_?F4LS5G{svh*m*T!!MpM|91hN$`8trv~7OM}?r>J7K{pmdV_Fjv+ zQ=KU+F_l)lK(_3bE9u|e#U2+uO?o}o$)&-EmW^_@M{hli^S(p_ZSy$P_M_FBnshxx zl4N^@xM;4LKDymt_up7T^7pQ>W5xDT;SMD_)V-O)jQ7*i2tf*#9VXG`y|nDqUF_2k zB`3LWY&ly&T1g+sz9fa;zpa$|9=nl=lni&%Ux~ZSw`B|Ld)PwBTDDnv7k&IuOuKS+ zQtkONeo52}ijKWZc^4~bQO02!o?*n^zBtI9jw>Tc<7#Rr1MbE>L^B6+F5jLR-a4Nxot~U0EeXe(FJV`>6)C#?}x| zBZI2LezFtWYl(e-iA?vkk!k95E^6CGGW1TM&4v#A@X>iI-C96J4io8XSKAVxSB55*HCwq zBhk}IT%vX|eK6~0t0t}>w`r&8wnrO@td!#-YbyCx!;O^neF3Teenz`3k5RdUEyWhO zvJ00MvR%*blhC7f`gn?CTMT>I4)Y|^OjE6L6JB(ruSMJE-Ik%uF#dS1Xfabq~kNfOky z!;!8GY~pK_%h9r@+4OvxHC2CJN*bG2(66WG=#SD3%G%OGLmL9vV+q}CWNQ%xE?P!~ z9nzfT{hxf#nqt z(=%`jcS@8gqKc7tJVo%YX;~C@@_lG?( zZxeOMO(B8)XJl>hlVIZ`*4I6jqNkP7E4y*rq=8Plm~fcCEVhRYQrpHR7Jp`)?f9hV zXirY#Qz=eYi4MFuPG`d#NH=abZTho?_OCfX>6b52_dhZ2uRBLFhXkm}YBL>ioJWSb z3S5^o{EDeXY{I0o{DhKn*8l4V)}f_{ei)qP zzkPLuysu@Gyp}1~r? 
[GIT binary patch data omitted: the remainder of the preceding binary blob, followed by the binary contents of the new serialized test archives added in this span of the patch, each created as a new file (mode 100644) under caffe2/python/serialized_test/data/operator_test/:]

concat_split_op_test.test_concat.zip
concat_split_op_test.test_split.zip
conditional_test.test_conditional.zip
conv_test.test_1x1_conv.zip
cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip
ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip
ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip
distance_op_test.test_dot_product.zip
dropout_op_test.test_dropout_is_test.zip
elementwise_linear_op_test.test.zip
elementwise_logical_ops_test.test_is_member_of.zip
elementwise_op_broadcast_test.test_broadcast_powt.zip
expand_op_test.test_expand_rand_shape.zip
fc_operator_test.test_fc.zip
flexible_top_k_test.test_flexible_top_k.zip
gather_ops_test.test_batch_gather_ops.zip
gather_ops_test.test_gather_ops.zip
gather_ranges_op_test.test_gather_ranges.zip
gather_ranges_op_test.test_gather_ranges_split.zip
glu_op_test.test_glu_old.zip
hyperbolic_ops_test.test_sinh.zip
hyperbolic_ops_test.test_tanh.zip
index_hash_ops_test.test_index_hash_ops.zip
instance_norm_test.test_instance_norm_reference_check.zip
integral_image_ops_test.test_integral_image_gradient_ops.zip
integral_image_ops_test.test_integral_image_ops.zip
jsd_ops_test.test_bernoulli_jsd.zip
learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip
learning_rate_op_test.test_alter_learning_rate_op.zip
length_split_op_test.test_length_split_edge.zip
lengths_pad_op_test.test_lengths_pad.zip
lengths_top_k_ops_test.test_lengths_top_k_op.zip
matmul_op_test.test_batch_matmul.zip
matmul_op_test.test_numpy_batch_matmul.zip
mean_op_test.test_mean.zip
merge_id_lists_op_test.test_merge_id_lists_op.zip
moments_op_test.test_moments.zip
momentum_sgd_test.test_momentum_sgd.zip
momentum_sgd_test.test_sparse_momentum_sgd.zip
negate_gradient_op_test.test_forward.zip
one_hot_ops_test.test_batch_bucketized_one_hot.zip
one_hot_ops_test.test_batch_one_hot.zip
one_hot_ops_test.test_one_hot.zip
one_hot_ops_test.test_segment_one_hot.zip

diff --git
a/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip new file mode 100644 index 0000000000000000000000000000000000000000..5b29d053454923273b4282559736cd4e16810e79 GIT binary patch literal 1472 zcmb_cZAep57(RFN&h5-hZYw_`&|0O7%!rr_A~$QGE4zqLh_tTTT|0HVTf29b_M@p7 z67{1gk!HE36%j3^4~x&gB!`NJc&pLJtZ^ zvJ;6&hI)x76Uu!9K6kS%PBE(> zt$lj9x%Tkco04oE)91G z1^H+4wF8r_cH{w{J!78&q0cWP2_7VtygLwT3mXkv$~liQ-}ZcZ%(Hzm%Pv2kDjY4l z-*)$UPsP2D)3Qphu591ds~cgL9iNWhSX5w?DQ93W(z$6QRP-_VmIL*#C=+bnzLxfH`W>v`6I3Qi`0ExwYQ1L{9rP+{Qq-)?(;{VW*yMI^OEs= z%|iL?Uo{J6&$8=k70EN{aXqm5xpDcV!kY6=<_{~(zn!%W-hKUFamK~`8=Y$bB1f3Z*3o#`g3UqyPGe&w$+(2#v(8@F`TL5|?lPaPD}IhSW14rFuZ zkvQ16>|jXyncUlU{|{{bv0>hqd3l$n2~PS@9lATp^|Zj{$IYp`HhaupvyAa_pCf~V zqgK`ewXB6^uNLSn@~G0*y}Rn^go*8|(!X4FzW!|wqlj(H<@I0dHR>vV9)2zLUi!ex zwB6Acwkg?eJG?Z#mI*Z^tu~sbxdKD51Bm&7I3=|#GdVTBq_Q9t6r8i)h`*8+IC&!A zSistOlcMItEXiBCAa8Q!!gY%m%$pV;5;SFo`26@OlemP+z0YX+v37~4tB5UO`YAQb zifb#E2%{dG1_J}Cjn5V`zTpAJ9Ro0mxq%cgf?<(|)wa0D>oF`VL$$2U3T)jJhIMSq zp6t1Fr^##YlRt|>_TITPXVRG?ixwU6TK8kol_!7pJesuV(i5R|PHr7q>-LCr$$)ln;4(OYBk{#Ns5bL9ZW)otd*8p*x$=g+@hufQOD z@0+UU+>Na{l0A|flW#bx6;>8Am!>lCdKfoK9bhy&z;=~^Z2_aAwS;8Df`juKD;_W$ zJ)%+Skl?i7AfxDGW^Ne+L%o6njMm54lQ=Mu!HtQjuf>o@!$PQCvf< zfq}-zB*F~JW!RHEPy!4Z8bMTM9xM#e^`S>FL>B`?L!%ck640U>T{C)IAvEs+YDS8= z0B=?{kR&q@t^?8^ff*AK@sM(2$M?c$bD$^)gYtHIQDREG0ko`OD~Sj3h1gQSG_cg* z3(YIQQ*L-LA{>jo90FPa21~#p1TK}(wV)R_2rWK9CQ{)8iwo>NL1@!pLGl-JpezKc KX#+|!FaQ7_@~)`> literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip new file mode 100644 index 0000000000000000000000000000000000000000..8438af3a9f7867460b8edd67ea6a8850a0f31cec GIT binary patch literal 1218 zcmWIWW@Zs#00F&8y1uQ;rtQ)MvO$;?i1Q2d3X-_k$`W%*Q;WISb5irtOEQXunDa|Z zq!^U=0}_+7gZ%u0QwvK|^O92oP|T`bxAc$|GXnz%a{|rE%meDu%PXjYDdPZJX_PxX z<{?nJiIIVU3#_c5w4@lQu5$Jp@mJCUCr<<%3s^gEQq-K7C3#C1#64cWZt;S7)8a#d zrpyqZA3tRhmr$Aa8BIUdF7b2~u_a7DrQED$S#fRUn!>P-ZP}AOm+mxq?S1lRQOMpq zm*z}5b7aw?BVOx%EV}aK&z?t<7F~KGw9d(`Lu=h0k&Yb2$>z@48p%P`0g9X3%om#n zYlk>`d8RsQg)4e%Oy1vW-fXTMz__~2yjUZ-cmDkO*XtD+gztS*^_;u0HAhOaN0MXl z4M#Ud)2*otUzg8x_*&fX_4a~S=?t&uGH;@L#52az>My;?C z$q)28Ba;X-D5+ylcR&d+XlMjcnR&2~K-Y&JO%PoS3=NH(z+gm+G<41AA&$^o4b+Sj z{sG>sY#>QyAY2Ee%a}ksSb~9M)F1uUBS@20@Tdw!H94y_QC*Y1sE&=hY+}EK-Yqv=Mh?bfJ~$k02UY6eS*-o7Fc+| P{e>JT3xR65SwIQ_`k8oP literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip new file mode 100644 index 0000000000000000000000000000000000000000..953e275453a5a90638984ed0f5e756fa265bae79 GIT binary patch literal 1218 zcmWIWW@Zs#00F&8y1u%fE??IMvO$;?i1Q2d3X-_k$`W%*Q;WISb5irtOEQXunDa|Z zq!^S0L-Ps}le2^T{DM;pOH=cbQv*;;>oj{h(~6ma0fae$re)><_37mmRKb*SfGxed zNA2SypmYl(0|OUWSwU$@F;HFQ>^I`Cqy~9nnRMpJqD4o%*8Ny?<;kBtk0vd;^h9W#lUs+@x;-KtIf|3bowGHPgQ^1*H@BHD zHV@VgarE*`b<_%1^wyZXztz0iTseSob(?vyMsn}``SY*WD=-M(`=;tScVlaglw^-2 z$Ko4~ZiWq8QyF|cj2mYiU^F|(c9n%~0p~&orlTx=8#vc0+?{D~H|9sl@@XnB85vMR z!fK;wnkz5_I)IoTh*MI_GLuu|ODYReaRx`;XWBj@$m0|1hnGRcv8@AqFkd@Am#m_3# zsp?q;^f@Dw2s0?LV^4QL2{34A1W}oJuz*0F<-mGjONoF8i2c*lGKs;D_fn?WQhvW&SKv56|Ww7+3#FTggP?qHYniLP>A~G#E zBo~84fti;-05$7+Fe03ay)Xb;00v9I;R7xj(6ylFc!U-|AQLJ7!(sxvKM>m11B(v0 
PuaJXeAy5rB3rHaVcaL?K literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip new file mode 100644 index 0000000000000000000000000000000000000000..945f89561e9d77d86e89e00b1ae849f27e97ef0a GIT binary patch literal 3245 zcmWIWW@Zs#00Fm2y1piLkM7z5*&xgc#Q6n!1xZ|t5kicSQVdEQ0f{M|xryniTAW<0 z1&Jx~NfHb&W)Xs!gJ70OFa)4jpr7VG`zsd%0|;{hEy&EvFD=o_E2x4g;{dzksc7N< z?JNuo-atnHRRWb2l$I0&)m1J%lNoS@Rf6rq-(ro;-O7?I6>GZNxOX)*F}?bCzdfL_ zbh7q|=H&E7(SXQrQ@r^!!;MlWD4&rDG%<3`d9op)?a~#FzyOh67FO2}4p)Ml1aDoL zx$OL2`Fk_JSI++_%GR=$C+cv4S>dJDja<1sZaX3r?#O6}aJIajWMb7Dqv?I=h}OT( z+NaW6xsuoQPG2o@>~`k96Ok$>1pdr_m^S-}#@EhOA=785^d3?T+#fhy;d;iCl`%8j z1Sc=K^2+3wglzcPJ(-15H|Fol64p%5mJr^n>yrCQC3VS}Yuj5RVpn+`44vINlcmbj zGU!-@LHVRvZHBX%nV7t+twX^^zg3Hm(t}2zrN6YwEC{#3==iIJ#)NrGA-xku3cL2UjGK4 zDZBj-f91XNnca*|9laXJwk;{$y!Q7;IjQ=y@=X0V4e~3r;=)b6*4sE;*jX!PYbC*y zogIEBd0D`+A1uDT50&Sax85sVw_H|SCSh7e>(TEu5mWcFvQ>$MT#z^6iA44NJrCUf8#5z8LGd z#`lvs18VnA;9Gmw|MRq{hgpYZx4lf|Hoc>>gp++)z(r?L+A*>Q4T&8yzu z#X=V^zvy1Kw|C346Rx+)7R^)YBf<}&m*4q6(rSEt?YT;vgWbbi7h(H$9qFfe(K0S z*V6Jl=IQp&#{P;8k9({kzwSfD9Z#>G+_m#w;@R!rFD<(ttH2T0GO1hchpP4T-0E+N z^DL%pIv4%vM1dCbW!1APTT%kT0(?GK`v?8q!1B1<@{oBnU)ok<(J%9t9sC&|>=v$l zw&dUBw%}!twyovZ(~|P!$;<^NN7<&Uxioj4ZMbkwn$?o?k@Z$55ue}PBC=XZ)66f2 ztyS-ezN46UxghUo?5i(5t!5iLvo_DWeD~ATify+=+Cp^AO5_({{3by%^rv@#YVncP+YBF7&rLcVUN~{~>$cPPlpgK>(KgkLMR}P6 zqJG~E-0|6ZS6pu6f`txe-b$-3Rb)DM{&t3t{>&NA{;K>>;j4?vXn&BySh0M)zG3g_ zg|jV{B^B54XLR7#CM&UP)?Ea%w?IvF(%y^=3|wGk1*Ij$Ky{U~--y4G7C3n#;8?)gd6S~% z#4O2Mx*+cH`gMyJ%$pV;5;SFo`26@OlemP+yw7O*v37~4tB5UO`YF|BHOq=?E7uf; zb!^L??74KO$!qVEKZ`>4-nleq(wQTR79H_g_hZqOCx7-lnzZQB6QOlZZXH_d_K0-k zC{8wa&eli{st!=x+-APmJXt%$(aST{Q7c^0TVwM6R`X_aGON@vL8e|n?rS^wSnu0PFG zJv&uBs~Av&z-psunk&%j9YD+v#3`v|naQc~C6xuKIKv@ta^^yefG9@|2yifnFzT^s zFreAUtHs^~3v_Ovjlh^0hM^vC>g)ab^3}ipOcY2m_Zo^d(s0+fI&ke zh|0`^1qr%7^!S44Vqj=!WCkW;wAe$}j2`p|&80xiND&d>&B_LnWCp@@K)R9%#6uW| tJ=25KfWQ*4AHYQbx)$_|3DE%3(hp=JWmAa#Obkp6SwM-;z*lo{glBRQx#Kyh=M`C{{6?GQ&V&s0aPa7Ay8 z$@^Q)o6VI27+1HM7i%Q<&YwU3dc6XJ@V#%Uo^v<0=158QNOCN`;poPg7|j+?KZ`TO zo;_ke@2Wbs1^)uX-c`vhaMVaEXgZs5_5p*JzOXU7*Yas9FB##11_=$Tjiza?z)dN0`ht{cRZdCE) z2l}6pNrV}c&atOJpad8+G=iwiJXmm`>qCz#h%N?(hDJVMSfa%ox@PoXM`&&UYDS8L z0B=?{kR&q@t^?8yOduXC4?%LO-J&bSCO}aT24%JMqQsPV15j>di;M?x;JKDP!T>A) z%)7!!dDk7NCNnj!#EQX#5#dto#Q@NJFjxW(9&n+6t_3~oBed89nMefzEF!Rb1EFmd Tund8F3OPg;0@bjxfD{4%0N!pk literal 0 HcmV?d00001 diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb deleted file mode 100644 index ba59745bd14a7ba83499096655b8daf060377400..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 67 zcmd;Li;Pb%N=)HmiZI|}DmM^fk1zlWNHHi0ho@$yXOyI-1efNz1Jz`v=9Osia&dyx Q$LHt8mrF1hF?cWn0Bc?nyZ`_I diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz deleted file mode 100644 index 3f35572017ab82066d16477130bd4512b1aaca86..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 390 zcmWIWW@Zs#U|`^2(3&CU%Og9rTY!;)p&N*~fTEds1*Ij$dU*wvv)_onk`_35BH&oS z+If?r=EN+?Te=|b@%nX(7tEU$9}+ZWhWPyWDU-N_%Dm5L`muJ2r>lr9VfrcMW;M%- zYb)0jhIMSqp6t1Fr^##YlRt|>_TITPXVRG?ixwU6TK8kol_!7pJesuV(i5R|PHr7q z>-LCr$$)ln;4(OYBk{#Ns5bL9ZW)otd*8p*x$ 
z=g+@hufQOD@0+UU+>Na{Qj$HA9E)!_x^1j7-kQqrb@@z(uf+{tZ!dV2&hU!=>Wv+r zT4Oh!bFsemHS^oj2JLz1-yC4a`W(w zSqCplo?2vYDf(jCl}ReL3<2JZOd`y1RM)k zJ8x3doR}qfOBdu#PFu8Y@q&5N;zNR_%n+X+KV=e^P`URRO+VHy@pKijB}_l1+N`*i zS#i0|vf|px^@MTVzop`Dba-dI;d%G_W_}>g=IPreY Date: Tue, 18 Sep 2018 10:47:21 -0700 Subject: [PATCH 225/237] improve aten/convolution error message (#11768) Summary: fixes https://github.com/pytorch/pytorch/issues/11762 Pull Request resolved: https://github.com/pytorch/pytorch/pull/11768 Differential Revision: D9884185 Pulled By: soumith fbshipit-source-id: 2a0c3e1f5a4fb4833ae6e9fc791abcf45f7fbea2 --- aten/src/ATen/native/Convolution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index b3fcff1b2ccd29..96ddb5ae3928b1 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -157,7 +157,7 @@ static void check_input_shape_forward(const at::Tensor& input, AT_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, - "-dimensional weight ", weight.sizes(), ", but got input of size ", + "-dimensional weight ", weight.sizes(), ", but got ", k, "-dimensional input of size ", input.sizes(), " instead"); AT_CHECK(weight.size(0) >= groups, "Given groups=", groups, ", expected weight to be at least ", groups, From 540ef9b1fc5506369a48491af8a285a686689b36 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Tue, 18 Sep 2018 10:47:27 -0700 Subject: [PATCH 226/237] Add distributed get_backend (#11715) Summary: I have no idea how to run distributed tests locally so I'll let CI do this. Hopefully everything still works with `IntEnum`. cc mcarilli Pull Request resolved: https://github.com/pytorch/pytorch/pull/11715 Reviewed By: pietern Differential Revision: D9889646 Pulled By: SsnL fbshipit-source-id: 1e2a487cb6fe0bd4cc67501c9d72a295c35693e2 --- test/test_distributed.py | 30 +++++++++ torch/distributed/__init__.py | 4 ++ torch/distributed/distributed_c10d.py | 88 ++++++++++++++++++++++----- 3 files changed, 107 insertions(+), 15 deletions(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index d311f74e3fe0f1..cc135c2ebec7a3 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -234,6 +234,36 @@ def test_get_default_group(self): default_grp = dist.get_default_group() self.assertNotEqual(default_grp, None) + def test_get_backend(self): + if dist.get_world_size() > 2: + group = [1, 2] + else: + group = [0, 1] + group_id = dist.new_group(group) + backend_str = BACKEND.lower() + self.assertEqual(dist.get_backend(), backend_str) + if dist.get_rank() in group: + self.assertEqual(dist.get_backend(group_id), backend_str) + else: + with self.assertRaisesRegex(RuntimeError, "Invalid process group specified"): + dist.get_backend(group_id) + + def test_DistBackend(self): + # test parsing + backend = BACKEND.lower() + self.assertEqual(dist.DistBackend(BACKEND.upper()), backend) + self.assertEqual(dist.DistBackend(BACKEND), backend) + with self.assertRaisesRegex(ValueError, "Invalid backend: 'undefined'"): + dist.DistBackend("undefined") + with self.assertRaisesRegex(ValueError, "Invalid backend: 'xYz'"): + dist.DistBackend("xYz") + with self.assertRaises(ValueError): + dist.DistBackend(None) + with self.assertRaises(ValueError): + dist.DistBackend(3) + with self.assertRaises(ValueError): + dist.DistBackend(["gloo"]) + # Test destroy def test_destroy_group(self): if dist.get_world_size() > 2: diff --git 
a/torch/distributed/__init__.py b/torch/distributed/__init__.py index 5356097743aa3c..2a7b004d527093 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -11,3 +11,7 @@ def is_available(): if is_available(): from .distributed_c10d import * + # Variables prefixed with underscore are not auto imported + # See the comment in `distributed_c10d.py` above `_backend` on why we expose + # this. + from .distributed_c10d import _backend diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 651f2ea6f2f870..0568e4261f4480 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,4 +1,5 @@ import torch +from torch._six import string_classes from .rendezvous import rendezvous, register_rendezvous_handler from . import BroadcastOptions, AllreduceOptions, ReduceOptions, \ @@ -23,11 +24,40 @@ _NCCL_AVAILBLE = False -class DistBackend: - UNDEFINED = -1 - GLOO = 0 - NCCL = 2 - MPI = 3 +class DistBackend(object): + """ + An enum-like class of available backends: GLOO, NCCL, and MPI. + + The values of this class are lowercase strings, e.g., ``"gloo"``. They can + be accessed as attributes, e.g., ``DistBackend.NCCL``. + + This class can be directly called to parse the string, e.g., + ``DistBackend(backend_str)`` will check if ``backend_str`` is valid, and + return the parsed lowercase string if so. It also accepts uppercase strings, + e.g., ``DistBackend("GLOO")`` returns ``"gloo"``. + + .. note:: The entry ``DistBackend.UNDEFINED`` is present but only used as + initial value of some fields. Users should neither use it directly + nor assume its existence. + """ + UNDEFINED = "undefined" + GLOO = "gloo" + NCCL = "nccl" + MPI = "mpi" + + def __new__(cls, name): + if not isinstance(name, string_classes): + raise ValueError("Backend name must be a string, but got: {}".format(name)) + value = getattr(DistBackend, name.upper(), DistBackend.UNDEFINED) + if value == DistBackend.UNDEFINED: + raise ValueError("Invalid backend: '{}'".format(name)) + return value + +# The following two values are here to maintain backward compatibility with +# pre-c10d distributed package. +# TODO: remove them when users are ready to take a hard dependency on PyTorch 1. +_backend = DistBackend.UNDEFINED +dist_backend = DistBackend class group(object): @@ -166,6 +196,30 @@ def get_default_group(): return _default_pg +def get_backend(group=group.WORLD): + """ + Returns the backend of the given process group. + + Arguments: + group (ProcessGroup, optional): The process group to work on. The + default is the general main process group. If another specific group + is specified, the calling process must be part of :attr:`group`. + + Returns: + The backend of the given process group as a lower case string. + + """ + _check_default_pg() + + if group == GroupMember.WORLD: + pg = _default_pg + else: + pg = group + if _rank_not_in_group(pg): + raise RuntimeError("Invalid process group specified") + return _pg_map.get(pg, None)[0] + + def init_process_group(backend, init_method="env://", **kwargs): @@ -174,9 +228,11 @@ def init_process_group(backend, initialize the distributed package Arguments: - backend (str): Name of the backend to use. Depending on build-time - configuration valid values include: - ``mpi`` and ``gloo``. + backend (str or DistBackend): The backend to use. Depending on + build-time configurations, valid values include ``mpi``, ``gloo``, + and ``nccl``. 
This field should be given as a lowercase string + (e.g., ``"gloo"``), which can also be accessed via + :class:`DistBackend` attributes (e.g., ``DistBackend.GLOO``). init_method (str, optional): URL specifying how to initialize the process group. world_size (int, optional): Number of processes participating in @@ -184,12 +240,13 @@ def init_process_group(backend, rank (int, optional): Rank of the current process. group_name (str, optional, deprecated): Group name. - To enable ``backend == mpi``, PyTorch needs to built from source on - a system that supports MPI. The same applies to NCCL as well. + To enable ``backend == DistBackend.MPI``, PyTorch needs to built from source + on a system that supports MPI. The same applies to NCCL as well. """ global _pg_map global _pg_names + global _backend global _default_pg global _default_pg_init_method @@ -203,7 +260,9 @@ def init_process_group(backend, assert len(kwargs) == 0, \ "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) - if backend == "mpi": + backend = DistBackend(backend) + + if backend == DistBackend.MPI: if not is_mpi_available(): raise RuntimeError("Distributed package doesn't have MPI built in") @@ -220,20 +279,19 @@ def init_process_group(backend, else: store, rank, world_size = next(rendezvous(init_method)) - if backend == "gloo": + if backend == DistBackend.GLOO: _default_pg = ProcessGroupGloo(store, rank, world_size) _pg_map[_default_pg] = (DistBackend.GLOO, store) _pg_names[_default_pg] = group_name - elif backend == "nccl": + elif backend == DistBackend.NCCL: if not is_nccl_available(): raise RuntimeError("Distributed package doesn't have NCCL " "built in") _default_pg = ProcessGroupNCCL(store, rank, world_size) _pg_map[_default_pg] = (DistBackend.NCCL, store) _pg_names[_default_pg] = group_name - else: - raise RuntimeError("Invalid distributed backend name: " + backend) + _backend = _pg_map[_default_pg][0] _default_pg_init_method = init_method From 47956ddf7e9632bcdb47df2e12b88a933454534d Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 18 Sep 2018 11:15:29 -0700 Subject: [PATCH 227/237] Revert D9755189: [pytorch][PR] [API CHANGE] Add empty tensor tests to test_sparse Differential Revision: D9755189 Original commit changeset: e9d36f437db1 fbshipit-source-id: 8b99edf626418a953a8bd786847a6e0174a3a14d --- aten/src/ATen/native/sparse/SparseTensor.cpp | 22 +- .../ATen/native/sparse/SparseTensorMath.cpp | 18 +- .../native/sparse/cuda/SparseCUDATensor.cpp | 3 - .../native/sparse/cuda/SparseCUDATensor.cu | 10 +- .../sparse/cuda/SparseCUDATensorMath.cu | 66 +- ...arse.test_add_dense_sparse_mismatch.expect | 1 + .../TestCudaSparse.test_log1p-backward.expect | 1 + ...stCudaSparse.test_log1p-uncoalesced.expect | 1 + ...arse.test_add_dense_sparse_mismatch.expect | 1 + ...coalescedSparse.test_log1p-backward.expect | 1 + ...lescedSparse.test_log1p-uncoalesced.expect | 1 + ...arse.test_add_dense_sparse_mismatch.expect | 1 + .../TestSparse.test_log1p-backward.expect | 1 + .../TestSparse.test_log1p-uncoalesced.expect | 1 + ...TestSparseOneOff.test_cuda_from_cpu.expect | 1 + ...eOff.test_cuda_sparse_cpu_dense_add.expect | 1 + ...arse.test_add_dense_sparse_mismatch.expect | 1 + ...coalescedSparse.test_log1p-backward.expect | 1 + ...lescedSparse.test_log1p-uncoalesced.expect | 1 + test/test_sparse.py | 1125 ++++++----------- 20 files changed, 478 insertions(+), 780 deletions(-) create mode 100644 test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestCudaSparse.test_log1p-backward.expect 
create mode 100644 test/expect/TestCudaSparse.test_log1p-uncoalesced.expect create mode 100644 test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect create mode 100644 test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect create mode 100644 test/expect/TestSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestSparse.test_log1p-backward.expect create mode 100644 test/expect/TestSparse.test_log1p-uncoalesced.expect create mode 100644 test/expect/TestSparseOneOff.test_cuda_from_cpu.expect create mode 100644 test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect create mode 100644 test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect create mode 100644 test/expect/TestUncoalescedSparse.test_log1p-backward.expect create mode 100644 test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 49efed2a1e066b..25fd4fc5df4326 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -257,16 +257,12 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { AT_ASSERT(!self.is_variable()); AT_ASSERT(self.is_sparse()); + if (self._nnz() < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } if (self.is_coalesced()) { return self; } - // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, - // we should keep the original tensor intact and do coalesce on a copy of the tensor - if (self._nnz() < 2) { - SparseTensor dst = self.clone(); - _get_sparse_impl(dst)->set_coalesced(true); - return dst; - } LongTensor indices = self._indices(); Tensor values = self._values().contiguous(); @@ -310,17 +306,13 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { int64_t pos = indicesPermutationAccessor[j]; int64_t curr = indicesBufferAccessor[j]; if (curr == prev) { - if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy - THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); - } + THBlas_axpy(blockSize, 1, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); } else { ++i; for (int64_t d = 0; d < sparseDims; d++) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } - if (values.numel() > 0) { // if values is an empty tensor, there are no elements to copy - THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); - } + THBlas_copy(blockSize, values_ptr + pos * blockSize, 1, newValues_ptr + i * blockSize, 1); } prev = curr; } @@ -353,10 +345,6 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); _get_sparse_impl(r)->set_nnz_and_narrow(r_nnz); - if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements - return r; - } - // NB: Relies on mask._nnz() == 0 test above auto mask_indices_accessor = mask_indices.accessor(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index ec074b5a6c8a88..afd8001734a9a8 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -250,22 +250,18 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const 
SparseTensor& t, const S for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = t_indices_accessor[d][t_i]; } - if (t_values.numel() > 0) { // We add all elements from t_values to r_values only if t_values is not an empty tensor - THBlas_axpy(blockSize, 1, - t_values_ptr + t_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); - } + THBlas_axpy(blockSize, 1, + t_values_ptr + t_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); t_i++; } if (cmp <= 0) { for (d = 0; d < sparseDims; d++) { r_indices_accessor[d][r_i] = src_indices_accessor[d][s_i]; } - if (s_values.numel() > 0) { // We add all elements from s_values to r_values only if s_values is not an empty tensor - THBlas_axpy(blockSize, cast_value, - s_values_ptr + s_i * blockSize, 1, - r_values_ptr + r_i * blockSize, 1); - } + THBlas_axpy(blockSize, cast_value, + s_values_ptr + s_i * blockSize, 1, + r_values_ptr + r_i * blockSize, 1); s_i++; } r_i++; @@ -372,7 +368,6 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { - r.resize_as_(src_); return r.zero_(); } @@ -524,6 +519,7 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(sparse_._sparseDims() == 2, "addmm: matrices expected, got ", sparse_._sparseDims(), "D tensor"); AT_CHECK(sparse_._denseDims() == 0, "addmm: scalar values expected, got ", sparse_._denseDims(), "D values"); + AT_CHECK(dense.numel() != 0, "addmm: matrices expected, got empty tensor"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 107a30f51c2a97..fc53c41b56f727 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -25,9 +25,6 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); - if (t.numel() == 0) { // if t is an empty tensor, there is no need to mask its elements - return r; - } LongTensor indices = at::zeros({mask._nnz()}, mask_indices.options()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index ef2da61460eb99..ff4b0e0c57736c 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -27,16 +27,12 @@ namespace at { namespace native { SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { #ifndef __HIP_PLATFORM_HCC__ int64_t nnz = self._nnz(); + if (nnz < 2) { + _get_sparse_impl(self)->set_coalesced(true); + } if (self.is_coalesced()) { return self; } - // NOTE: Since `coalesce` is not an in-place operation when `is_coalesced` is false, - // we should keep the original tensor intact and do coalesce on a copy of the tensor - if (nnz < 2) { - SparseTensor dst = self.clone(); - _get_sparse_impl(dst)->set_coalesced(true); - return dst; - } cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu 
b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 712c28817bf6b1..d7701479e9b243 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -106,38 +106,37 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT r__.transpose_(0, 1); } - if (nnz > 0) { - /* dense */ - Tensor dense_; - char transpose_dense; - if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { - transpose_dense = 'n'; - dense_ = dense; - } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { - transpose_dense = 't'; - dense_ = dense; - } else { - transpose_dense = 't'; - dense_ = dense.contiguous(); - } - - sparse::cuda::csrmm2( - 'n', - transpose_dense, - m, - n, - k, - nnz, - cast_alpha, - values.data(), - csr.data(), - colIndicesInt.data(), - dense_.data(), - (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), - cast_beta, - r__.data(), - r__.stride(1)); + /* dense */ + Tensor dense_; + char transpose_dense; + if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) { + transpose_dense = 'n'; + dense_ = dense; + } else if(dense.stride(1) == 1 && dense.stride(0) != dense.size(1)) { + transpose_dense = 't'; + dense_ = dense; + } else { + transpose_dense = 't'; + dense_ = dense.contiguous(); } + + sparse::cuda::csrmm2( + 'n', + transpose_dense, + m, + n, + k, + nnz, + cast_alpha, + values.data(), + csr.data(), + colIndicesInt.data(), + dense_.data(), + (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)), + cast_beta, + r__.data(), + r__.stride(1)); + }); r_.copy_(r__); @@ -282,10 +281,6 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR int64_t nDim = dense.dim(); int64_t nDimI = sparse._sparseDims(); - if (sparse._values().numel() == 0) { - return r_; - } - if (sparse.is_coalesced()) { // TODO benchmark to decide whether to remove this special case const dim3 block = cuda::getApplyBlock(); @@ -428,7 +423,6 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons SparseTensor src = src_.coalesce(); if (src_._nnz() == 0 || t_._nnz() == 0) { - r_.resize_as_(src_); return r_.zero_(); } diff --git a/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestCudaSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-backward.expect b/test/expect/TestCudaSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestCudaSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. 
\ No newline at end of file diff --git a/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestCudaSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestCudaUncoalescedSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestCudaUncoalescedSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestCudaUncoalescedSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-backward.expect b/test/expect/TestSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestSparse.test_log1p-uncoalesced.expect b/test/expect/TestSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! 
\ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect new file mode 100644 index 00000000000000..fab1614da93d4a --- /dev/null +++ b/test/expect/TestSparseOneOff.test_cuda_from_cpu.expect @@ -0,0 +1 @@ +backend of indices (CUDA) must match backend of values (CPU) \ No newline at end of file diff --git a/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect new file mode 100644 index 00000000000000..77b0b500f3b692 --- /dev/null +++ b/test/expect/TestSparseOneOff.test_cuda_sparse_cpu_dense_add.expect @@ -0,0 +1 @@ +add: expected 'other' to be a CPU tensor, but got a CUDA tensor \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect new file mode 100644 index 00000000000000..b6af4e9f4280b5 --- /dev/null +++ b/test/expect/TestUncoalescedSparse.test_add_dense_sparse_mismatch.expect @@ -0,0 +1 @@ +add: expected 'self' and 'other' to have same size, but self has size [3, 4] while other has size [3, 4, 4] (FYI: dense-sparse addition does not currently support broadcasting) \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-backward.expect b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect new file mode 100644 index 00000000000000..8e4e1fc8c1c18f --- /dev/null +++ b/test/expect/TestUncoalescedSparse.test_log1p-backward.expect @@ -0,0 +1 @@ +log1p of a sparse tensor is made to be non-differentiable since local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. Use a different mathematical operation which preserves sparsity of gradients, or report a bug if you think this is an error. \ No newline at end of file diff --git a/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect new file mode 100644 index 00000000000000..b25c0d3db02b2e --- /dev/null +++ b/test/expect/TestUncoalescedSparse.test_log1p-uncoalesced.expect @@ -0,0 +1 @@ +log1p: in-place on uncoalesced tensors is not supported yet! \ No newline at end of file diff --git a/test/test_sparse.py b/test/test_sparse.py index 760879485e2b6d..82de45d6d17384 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -44,7 +44,7 @@ def setUp(self): self.SparseTensor = torch.sparse.DoubleTensor super(TestSparse, self).setUp() - def _gen_sparse(self, sparse_dims, nnz, with_size): + def _gen_sparse(self, d, nnz, with_size): # TODO: Consider implementing this in the CUDA case by directly # performing the operations on the GPU. 
You won't be able to # use torch.rand/torch.randn in this case because they are @@ -54,30 +54,28 @@ def _gen_sparse(self, sparse_dims, nnz, with_size): # If you do this, be sure to update assert_uncoalesced too if isinstance(with_size, Number): - with_size = [with_size] * sparse_dims + with_size = [with_size] * d if self.is_uncoalesced: # We want to generate a tensor with a lot of uncoalesced # entries to stress test whether or not we handle this # (subtle) case correctly - v_size = [nnz * 2] + list(with_size[sparse_dims:]) + v_size = [nnz * 2] + list(with_size[d:]) v = torch.randn(*v_size) - r = torch.rand(sparse_dims, nnz) + r = torch.rand(d, nnz) # Repeat the indexes, so every position shows up twice - i = torch.cat([r, r], dim=1) - if nnz > 0: - i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz * 2, 1).transpose(0, 1) + i = torch.cat([r, r], dim=1) * \ + torch.Tensor(with_size[:d]).repeat(nnz * 2, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) self.assert_uncoalesced(x) else: - # Generate a sparse tensor with sparse_dims sparse dimensions; the - # rest the dimensions with_size[sparse_dims:] are dense. - v_size = [nnz] + list(with_size[sparse_dims:]) + # Generate a sparse tensor with d sparse dimensions; the + # rest the dimensions with_size[d:] are dense. + v_size = [nnz] + list(with_size[d:]) v = torch.randn(*v_size) - i = torch.rand(sparse_dims, nnz) - if nnz > 0: - i *= torch.Tensor(with_size[:sparse_dims]).repeat(nnz, 1).transpose(0, 1) + i = torch.rand(d, nnz) * \ + torch.Tensor(with_size[:d]).repeat(nnz, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = torch.sparse.DoubleTensor(i, v, torch.Size(with_size)) @@ -92,13 +90,15 @@ def assert_uncoalesced(self, x): correctness of the uncoalesced tensor generation algorithm. """ assert not x.is_coalesced() - existing_indices = set() - for i in range(x._nnz()): - index = str(x._indices()[:, i]) - if index in existing_indices: - return True - else: - existing_indices.add(index) + # Strategy: construct a new sparse tensor with the raw value + # field overwritten to a tensor of ones, coalesce it, and then + # check if any value entries are > 1 (which indicates that the + # original was uncoalesced.) 
+ i = x._indices().clone() + v = x._values().clone().fill_(1) + y = torch.sparse.DoubleTensor(i, v, x.size()) + z = self.safeCoalesce(y) + assert (z._values() > 1).sum() > 0 def randn(self, *args, **kwargs): """ @@ -164,20 +164,18 @@ def test_print(self): @skipIfRocm def test_basic(self): - def test_shape(sparse_dims, nnz, with_size): - if isinstance(with_size, Number): - with_size = [with_size] * sparse_dims - x, i, v = self._gen_sparse(sparse_dims, nnz, with_size) - self.assertEqual(i, x._indices()) - self.assertEqual(v, x._values()) - self.assertEqual(x.ndimension(), len(with_size)) - self.assertEqual(self.safeCoalesce(x)._nnz(), nnz) - self.assertEqual(list(x.size()), with_size) - - test_shape(3, 10, 100) - test_shape(3, 10, [100, 100, 100]) - test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + x, i, v = self._gen_sparse(3, 10, 100) + + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + + x, i, v = self._gen_sparse(3, 10, [100, 100, 100]) + self.assertEqual(i, x._indices()) + self.assertEqual(v, x._values()) + self.assertEqual(x.ndimension(), 3) + self.assertEqual(self.safeCoalesce(x)._nnz(), 10) + for i in range(3): + self.assertEqual(x.size(i), 100) # Make sure that coalesce handles duplicate indices correctly i = self.IndexTensor([[9, 0, 0, 0, 8, 1, 1, 1, 2, 7, 2, 2, 3, 4, 6, 9]]) @@ -215,13 +213,6 @@ def test_ctor_size_checks(self): @skipIfRocm def test_to_dense(self): - def test_tensor(x, res): - x.to_dense() # Tests triple to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) - i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -243,17 +234,12 @@ def test_tensor(x, res): [0, 0, 0, 0, 0], [0, 0, 0, 0, 4]], ]) - test_tensor(x, res) - i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) - res = self.ValueTensor(3, 4, 5, 0) - test_tensor(x, res) + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) @skipIfRocm def test_shared(self): @@ -265,21 +251,8 @@ def test_shared(self): i[0][0] = 0 self.assertEqual(self.ValueTensor([6, 0, 0]), self.safeToDense(x)) - i = self.IndexTensor([[2]]) - v = self.ValueTensor(1, 0) - x = self.SparseTensor(i, v, torch.Size([3, 0])) - i[0][0] = 0 - self.assertEqual(self.ValueTensor(3, 0), self.safeToDense(x)) - @skipIfRocm def test_to_dense_hybrid(self): - def test_tensor(x, res): - x.to_dense() # Tests double to_dense for memory corruption - x.to_dense() - x.to_dense() - self.assertEqual(res, x.to_dense()) - self.assertEqual(res, self.safeToDense(x)) - i = self.IndexTensor([ [0, 1, 2, 2], [0, 0, 0, 3], @@ -300,24 +273,15 @@ def test_tensor(x, res): [0, 0], [4, 5]], ]) - test_tensor(x, res) - i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - ]) - v = self.ValueTensor(4, 2, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 2, 0])) - res = self.ValueTensor(3, 4, 2, 0) - test_tensor(x, res) + x.to_dense() # Tests double to_dense for memory corruption + x.to_dense() + x.to_dense() + self.assertEqual(res, x.to_dense()) + self.assertEqual(res, self.safeToDense(x)) @skipIfRocm def test_contig(self): - def test_tensor(x, exp_i, exp_v): - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) - i = self.IndexTensor([ [1, 0, 35, 14, 
39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -329,7 +293,9 @@ def test_tensor(x, exp_i, exp_v): [31, 92, 65, 50, 34, 62, 22, 56, 74, 89], ]) exp_v = self.ValueTensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7]) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) i = self.IndexTensor([ [2, 0, 2, 1], @@ -344,22 +310,10 @@ def test_tensor(x, exp_i, exp_v): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([2, 1, 3, 4]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [2, 0, 2, 1], - [0, 0, 3, 0], - [1, 0, 4, 0], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) - exp_i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - exp_v = self.ValueTensor(4, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) # Duplicate indices i = self.IndexTensor([ @@ -375,30 +329,13 @@ def test_tensor(x, exp_i, exp_v): [0, 4], ]) exp_v = self.ValueTensor([6, 4]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [0, 0, 2, 0], - [0, 0, 3, 0], - [0, 0, 4, 0], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 0])) - exp_i = self.IndexTensor([ - [0, 2], - [0, 3], - [0, 4], - ]) - exp_v = self.ValueTensor(2, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) @skipIfRocm def test_contig_hybrid(self): - def test_tensor(x, exp_i, exp_v): - x = self.safeCoalesce(x) - self.assertEqual(exp_i, x._indices()) - self.assertEqual(exp_v, x._values()) - i = self.IndexTensor([ [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], @@ -416,7 +353,9 @@ def test_tensor(x, exp_i, exp_v): [2, 3], [1, 2], [6, 7], [4, 5], [10, 11], [3, 4], [5, 6], [9, 10], [8, 9], [7, 8], ]) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) i = self.IndexTensor([ [2, 0, 2, 1], @@ -431,22 +370,10 @@ def test_tensor(x, exp_i, exp_v): [0, 0, 1, 4], ]) exp_v = self.ValueTensor([[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [2, 0, 2, 1], - [0, 0, 3, 0], - [1, 0, 4, 0], - ]) - v = self.ValueTensor(4, 3, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) - exp_i = self.IndexTensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - exp_v = self.ValueTensor(4, 3, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) # Duplicate indices i = self.IndexTensor([ @@ -462,79 +389,51 @@ def test_tensor(x, exp_i, exp_v): [0, 4], ]) exp_v = self.ValueTensor([[6, 4, 5], [4, 3, 4]]) - test_tensor(x, exp_i, exp_v) - i = self.IndexTensor([ - [0, 0, 2, 0], - [0, 0, 3, 0], - [0, 0, 4, 0], - ]) - v = self.ValueTensor(4, 3, 0) - x = self.SparseTensor(i, v, torch.Size([3, 4, 5, 3, 0])) - exp_i = self.IndexTensor([ - [0, 2], - [0, 3], - [0, 4], - ]) - exp_v = self.ValueTensor(2, 3, 0) - test_tensor(x, exp_i, exp_v) + x = self.safeCoalesce(x) + self.assertEqual(exp_i, x._indices()) + self.assertEqual(exp_v, x._values()) @skipIfRocm def test_clone(self): - def test_shape(sparse_dims, nnz, with_size): - x = self._gen_sparse(sparse_dims, nnz, with_size)[0] - if self.is_uncoalesced: - self.assertFalse(x.is_coalesced()) - y = x.clone() - 
self.assertFalse(y.is_coalesced()) - x = x.coalesce() - self.assertTrue(x.is_coalesced()) + x, _, _ = self._gen_sparse(4, 20, 5) + if self.is_uncoalesced: + self.assertFalse(x.is_coalesced()) y = x.clone() - self.assertTrue(y.is_coalesced()) - - test_shape(4, 20, 5) - test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + self.assertFalse(y.is_coalesced()) + x = x.coalesce() + self.assertTrue(x.is_coalesced()) + y = x.clone() + self.assertTrue(y.is_coalesced()) @cuda_only def test_cuda_empty(self): - def test_tensor(x): - y = x.cuda(0) - self.assertEqual(x._sparseDims(), y._sparseDims()) - self.assertEqual(x._denseDims(), y._denseDims()) - x = y.cpu() - self.assertEqual(y._sparseDims(), x._sparseDims()) - self.assertEqual(y._denseDims(), x._denseDims()) - x = torch.sparse.FloatTensor(2, 3, 4) - test_tensor(x) - - x = torch.sparse.FloatTensor(2, 3, 4, 0) - test_tensor(x) + y = x.cuda(0) + self.assertEqual(x._sparseDims(), y._sparseDims()) + self.assertEqual(x._denseDims(), y._denseDims()) + x = y.cpu() + self.assertEqual(y._sparseDims(), x._sparseDims()) + self.assertEqual(y._denseDims(), x._denseDims()) @skipIfRocm def test_transpose(self): - def test_shape(sparse_dims, nnz, with_size): - x = self._gen_sparse(sparse_dims, nnz, with_size)[0] - y = self.safeToDense(x) - - for i, j in itertools.combinations(range(4), 2): - x = x.transpose_(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + x = self._gen_sparse(4, 20, 5)[0] + y = self.safeToDense(x) - x = x.transpose(i, j) - y = y.transpose(i, j) - self.assertEqual(self.safeToDense(x), y) + for i, j in itertools.combinations(range(4), 2): + x = x.transpose_(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) - test_shape(4, 20, 5) - test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) + x = x.transpose(i, j) + y = y.transpose(i, j) + self.assertEqual(self.safeToDense(x), y) @cpu_only def test_coalesce_transpose_mm(self): - def test_shape(di, dj, dk, nnz): - x, _, _ = self._gen_sparse(2, nnz, [dj, di]) + def test_shape(di, dj, dk): + x, _, _ = self._gen_sparse(2, 20, [dj, di]) y = torch.randn(dj, dk) x_coalesced = x.coalesce() @@ -547,58 +446,43 @@ def test_shape(di, dj, dk, nnz): expected = torch.mm(self.safeToDense(x_coalesced_t), y) self.assertEqual(res, expected) - test_shape(10, 20, 30, 20) - test_shape(0, 20, 30, 0) - test_shape(10, 0, 30, 0) - test_shape(10, 20, 0, 0) - test_shape(10, 20, 0, 20) + test_shape(10, 20, 30) def test_t_empty(self): - def test_in_place(x): - shape_original = x.shape - x.t_() - self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), x.size()) - self.assertEqual(0, x._indices().numel()) - self.assertEqual(0, x._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) - - def test_not_in_place(x): - shape_original = x.shape - y = x.t() - self.assertEqual(torch.Size([shape_original[1], shape_original[0]]), y.size()) - self.assertEqual(0, y._indices().numel()) - self.assertEqual(0, y._values().numel()) - self.assertEqual(x._sparseDims(), 2) - self.assertEqual(x._denseDims(), 0) - x = self.SparseTensor(2, 3) - test_in_place(x) - test_not_in_place(x) + x.t_() + self.assertEqual(torch.Size([3, 2]), x.size()) + self.assertEqual(0, x._indices().numel()) + self.assertEqual(0, x._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) - x = self.SparseTensor(2, 0) - test_in_place(x) - test_not_in_place(x) + x = 
self.SparseTensor(2, 3) + y = x.t() + self.assertEqual(torch.Size([3, 2]), y.size()) + self.assertEqual(0, y._indices().numel()) + self.assertEqual(0, y._values().numel()) + self.assertEqual(x._sparseDims(), 2) + self.assertEqual(x._denseDims(), 0) @skipIfRocm def test_add_zeros(self): - def test_shape(sparse_dims, nnz, sizes): - x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + def test_shape(sparse_dims, sizes): + x, _, _ = self._gen_sparse(sparse_dims, 20, sizes) zeros = torch.zeros(sizes, layout=torch.sparse_coo).to(x.device) r1 = zeros + x r2 = x + zeros self.assertEqual(r1, x) self.assertEqual(r2, x) - test_shape(1, 20, [1]) - test_shape(4, 20, [3, 17, 19, 5]) - test_shape(2, 20, [3, 17, 19, 5]) - test_shape(2, 20, [3, 17, 19, 0]) + test_shape(1, [1]) + test_shape(4, [3, 17, 19, 5]) + test_shape(2, [3, 17, 19, 5]) @cpu_only def test_mm(self): - def test_shape(di, dj, dk, nnz): - x, _, _ = self._gen_sparse(2, nnz, [di, dj]) + def test_shape(di, dj, dk): + x, _, _ = self._gen_sparse(2, 20, [di, dj]) t = torch.randn(di, dk) y = torch.randn(dj, dk) alpha = random.random() @@ -616,19 +500,15 @@ def test_shape(di, dj, dk, nnz): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(10, 100, 100, 20) - test_shape(100, 1000, 200, 20) - test_shape(64, 10000, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(10, 0, 100, 0) - test_shape(10, 100, 0, 0) - test_shape(10, 100, 0, 20) + test_shape(10, 100, 100) + test_shape(100, 1000, 200) + test_shape(64, 10000, 300) @cpu_only def test_saddmm(self): - def test_shape(di, dj, dk, nnz): - x = self._gen_sparse(2, nnz, [di, dj])[0] - t = self._gen_sparse(2, nnz, [di, dk])[0] + def test_shape(di, dj, dk): + x = self._gen_sparse(2, 20, [di, dj])[0] + t = self._gen_sparse(2, 20, [di, dk])[0] y = torch.randn(dj, dk) alpha = random.random() beta = random.random() @@ -645,52 +525,43 @@ def test_shape(di, dj, dk, nnz): expected = torch.mm(self.safeToDense(x), y) self.assertEqual(self.safeToDense(res), expected) - test_shape(7, 5, 3, 20) - test_shape(1000, 100, 100, 20) - test_shape(3000, 64, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(1000, 0, 100, 0) - test_shape(1000, 100, 0, 0) + test_shape(7, 5, 3) + test_shape(1000, 100, 100) + test_shape(3000, 64, 300) @skipIfRocm def test_dsmm(self): - def test_shape(di, dj, dk, nnz): - x = self._gen_sparse(2, nnz, [di, dj])[0] + def test_shape(di, dj, dk): + x = self._gen_sparse(2, 20, [di, dj])[0] y = self.randn(dj, dk) res = torch.dsmm(x, y) expected = torch.mm(self.safeToDense(x), y) self.assertEqual(res, expected) - test_shape(7, 5, 3, 20) - test_shape(1000, 100, 100, 20) - test_shape(3000, 64, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(1000, 0, 100, 0) - test_shape(1000, 100, 0, 0) - test_shape(1000, 100, 0, 20) + test_shape(7, 5, 3) + test_shape(1000, 100, 100) + test_shape(3000, 64, 300) @skipIfRocm def test_hsmm(self): - def test_shape(di, dj, dk, nnz): - x = self._gen_sparse(2, nnz, [di, dj])[0] + def test_shape(di, dj, dk): + x = self._gen_sparse(2, 20, [di, dj])[0] y = self.randn(dj, dk) res = torch.hsmm(x, y) - expected = torch.mm(self.safeToDense(x), y) + # TODO: use self.safeToDense(), but this triggers + # https://github.com/pytorch/pytorch/issues/3170 + expected = torch.mm(x.to_dense(), y) self.assertEqual(res.to_dense(), expected) - test_shape(7, 5, 3, 20) - test_shape(1000, 100, 100, 20) - test_shape(3000, 64, 300, 20) - test_shape(0, 100, 100, 0) - test_shape(1000, 0, 100, 0) - test_shape(1000, 100, 0, 0) - test_shape(1000, 100, 0, 20) + 
test_shape(7, 5, 3) + test_shape(1000, 100, 100) + test_shape(3000, 64, 300) - def _test_spadd_shape(self, nnz, shape_i, shape_v=None): + def _test_spadd_shape(self, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x, _, _ = self._gen_sparse(len(shape_i), nnz, shape) + x, _, _ = self._gen_sparse(len(shape_i), 10, shape) y = self.randn(*shape) r = random.random() @@ -712,7 +583,7 @@ def _test_spadd_shape(self, nnz, shape_i, shape_v=None): self.assertEqual(res, expected) - x, i, v = self._gen_sparse(len(shape_i), nnz, shape) + x, i, v = self._gen_sparse(len(shape_i), 10, shape) nnz = i.size(1) # Non contiguous sparse indices tensor @@ -735,40 +606,28 @@ def _test_spadd_shape(self, nnz, shape_i, shape_v=None): @skipIfRocm def test_spadd(self): - self._test_spadd_shape(10, [5, 6]) - self._test_spadd_shape(10, [10, 10, 10]) - self._test_spadd_shape(10, [50, 30, 20]) - self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5]) - self._test_spadd_shape(0, [0, 30, 20]) - self._test_spadd_shape(0, [50, 0, 20]) - self._test_spadd_shape(0, [50, 30, 0]) + self._test_spadd_shape([5, 6]) + self._test_spadd_shape([10, 10, 10]) + self._test_spadd_shape([50, 30, 20]) + self._test_spadd_shape([5, 5, 5, 5, 5, 5]) @skipIfRocm def test_spadd_hybrid(self): - self._test_spadd_shape(10, [5, 6], [2, 3]) - self._test_spadd_shape(10, [10, 10, 10], [3]) - self._test_spadd_shape(10, [50, 30, 20], [2]) - self._test_spadd_shape(10, [5, 5, 5, 5, 5, 5], [2]) - self._test_spadd_shape(0, [0, 30, 20], [2, 0]) - self._test_spadd_shape(0, [50, 0, 20], [2, 0]) - self._test_spadd_shape(0, [50, 30, 0], [2, 0]) - self._test_spadd_shape(10, [50, 30, 20], [2, 0]) + self._test_spadd_shape([5, 6], [2, 3]) + self._test_spadd_shape([10, 10, 10], [3]) + self._test_spadd_shape([50, 30, 20], [2]) + self._test_spadd_shape([5, 5, 5, 5, 5, 5], [2]) @skipIfRocm def test_norm(self): - def test_shape(sparse_dims, nnz, with_size): - x, _, _ = self._gen_sparse(sparse_dims, nnz, with_size) - y = x.coalesce() - self.assertEqual(x.norm(), y._values().norm()) - - test_shape(3, 10, 100) - test_shape(4, 10, [100, 100, 100, 5, 5, 5, 0]) - test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) + x, _, _ = self._gen_sparse(3, 10, 100) + y = x.coalesce() + self.assertEqual(x.norm(), y._values().norm()) - def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): + def _test_basic_ops_shape(self, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) - x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) + x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) + x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) y1 = x1 + x2 y2 = x1.clone() @@ -830,49 +689,30 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): @skipIfRocm def test_basic_ops(self): - self._test_basic_ops_shape(9, 12, [5, 6]) - self._test_basic_ops_shape(9, 12, [10, 10, 10]) - self._test_basic_ops_shape(9, 12, [50, 30, 20]) - self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5]) - self._test_basic_ops_shape(0, 12, [10, 10, 10]) - self._test_basic_ops_shape(9, 0, [10, 10, 10]) - self._test_basic_ops_shape(0, 0, [10, 10, 10]) - self._test_basic_ops_shape(0, 0, [10, 10, 0]) + self._test_basic_ops_shape([5, 6]) + self._test_basic_ops_shape([10, 10, 10]) + self._test_basic_ops_shape([50, 30, 20]) + self._test_basic_ops_shape([5, 5, 5, 5, 5, 5]) @skipIfRocm def test_basic_ops_hybrid(self): - self._test_basic_ops_shape(9, 12, [5, 6], [2, 3]) - self._test_basic_ops_shape(9, 12, [10, 10, 10], [3]) - 
self._test_basic_ops_shape(9, 12, [50, 30, 20], [2]) - self._test_basic_ops_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) - self._test_basic_ops_shape(0, 12, [10, 10, 10], [2]) - self._test_basic_ops_shape(9, 0, [10, 10, 10], [2]) - self._test_basic_ops_shape(0, 0, [10, 10, 10], [2]) - self._test_basic_ops_shape(9, 12, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(0, 12, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(9, 0, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(0, 0, [10, 10, 10], [2, 0]) - self._test_basic_ops_shape(0, 0, [10, 10, 0], [2, 0]) + self._test_basic_ops_shape([5, 6], [2, 3]) + self._test_basic_ops_shape([10, 10, 10], [3]) + self._test_basic_ops_shape([50, 30, 20], [2]) + self._test_basic_ops_shape([5, 5, 5, 5, 5, 5], [2]) @skipIfRocm def test_add_dense_sparse_mismatch(self): - def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): - x = torch.zeros(dense_size, dtype=self.value_dtype, device=self.device) - sparse_y = self.SparseTensor(torch.zeros(sparse_dims_shape, dtype=torch.int64, device=self.device), - torch.randn(dense_dims_shape, dtype=self.value_dtype, device=self.device), - torch.Size(sparse_size)) - with self.assertRaisesRegex( - RuntimeError, - "add: expected 'self' and 'other' to have same size"): - x + sparse_y - - test_shape([3, 4], [1, 4], [4, 4, 4], [3, 4, 4]) - test_shape([3, 4, 0], [1, 4], [4, 4, 4, 0], [3, 4, 4, 0]) - - def _test_sparse_mask_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): + x = torch.zeros([3, 4], dtype=self.value_dtype, device=self.device) + sparse_y = self.SparseTensor(torch.zeros(1, 4, dtype=torch.int64, device=self.device), + torch.randn(4, 4, 4, dtype=self.value_dtype, device=self.device), + torch.Size([3, 4, 4])) + self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) + + def _test_sparse_mask_shape(self, shape_i, shape_v=None): shape = shape_i + (shape_v or []) - x1, _, _ = self._gen_sparse(len(shape_i), nnz_x1, shape) - x2, _, _ = self._gen_sparse(len(shape_i), nnz_x2, shape) + x1, _, _ = self._gen_sparse(len(shape_i), 9, shape) + x2, _, _ = self._gen_sparse(len(shape_i), 12, shape) y1 = x1 + x2 y2 = x1.clone() @@ -900,85 +740,18 @@ def _test_sparse_mask_fixed(self): expected = self.SparseTensor(i, exp_v, torch.Size([5, 4])) self.assertEqual(res, expected) - i = self.IndexTensor([ - [1, 3, 0, 4], - [2, 1, 2, 3], - ]) - v = self.ValueTensor(4, 0) - x = self.SparseTensor(i, v, torch.Size([5, 4, 0])).coalesce() - dense = self.ValueTensor(5, 4, 0) - exp_v = self.ValueTensor(4, 0) - res = dense.sparse_mask(x) - expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 0])) - self.assertEqual(res, expected) - @skipIfRocm def test_sparse_mask(self): self._test_sparse_mask_fixed() - self._test_sparse_mask_shape(9, 12, [5, 6]) - self._test_sparse_mask_shape(9, 12, [10, 10, 10]) - self._test_sparse_mask_shape(9, 12, [50, 30, 20]) - self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5]) - self._test_sparse_mask_shape(0, 12, [10, 10, 10]) - self._test_sparse_mask_shape(9, 0, [10, 10, 10]) - self._test_sparse_mask_shape(0, 0, [10, 10, 10]) - self._test_sparse_mask_shape(0, 0, [10, 10, 0]) - - def _test_sparse_mask_hybrid_fixed(self): - i = self.IndexTensor([ - [1, 3, 0, 4], - [2, 1, 2, 3], - ]) - v = self.ValueTensor([[1, 2], [2, 3], [3, 4], [4, 5]]) - # TODO: This is also testing that, if coalesce is a no-op, - # the indices don't get permuted. I don't know if we actually - # want to give this invariant. 
- x = self.SparseTensor(i, v, torch.Size([5, 4, 2])).coalesce() - dense = self.ValueTensor([ - [[1, 3], [2, 2], [3, 3], [4, 2]], - [[5, 7], [6, 7], [7, 9], [8, 9]], - [[9, 2], [10, 4], [11, 1], [12, 3]], - [[13, 5], [14, 1], [15, 1], [16, 6]], - [[17, 7], [18, 2], [19, 7], [20, 1]], - ]) - res = dense.sparse_mask(x) - exp_v = self.ValueTensor([[7, 9], [14, 1], [3, 3], [20, 1]]) - expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) - self.assertEqual(res, expected) - - i = self.IndexTensor([ - [1, 3, 0, 4], - [2, 1, 2, 3], - ]) - v = self.ValueTensor(4, 2, 0) - x = self.SparseTensor(i, v, torch.Size([5, 4, 2, 0])).coalesce() - dense = self.ValueTensor(5, 4, 2, 0) - res = dense.sparse_mask(x) - exp_v = self.ValueTensor(4, 2, 0) - expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2, 0])) - self.assertEqual(res, expected) - - @skipIfRocm - def test_sparse_mask_hybrid(self): - self._test_sparse_mask_hybrid_fixed() + self._test_sparse_mask_shape([5, 6]) + self._test_sparse_mask_shape([10, 10, 10]) + self._test_sparse_mask_shape([50, 30, 20]) + self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5]) - self._test_sparse_mask_shape(9, 12, [5, 6], [2, 3]) - self._test_sparse_mask_shape(9, 12, [10, 10, 10], [3]) - self._test_sparse_mask_shape(9, 12, [50, 30, 20], [2]) - self._test_sparse_mask_shape(9, 12, [5, 5, 5, 5, 5, 5], [2]) - self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2]) - self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2]) - self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2]) - self._test_sparse_mask_shape(9, 12, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(0, 12, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(9, 0, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(0, 0, [10, 10, 10], [2, 0]) - self._test_sparse_mask_shape(0, 0, [10, 10, 0], [2, 0]) - - def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): + def _test_zeros(self, shape, out_shape_i, out_shape_v=None): out_shape = out_shape_i + (out_shape_v or []) - for nnz in nnzs: + for nnz in [9, 12]: out, _, _ = self._gen_sparse(len(out_shape_i), nnz, out_shape) torch.zeros(*shape, out=out) self.assertEqual(tuple(out.size()), tuple(shape)) @@ -987,22 +760,53 @@ def _test_zeros(self, nnzs, shape, out_shape_i, out_shape_v=None): self.assertEqual(out._sparseDims(), len(shape)) self.assertEqual(out._denseDims(), 0) + @skipIfRocm + def test_log1p(self): + if self.is_cuda: + input = torch.cuda.sparse.DoubleTensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0).cuda(), + torch.FloatTensor([3, 4, 5]).cuda(), + torch.Size([3])) + else: + input = torch.sparse.DoubleTensor( + torch.LongTensor([[0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([3, 4, 5]), + torch.Size([3])) + + expected_output = torch.tensor([3., 4., 5.]).log1p_() + self.assertEqual(expected_output, input.log1p().to_dense()) + self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + + # test in-place op on uncoalesced input + self.assertExpectedRaises(RuntimeError, lambda: input.log1p_(), subname="uncoalesced") + + input.requires_grad_() + self.assertTrue(input.requires_grad) + + # test autograd + x = input.clone() + y = input.log1p() + self.assertExpectedRaises(RuntimeError, lambda: y.backward(x), subname="backward") + + # test uncoalesced input + input_uncoalesced = torch.sparse.DoubleTensor( + torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + torch.FloatTensor([2, 3, 4, 1, 1, 1]), + torch.Size([3])) + self.assertEqual(expected_output, input_uncoalesced.log1p().to_dense()) + 
self.assertEqual(expected_output, input_uncoalesced.coalesce().log1p_().to_dense()) + def test_zeros(self): - def test_shape(i_shapes, v_shapes, shape, nnzs): - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros(nnzs, shape, i_shapes[:i_dim], v_shapes[:v_dim]) - test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 4], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 4], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 4], [9, 12]) - test_shape([2, 3, 4], [3, 4, 5, 6], [2, 3, 0], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [2, 3, 0], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [2, 3, 0], [9, 12]) - - def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): + i_shapes = [2, 3, 4] + v_shapes = [3, 4, 5, 6] + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros([2, 3, 4], i_shapes[:i_dim], v_shapes[:v_dim]) + + def _test_zeros_like(self, template_shape_i, template_shape_v=None): template_shape_v = template_shape_v or [] template_shape = template_shape_i + template_shape_v - for nnz in nnzs: + for nnz in [9, 12]: t, _, _ = self._gen_sparse(len(template_shape_i), nnz, template_shape) res = torch.zeros_like(t) self.assertEqual(tuple(res.size()), tuple(template_shape)) @@ -1012,65 +816,138 @@ def _test_zeros_like(self, nnzs, template_shape_i, template_shape_v=None): self.assertEqual(res._denseDims(), len(template_shape_v)) def test_zeros_like(self): - def test_shape(i_shapes, v_shapes, nnzs): - for i_dim in range(1, len(i_shapes) + 1): - for v_dim in range(len(v_shapes) + 1): - self._test_zeros_like(nnzs, i_shapes[:i_dim], v_shapes[:v_dim]) - test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) - test_shape([2, 3, 4], [3, 4, 5, 6], [9, 12]) - test_shape([0, 3, 4], [3, 4, 5, 6], [0]) - test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) - - def _test_log1p_tensor(self, input, dense_tensor): - expected_output = torch.tensor(dense_tensor).log1p_() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) + i_shapes = [2, 3, 4] + v_shapes = [3, 4, 5, 6] + for i_dim in range(1, len(i_shapes) + 1): + for v_dim in range(len(v_shapes) + 1): + self._test_zeros_like(i_shapes[:i_dim], v_shapes[:v_dim]) - # test in-place op on uncoalesced input - with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): - input.log1p_() + def _test_sparse_mask_hybrid_fixed(self): + i = self.IndexTensor([ + [1, 3, 0, 4], + [2, 1, 2, 3], + ]) + v = self.ValueTensor([[1, 2], [2, 3], [3, 4], [4, 5]]) + # TODO: This is also testing that, if coalesce is a no-op, + # the indices don't get permuted. I don't know if we actually + # want to give this invariant. 
+ x = self.SparseTensor(i, v, torch.Size([5, 4, 2])).coalesce() + dense = self.ValueTensor([ + [[1, 3], [2, 2], [3, 3], [4, 2]], + [[5, 7], [6, 7], [7, 9], [8, 9]], + [[9, 2], [10, 4], [11, 1], [12, 3]], + [[13, 5], [14, 1], [15, 1], [16, 6]], + [[17, 7], [18, 2], [19, 7], [20, 1]], + ]) + res = dense.sparse_mask(x) + exp_v = self.ValueTensor([[7, 9], [14, 1], [3, 3], [20, 1]]) + expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) + self.assertEqual(res, expected) - input.requires_grad_() - self.assertTrue(input.requires_grad) + @skipIfRocm + def test_sparse_variable_methods(self): + # TODO: delete when tensor/variable are merged + from torch.autograd import Variable + i = self.IndexTensor([[0, 1, 1], [2, 0, 2]]) + v = self.ValueTensor([3, 4, 5]) + sparse_mat = self.SparseTensor(i, v, torch.Size([2, 3])) + sparse_var = Variable(sparse_mat) + + to_test_one_arg = { + 'zeros_like': lambda x: torch.zeros_like(x), + 'transpose': lambda x: x.transpose(0, 1), + 'transpose_': lambda x: x.transpose_(0, 1), + 't': lambda x: x.t(), + 't_': lambda x: x.t_(), + 'div': lambda x: x.div(2), + 'div_': lambda x: x.div_(2), + 'pow': lambda x: x.pow(2), + '_nnz': lambda x: x._nnz(), + 'is_coalesced': lambda x: x.is_coalesced(), + 'coalesce': lambda x: x.coalesce(), + 'to_dense': lambda x: x.to_dense(), + '_sparseDims': lambda x: x._sparseDims(), + '_denseDims': lambda x: x._denseDims(), + 'norm': lambda x: x.norm(), + 'log1p': lambda x: x.log1p(), + } + + for test_name, test_fn in to_test_one_arg.items(): + var1 = sparse_var.clone() + tensor1 = sparse_mat.clone() + + out_var = test_fn(var1) + out_tensor = test_fn(tensor1) + + if isinstance(out_tensor, int) or isinstance(out_tensor, bool): + if not isinstance(out_var, int) and not isinstance(out_var, bool): + check_var = out_var.data[0] + else: + check_var = out_var + self.assertEqual(out_var, out_tensor) + continue + + # Assume output is variable / tensor + self.assertEqual(test_fn(var1).data, test_fn(tensor1), + test_name) + + i = self.IndexTensor([[0, 0, 1], [1, 2, 1]]) + v = self.ValueTensor([3, 3, 4]) + sparse_mat2 = self.SparseTensor(i, v, torch.Size([2, 3])) + sparse_var2 = Variable(sparse_mat2) + + to_test_two_arg = { + 'sub': lambda x, y: x.sub(y), + 'sub_': lambda x, y: x.sub_(y), + 'mul': lambda x, y: x.mul(y), + 'mul_': lambda x, y: x.mul_(y), + } + + for test_name, test_fn in to_test_two_arg.items(): + var1 = sparse_var.clone() + var2 = sparse_var2.clone() + tensor1 = sparse_mat.clone() + tensor2 = sparse_mat2.clone() + self.assertEqual(test_fn(var1, var2).data, + test_fn(tensor1, tensor2), test_name) + + to_test_mixed = [ + # test name, lambda expression, should_run_when_cuda + ('sspaddmm', lambda sp, de: sp.sspaddmm(sp, de), False), + ('sspaddmm_b', lambda sp, de: sp.sspaddmm(2, sp, de), False), + ('sspaddmm_b_a', lambda sp, de: sp.sspaddmm(3, 2, sp, de), False), + ('addmm', lambda sp, de: de.addmm(sp, de), True), + # TODO: This looks like a typo + ('addmm_', lambda sp, de: de.addmm(sp, de), True), + ('mm', lambda sp, de: torch.mm(sp, de), True), + ('mm_out', lambda sp, de: torch.mm(sp, de, out=de), True), + ] - # test autograd - x = input.clone() - y = input.log1p() - with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): - y.backward(x) + i = self.IndexTensor([[0, 0, 1, 2, 2], [1, 2, 1, 0, 1]]) + v = self.ValueTensor([3, 3, 4, 1, 2]) + sparse_mat = self.SparseTensor(i, v, torch.Size([3, 3])) + sparse_var = Variable(sparse_mat) + dense_mat = sparse_mat.to_dense().random_(0, 5) + 
dense_var = Variable(dense_mat) + + for test_name, test_fn, test_cuda in to_test_mixed: + if sparse_var.is_cuda and not test_cuda: + continue + sp_var = sparse_var.clone() + de_var = dense_var.clone() + sp_mat = sparse_mat.clone() + de_mat = dense_mat.clone() + self.assertEqual(test_fn(sp_var, de_var).data, + test_fn(sp_mat, de_mat), test_name) @skipIfRocm - def test_log1p(self): - input = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([3, 4, 5]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input, [3., 4., 5.]) + def test_sparse_mask_hybrid(self): + self._test_sparse_mask_hybrid_fixed() - # test uncoalesced input - input_uncoalesced = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input_uncoalesced, [3., 4., 5.]) - - input = torch.sparse_coo_tensor( - torch.zeros([2, 0]), - torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), - torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) - - input = torch.sparse_coo_tensor( - torch.zeros([1, 5]), - torch.zeros([5, 6, 0]), - torch.Size([5, 6, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) + self._test_sparse_mask_shape([5, 6], [2, 3]) + self._test_sparse_mask_shape([10, 10, 10], [3]) + self._test_sparse_mask_shape([50, 30, 20], [2]) + self._test_sparse_mask_shape([5, 5, 5, 5, 5, 5], [2]) @skipIfRocm def test_sparse_add_coalesce(self): @@ -1082,55 +959,30 @@ def test_sparse_add_coalesce(self): self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) - i = self.IndexTensor([[1, 2, 1]]) - v = self.ValueTensor(3, 0) - x = self.SparseTensor(i, v, torch.Size([3, 0])) - y = self.SparseTensor(i, v, torch.Size([3, 0])) - z = x + y - - self.assertFalse(z._indices().numel() != 2 and z.is_coalesced()) - @cuda_only def test_storage_not_null(self): x = torch.cuda.sparse.FloatTensor(2) self.assertNotEqual(x.get_device(), -1) - x = torch.cuda.sparse.FloatTensor(2, 0) - self.assertNotEqual(x.get_device(), -1) - @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @skipIfRocm def test_same_gpu(self): - def check_device(x, device_id): - self.assertEqual(x.get_device(), device_id) - self.assertEqual(x._values().get_device(), device_id) - self.assertEqual(x._indices().get_device(), device_id) - i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(1) x = self.SparseTensor(i, v, torch.Size([3]), device=1) - check_device(x, 1) - - i = self.IndexTensor([[2]]).cuda(1) - v = self.ValueTensor(1, 0).cuda(1) - x = self.SparseTensor(i, v, torch.Size([3, 0]), device=1) - check_device(x, 1) + self.assertEqual(x.get_device(), 1) + self.assertEqual(x._values().get_device(), 1) + self.assertEqual(x._indices().get_device(), 1) x = self.SparseTensor(3, device=1) - check_device(x, 1) - - x = self.SparseTensor(3, 0, device=1) - check_device(x, 1) + self.assertEqual(x.get_device(), 1) + self.assertEqual(x._values().get_device(), 1) + self.assertEqual(x._indices().get_device(), 1) - i = self.IndexTensor([[2]]).cuda(1) v = self.ValueTensor([5]).cuda(0) self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3]))) - i = self.IndexTensor([[2]]).cuda(1) - v = self.ValueTensor(1, 0).cuda(0) - self.assertRaises(RuntimeError, lambda: self.SparseTensor(i, v, torch.Size([3, 0]))) - def 
_test_new_device(self, size, device): with torch.cuda.device(device): x = torch.cuda.sparse.DoubleTensor(*size) @@ -1145,7 +997,6 @@ def test_new_device_single_gpu(self): self._test_new_device((), 0) self._test_new_device((30, 20), 0) self._test_new_device((30, 20, 10), 0) - self._test_new_device((30, 20, 10, 0), 0) @cuda_only @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected") @@ -1153,67 +1004,49 @@ def test_new_device_multi_gpu(self): self._test_new_device((), 1) self._test_new_device((30, 20), 1) self._test_new_device((30, 20, 10), 1) - self._test_new_device((30, 20, 10, 0), 1) @skipIfRocm def test_new(self): - def test_shape(sparse_dims, nnz, with_size): - x, indices, values = self._gen_sparse(sparse_dims, nnz, with_size) - if not x.is_cuda: - # CUDA sparse tensors currently requires the size to be - # specified if nDimV > 0 - self.assertEqual(x.new(indices, values), x) - self.assertEqual(x.new(indices, values, x.size()), x) - - test_shape(3, 10, 100) - test_shape(3, 0, [100, 100, 0]) + x, indices, values = self._gen_sparse(3, 10, 100) + if not x.is_cuda: + # CUDA sparse tensors currently requires the size to be + # specified if nDimV > 0 + self.assertEqual(x.new(indices, values), x) + self.assertEqual(x.new(indices, values, x.size()), x) @cpu_only # not really, but we only really want to run this once @skipIfRocm def test_factory(self): - for test_empty_tensor in [True, False]: - if test_empty_tensor: - default_size = torch.Size([1, 3, 0]) - size = torch.Size([3, 3, 0]) - else: - default_size = torch.Size([1, 3]) - size = torch.Size([3, 3]) - for include_size in [True, False]: - for use_tensor_idx in [True, False]: - for use_tensor_val in [True, False]: - for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else \ - torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - if test_empty_tensor: - values = self.ValueTensor(1, 0) - else: - if use_tensor_val: - values = torch.tensor([1.], dtype=dtype) - else: - values = 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + default_size = torch.Size([1, 3]) + size = torch.Size([3, 3]) + for include_size in [True, False]: + for use_tensor_idx in [True, False]: + for use_tensor_val in [True, False]: + for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + dtype = torch.float64 + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + values = torch.tensor([1.], dtype=dtype) if use_tensor_val else 1. 
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) @skipIfRocm def test_factory_size_check(self): - indices = self.IndexTensor([[1, 2], - [0, 2]]) + indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor([.5, .5]) sizes = torch.Size([2, 3]) with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): @@ -1223,34 +1056,12 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "found negative index"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], - [0, 2]]) - values = self.ValueTensor(2, 1, 0) - sizes = torch.Size([2, 3, 1, 0]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): - torch.sparse_coo_tensor(indices, values, sizes) - - indices = self.IndexTensor([[1, 2], - [0, 2]]) - values = self.ValueTensor(2, 2, 2) - sizes = torch.Size([0, 0, 2, 2]) - with self.assertRaisesRegex(RuntimeError, "sizes is inconsistent with indices"): - torch.sparse_coo_tensor(indices, values, sizes) - - indices = self.IndexTensor([[1, 2], - [0, 2]]) + indices = self.IndexTensor([[1, 2], [0, 2]]) values = self.ValueTensor([[1, 1, 1], [1, 1, 1]]) sizes = torch.Size([3, 3, 2]) with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[1, 2], - [0, 2]]) - values = self.ValueTensor(2, 1, 0) - sizes = torch.Size([3, 3, 2, 0]) - with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): - torch.sparse_coo_tensor(indices, values, sizes) - def test_factory_default(self): tensor = self.SparseTensor() expected_indices = self.IndexTensor(1, 0) @@ -1284,31 +1095,25 @@ def test_factory_nnz(self): with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[0]]) # (sparseDims, nnz): (1, 1) - values = self.ValueTensor(2, 0) # (nnz, ...): (2, 0) - sizes = torch.Size([2, 0]) - with self.assertRaisesRegex(RuntimeError, "indices and values must have same nnz"): - torch.sparse_coo_tensor(indices, values, sizes) + def _test_factory_tensor_shape(self, i_shape, v_shape, size, expected_size): + device = 'cuda' if self.is_cuda else 'cpu' + if size: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), torch.Size(size), device=device) + else: + t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) + expected_indices = torch.empty(i_shape, device=device) + expected_values = torch.empty(v_shape, device=device) + expected_size = torch.Size(expected_size) + self.assertEqual(t._indices(), expected_indices) + self.assertEqual(t._values(), expected_values) + self.assertEqual(t.size(), expected_size) def test_factory_nnz_zero(self): - def test_shape(i_shape, v_shape, size, expected_size): - device = 'cuda' if self.is_cuda else 'cpu' - if size: - t = torch.sparse_coo_tensor(torch.empty(i_shape), 
torch.empty(v_shape), torch.Size(size), device=device) - else: - t = torch.sparse_coo_tensor(torch.empty(i_shape), torch.empty(v_shape), device=device) - expected_indices = torch.empty(i_shape, device=device) - expected_values = torch.empty(v_shape, device=device) - expected_size = torch.Size(expected_size) - self.assertEqual(t._indices(), expected_indices) - self.assertEqual(t._values(), expected_values) - self.assertEqual(t.size(), expected_size) - - test_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) - test_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) - test_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) - test_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) - test_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) + self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], None, [0, 2, 4, 0]) + self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], None, [0, 0, 0, 2, 4, 0]) + self._test_factory_tensor_shape([1, 0], [0, 2, 4, 0], [0, 2, 4, 0], [0, 2, 4, 0]) + self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [0, 0, 0, 2, 4, 0], [0, 0, 0, 2, 4, 0]) + self._test_factory_tensor_shape([3, 0], [0, 2, 4, 0], [1, 2, 3, 2, 4, 0], [1, 2, 3, 2, 4, 0]) @skipIfRocm def test_factory_dense_dims(self): @@ -1318,12 +1123,6 @@ def test_factory_dense_dims(self): with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): torch.sparse_coo_tensor(indices, values, sizes) - indices = self.IndexTensor([[0]]) - values = self.ValueTensor(1, 2, 3, 0) - sizes = torch.Size([1, 3, 4, 0]) - with self.assertRaisesRegex(RuntimeError, "values has incorrect size"): - torch.sparse_coo_tensor(indices, values, sizes) - @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) @@ -1333,13 +1132,6 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) - t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) - self.assertEqual(torch.float32, t.dtype) - t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) - self.assertEqual(torch.float64, t.dtype) - t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.LongTensor(1, 0)) - self.assertEqual(torch.int64, t.dtype) - @cuda_only @skipIfRocm def test_factory_device_type_inference(self): @@ -1348,66 +1140,41 @@ def test_factory_device_type_inference(self): for indices_device in ['cuda', 'cpu']: for values_device in ['cuda', 'cpu']: for sparse_device in ['cuda', 'cpu', None]: - for test_empty_tensor in [True, False]: - if test_empty_tensor: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - self.ValueTensor(1, 0).to(values_device), - (1, 3, 0), device=sparse_device) - else: - t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), - torch.tensor([1.], device=values_device), - (1, 3), device=sparse_device) - should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') - self.assertEqual(should_be_cuda, t.is_cuda) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2]), device=indices_device), + torch.tensor([1.], device=values_device), + (1, 3), device=sparse_device) + should_be_cuda = sparse_device == 'cuda' or (sparse_device is None and values_device == 'cuda') + self.assertEqual(should_be_cuda, t.is_cuda) @cpu_only def test_factory_copy(self): - def test_tensor(indices, values, indices_equal, 
values_equal): - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) - if indices_equal: - self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - else: - self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) - if values_equal: - self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) - else: - self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) - # both correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float64) - test_tensor(indices, values, True, True) - - indices = torch.tensor(([0], [2]), dtype=torch.int64) - values = torch.DoubleTensor(1, 0) - test_tensor(indices, values, True, True) + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) # only indices correct indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.tensor([1.], dtype=torch.float32) - test_tensor(indices, values, True, False) - - indices = torch.tensor(([0], [2]), dtype=torch.int64) - values = torch.FloatTensor(1, 0) - test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) # only values correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float64) - test_tensor(indices, values, False, True) - - indices = torch.tensor(([0], [2]), dtype=torch.int32) - values = torch.DoubleTensor(1, 0) - test_tensor(indices, values, False, True) + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) # neither correct indices = torch.tensor(([0], [2]), dtype=torch.int32) values = torch.tensor([1.], dtype=torch.float32) - test_tensor(indices, values, False, False) - - indices = torch.tensor(([0], [2]), dtype=torch.int32) - values = torch.FloatTensor(1, 0) - test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=torch.float64) + self.assertNotEqual(indices.data_ptr(), sparse_tensor._indices().data_ptr()) + self.assertNotEqual(values.data_ptr(), sparse_tensor._values().data_ptr()) @cpu_only # just run once, we test both cpu and cuda def test_constructor_device_legacy(self): @@ -1458,15 +1225,9 @@ def test_is_sparse(self): x = torch.randn(3, 3) self.assertFalse(x.is_sparse) - x = torch.randn(3, 3, 0) - self.assertFalse(x.is_sparse) - x = self.SparseTensor() self.assertTrue(x.is_sparse) - x = self.SparseTensor(1, 0) - self.assertTrue(x.is_sparse) - @skipIfRocm def test_resize_as(self): def do_test(t): @@ -1477,8 +1238,6 @@ def do_test(t): self.assertEqual(t, t + y) do_test(self.SparseTensor()) - do_test(self.SparseTensor(3, 0)) - do_test(self.SparseTensor(3, 3)) @skipIfRocm def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @@ -1505,13 +1264,10 @@ def _test_resize_shape(self, x_i, x_v, x_size, y_i, y_v, y_size): @skipIfRocm def test_resize(self): - # 1. 
Expand the size of some dense dimensions [Supported] + # 1. Increase the size of some dense dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 4], [2, 2, 4]) - self._test_resize_shape([1, 1], [1, 2, 0], [2, 2, 0], - [1, 1], [1, 2, 4], [2, 2, 4]) - # 2. Expand the size of some sparse dimensions [Supported] self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3], [4, 2, 3]) @@ -1520,18 +1276,11 @@ def test_resize(self): self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], [2, 0], [0, 2, 4, 5], [1, 1, 2, 4, 5]) - self._test_resize_shape([1, 0], [0, 2, 3], [2, 2, 3], - [2, 0], [0, 2, 4, 0], [1, 1, 2, 4, 0]) - # 4. Add dims to dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 3, 4], [2, 2, 3, 4]) - with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): - self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], - [1, 1], [1, 2, 3, 0], [2, 2, 3, 0]) - # 5. Remove dims from dense dimensions [Not Supported] with self.assertRaisesRegex(RuntimeError, "changing the number of dense dimensions"): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], @@ -1552,10 +1301,6 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 2], [2, 2, 2]) - with self.assertRaisesRegex(RuntimeError, "shrinking the size of dense dimensions"): - self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], - [1, 1], [1, 2, 0], [2, 2, 0]) - def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) @@ -1563,8 +1308,6 @@ def test_is_nonzero(self): self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (0., 0.), (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0, 0],), (-1., 1.), (1,)).is_nonzero()) self.assertTrue(torch.sparse_coo_tensor(torch.zeros(0, 1), 12.3, []).is_nonzero()) # scalar sparse tensor - with self.assertRaisesRegex(RuntimeError, "bool value of Tensor with no values is ambiguous"): - torch.sparse_coo_tensor(([0, 1],), self.ValueTensor(2, 0), (4, 0)).is_nonzero() class TestUncoalescedSparse(TestSparse): @@ -1595,26 +1338,11 @@ class TestSparseOneOff(TestCase): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm def test_cuda_from_cpu(self): - with self.assertRaisesRegex( - RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): - torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4), - [3, 4, 4]) - - with self.assertRaisesRegex( - RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): - torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4, 0), - [3, 4, 4, 0]) - - with self.assertRaisesRegex( - RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): - torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), - torch.randn(0, 4, 4, 0), - [0, 4, 4, 0]) + self.assertExpectedRaises( + RuntimeError, + lambda: torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), + torch.randn(4, 4, 4), + [3, 4, 4])) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @skipIfRocm @@ -1623,22 +1351,7 @@ def test_cuda_sparse_cpu_dense_add(self): sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4).cuda(), [3, 4, 4]) - with 
self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): - x + sparse_y - - x = torch.zeros(3, 4, 4, 0) - sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), - torch.randn(4, 4, 4, 0).cuda(), - [3, 4, 4, 0]) - with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): - x + sparse_y - - x = torch.zeros(0, 4, 4, 0) - sparse_y = torch.cuda.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), - torch.randn(0, 4, 4, 0).cuda(), - [0, 4, 4, 0]) - with self.assertRaisesRegex(RuntimeError, "add: expected 'other' to be a CPU tensor\\, but got a CUDA tensor"): - x + sparse_y + self.assertExpectedRaises(RuntimeError, lambda: x + sparse_y) if __name__ == '__main__': From 9eb72889b49a2fbeeb93313d38c1f7ece7eed014 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Tue, 18 Sep 2018 12:14:38 -0700 Subject: [PATCH 228/237] Add successor/predecessor functions Summary: More functionality to prep nomnigraph for scheduler implementations Reviewed By: duc0 Differential Revision: D9794686 fbshipit-source-id: b460859d8ff965d0049b2a696bd8d2f5c97f3f86 --- caffe2/python/nomnigraph.py | 2 ++ caffe2/python/nomnigraph_test.py | 33 ++++++++++++++++++++++++ caffe2/python/pybind_state_nomni.cc | 39 ++++++++++++++++++++++++++--- 3 files changed, 70 insertions(+), 4 deletions(-) diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index 417ad64ccc2bb2..abe1971680a7e9 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -68,7 +68,9 @@ def render(s): NeuralNetOperator = C.NeuralNetOperator +Operator = C.NeuralNetOperator NeuralNetData = C.NeuralNetData +Data = C.NeuralNetData NNSubgraph = C.NNSubgraph NNMatchGraph = C.NNMatchGraph Graph = C.Graph diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 1956d2e9f01b68..9288364bbcb944 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -175,6 +175,39 @@ def test_convertToProto(self): for a, b in zip(new_netdef.external_output, net.Proto().external_output): assert a == b + def test_node_interactions(self): + nn = ng.NNModule() + dfg = nn.dataFlow + test1 = dfg.createNode(ng.Operator("test1")) + test2 = dfg.createNode(ng.Operator("test2")) + x = dfg.createNode(ng.Data("x")) + dfg.createEdge(test1, x) + dfg.createEdge(x, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 1 + assert p[0] == test1 + + # Add another node + test3 = dfg.createNode(ng.Operator("test3")) + y = dfg.createNode(ng.Data("y")) + dfg.createEdge(test3, y) + dfg.createEdge(y, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 2 + assert test1 in p + assert test3 in p + + # Successors + assert len(test2.getOperatorSuccessors()) == 0 + assert len(test1.getOperatorSuccessors()) == 1 + assert test1.getOperatorSuccessors()[0] == test2 + + # Check all the nodes are valid (pybind ownership test) + for node in [test1, test2, test3]: + assert node.isOperator() + for node in [x, y]: + assert node.isTensor() + def test_annotation_basic(self): annot = ng.Annotation() annot.setDevice("woot") diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index fa7baf60c92541..1f05d3bd1beeb4 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -202,10 +202,41 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "getAnnotation", [](NNGraph::NodeRef n) { return getOrAddCaffe2Annotation(n); }) - 
.def("setAnnotation", [](NNGraph::NodeRef n, Caffe2Annotation annot) { - auto* nnOp = nn::get(n); - nnOp->setAnnotation(nom::util::make_unique(annot)); - }); + .def( + "setAnnotation", + [](NNGraph::NodeRef n, Caffe2Annotation annot) { + auto* nnOp = nn::get(n); + nnOp->setAnnotation( + nom::util::make_unique(annot)); + }) + .def( + "getOperatorPredecessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector pred; + for (const auto& inEdge : n->getInEdges()) { + auto data = inEdge->tail(); + if (nn::hasProducer(data)) { + pred.emplace_back(nn::getProducer(data)); + } + } + return pred; + }, + py::return_value_policy::reference) + .def( + "getOperatorSuccessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector succ; + for (const auto& outEdge : n->getOutEdges()) { + auto data = outEdge->head(); + for (const auto& consumer : nn::getConsumers(data)) { + succ.emplace_back(consumer); + } + } + return succ; + }, + py::return_value_policy::reference); py::class_ nnop(m, "NeuralNetOperator"); py::class_ nndata(m, "NeuralNetData"); From 1d399a80a03bb2b2f2e6db514c48c6cfd08363a0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 18 Sep 2018 13:10:08 -0700 Subject: [PATCH 229/237] Handle pollution of MAX, MIN and CHECK macros. (#11805) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11805 Some of our headers in Caffe2 pollute the macro namespace with things like MAX, MIN, CHECK, so I renamed these in places where this is a problem. This patch courtesy of gchanan, extracted out of #11721 Reviewed By: Yangqing Differential Revision: D9917757 fbshipit-source-id: 17fc692ca04b208dcb8ae00731ed60e393284f7c --- aten/src/TH/THTensorCopy.cpp | 2 ++ aten/src/TH/generic/THTensorCopy.cpp | 18 +++++++----------- torch/csrc/cuda/nccl.cpp | 4 ++-- torch/csrc/cuda/nccl.h | 6 +++--- torch/csrc/cuda/python_nccl.cpp | 14 +++++++------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/aten/src/TH/THTensorCopy.cpp b/aten/src/TH/THTensorCopy.cpp index d8df519e26bdbc..482a7b986f5302 100644 --- a/aten/src/TH/THTensorCopy.cpp +++ b/aten/src/TH/THTensorCopy.cpp @@ -1,6 +1,8 @@ #include "THTensor.hpp" #include "THVector.h" +#include + #include "generic/THTensorCopy.cpp" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp index f04c75cd3f9105..91a60275a6c03e 100644 --- a/aten/src/TH/generic/THTensorCopy.cpp +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -26,13 +26,11 @@ int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { // special case copy where tensor is contiguous and src is a transposed matrix // This can be generalized to most copies, but it's tricker void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { - #define MIN(x, y) (((x) < (y)) ? (x) : (y)) - #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #ifdef TH_REAL_IS_BYTE - const int BLOCK_SZ = 120; + const int64_t BLOCK_SZ = 120; #else - const int BLOCK_SZ = 60; + const int64_t BLOCK_SZ = 60; #endif THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); @@ -48,8 +46,8 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { scalar_t *spo = sp + R + C * NR; scalar_t *rpo = rp + C + R * NC; - int nr = MIN(NR - R, BLOCK_SZ); - int nc = MIN(NC - C, BLOCK_SZ); + int nr = std::min(NR - R, BLOCK_SZ); + int nc = std::min(NC - C, BLOCK_SZ); // 1. 
copy columns from src to buf for (int c = 0; c < nc; c++) { @@ -57,10 +55,10 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } // 2. transpose buf in place - int rc_max = MAX(nr, nc); - int rc_min = MIN(nr, nc); + int rc_max = std::max(nr, nc); + int rc_min = std::min(nr, nc); for (int r = 0; r < rc_max; r++) { - int end = MIN(r, rc_min); + int end = std::min(r, rc_min); for (int c = 0; c < end; c++) { scalar_t tmp = bp[r + BLOCK_SZ * c]; bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; @@ -75,8 +73,6 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } } c10::raw::intrusive_ptr::decref(buf); - #undef MIN - #undef MAX } void THTensor_(copy)(THTensor *tensor, THTensor *src) diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index ab0fecfd761c24..e769b85f8e2d90 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -28,7 +28,7 @@ struct NcclCommList { int ndevices; NcclCommList(const std::vector& devices) : comms(new ncclComm_t[devices.size()]), ndevices(devices.size()) { - CHECK(ncclCommInitAll(comms.get(), devices.size(), devices.data())); + NCCL_CHECK(ncclCommInitAll(comms.get(), devices.size(), devices.data())); } NcclCommList(NcclCommList&& foo) = default; ~NcclCommList() { @@ -219,7 +219,7 @@ void broadcast(TensorList tensors, const stream_list& streams, const comm_list& AT_CHECK(static_cast(numel) <= static_cast(count_max), "Broadcast tensor has ", numel, " elements, which exceeds the " "maximum NCCL supports (", count_max, ")"); - CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream)); + NCCL_CHECK(ncclBcast(tensors[i].data_ptr(), numel, data_type, 0, comms[i], stream)); } #else throw std::runtime_error("PyTorch built without NCCL support"); diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index 919363474b9f9f..349d8bcfdf507f 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -12,7 +12,7 @@ namespace detail { void throw_nccl_error(ncclResult_t status); -static inline void CHECK(ncclResult_t status) { +static inline void NCCL_CHECK(ncclResult_t status) { if (status != ncclSuccess) { throw_nccl_error(status); } @@ -21,12 +21,12 @@ static inline void CHECK(ncclResult_t status) { struct AutoNcclGroup { AutoNcclGroup() { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - CHECK(ncclGroupStart()); + NCCL_CHECK(ncclGroupStart()); #endif } ~AutoNcclGroup() { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - CHECK(ncclGroupEnd()); + NCCL_CHECK(ncclGroupEnd()); #endif } }; diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index 49d36e51649a4a..79a859f20d1075 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -26,7 +26,7 @@ PyObject * THCPModule_nccl_version(PyObject *self, PyObject *args) { PyObject * THCPModule_nccl_unique_id(PyObject *self, PyObject *args) { HANDLE_TH_ERRORS ncclUniqueId id; - CHECK(ncclGetUniqueId(&id)); + NCCL_CHECK(ncclGetUniqueId(&id)); return PyBytes_FromStringAndSize((char*)&id, NCCL_UNIQUE_ID_BYTES); END_HANDLE_TH_ERRORS } @@ -109,7 +109,7 @@ PyObject * THCPModule_nccl_init_rank(PyObject *self, PyObject *args) { memcpy(&commId, id, NCCL_UNIQUE_ID_BYTES); ncclComm_t comm; with_no_gil([&]{ - CHECK(ncclCommInitRank(&comm, nranks, commId, rank)); + NCCL_CHECK(ncclCommInitRank(&comm, nranks, commId, rank)); }); return PyCapsule_New(comm, COMM_CAPSULE_NAME, &destroy_nccl_comm); END_HANDLE_TH_ERRORS @@ -149,7 +149,7 @@ PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) { int device = 
inputs[i].get_device(); device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); - CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, (ncclRedOp_t) op, root, comms[i], stream)); } }); @@ -191,7 +191,7 @@ PyObject * THCPModule_nccl_all_reduce(PyObject *self, PyObject *args) { int device = inputs[i].get_device(); device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); - CHECK(ncclAllReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclAllReduce(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, (ncclRedOp_t) op, comms[i], stream)); } }); @@ -255,10 +255,10 @@ PyObject * THCPModule_nccl_all_gather(PyObject *self, PyObject *args) { device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - CHECK(ncclAllGather(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclAllGather(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, comms[i], stream)); #else - CHECK(ncclAllGather(inputs[i].data_ptr(), count, data_type, + NCCL_CHECK(ncclAllGather(inputs[i].data_ptr(), count, data_type, outputs[i].data_ptr(), comms[i], stream)); #endif } @@ -299,7 +299,7 @@ PyObject * THCPModule_nccl_reduce_scatter(PyObject *self, PyObject *args) { int device = inputs[i].get_device(); device_guard.set_index(device); auto stream = (streams[i] == nullptr) ? nullptr : THCStream_stream(streams[i]); - CHECK(ncclReduceScatter(inputs[i].data_ptr(), outputs[i].data_ptr(), + NCCL_CHECK(ncclReduceScatter(inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, (ncclRedOp_t) op, comms[i], stream)); } }); From 7d25fa3c721f6b515af5bbfc704e96a978bef3c9 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 18 Sep 2018 13:39:42 -0700 Subject: [PATCH 230/237] Emit Undefined type for value when it is Dynamic type (#11810) Summary: For example, outputs of control blocks often have Dynamic type, and when we try to export them to ONNX we get an invalid proto, since `elem_type` is not populated on the TypeInfoProto. 
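For illustration only (not part of this patch), a minimal repro sketch in the spirit of the script-module export tests in test/test_jit.py; the exact scripting idioms (the bool() cast, the module shape) are assumptions, but torch.onnx.export_to_pretty_string with example_outputs is the same entry point those tests use:

    # Hypothetical sketch: export a script module whose if-block output is
    # typed Dynamic. Before this change the emitted TypeInfoProto carried no
    # elem_type, so the ONNX checker rejected the proto.
    import torch

    class IfModule(torch.jit.ScriptModule):
        @torch.jit.script_method
        def forward(self, x):
            if bool(x.sum() > 0):
                y = torch.neg(x)
            else:
                y = x
            return y

    m = IfModule()
    example = torch.zeros(1, 2, 3)
    # example_outputs is needed when exporting script modules
    torch.onnx.export_to_pretty_string(
        m, (example,), None, verbose=False, example_outputs=m(example))
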
This makes it so at least we can get past the checker, since having a dynamic typed output from a control block should still be semantically valid Pull Request resolved: https://github.com/pytorch/pytorch/pull/11810 Differential Revision: D9922754 Pulled By: jamesr66a fbshipit-source-id: 5c66113cc302a9d9b8b9f5a8605473d3c6ad5af1 --- torch/csrc/jit/export.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 3a0500b765dead..1984f35fcc8974 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -179,6 +179,8 @@ void EncoderBase::EncodeValueInfo( shape->mutable_dim(i)->set_dim_value(sizes[i]); } tensor_type->set_elem_type(ATenTypeToOnnxType(node_type->scalarType())); + } else { + tensor_type->set_elem_type(onnx::TensorProto_DataType_UNDEFINED); } } @@ -500,6 +502,7 @@ void ModuleEncoder::EncodeTypeInfo( auto kind = type->kind(); if (kind == TypeKind::DynamicType) { type_proto->set_denotation("DynamicType"); + tensortype_proto->set_elem_type(onnx::TensorProto_DataType_UNDEFINED); } else if (kind == TypeKind::TensorType) { type_proto->set_denotation("TensorType"); // encode the number of dimensions by pushing that number of ones into the shape proto From d4e1fa45d055a1b00e8b7cfefa5c9f5db9ae6160 Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Tue, 18 Sep 2018 13:41:11 -0700 Subject: [PATCH 231/237] allow no-alpha add/sub in onnx symbolic (#10972) Summary: The PR fixes #10873 The context is aten::add and aten::sub ST overloads don't have alpha, so onnx symbolic does not match. Pull Request resolved: https://github.com/pytorch/pytorch/pull/10972 Reviewed By: jamesr66a Differential Revision: D9724224 Pulled By: wanchaol fbshipit-source-id: eb5d1b09fa8f1604b288f4a62b8d1f0bc66611af --- ...nnx_export_script_non_alpha_add_sub.expect | 22 +++++++++++++++++++ .../expect/TestOperators.test_rsub.expect | 4 ++-- test/test_jit.py | 16 ++++++++++++++ torch/onnx/symbolic.py | 17 ++++++-------- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect diff --git a/test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect b/test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect new file mode 100644 index 00000000000000..1c2b3c655d3324 --- /dev/null +++ b/test/expect/TestScript.test_onnx_export_script_non_alpha_add_sub.expect @@ -0,0 +1,22 @@ +ModelProto { + producer_name: "pytorch" + domain: "" + doc_string: "" + graph: + GraphProto { + name: "torch-jit-export" + inputs: [{name: "x", type:Tensor dims: 3 4}] + outputs: [{name: "7", type:Tensor dims: 1}] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Shape", inputs: [x], outputs: [2], attributes: []}, + Node {type: "Gather", inputs: [2,1], outputs: [3], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Add", inputs: [3,4], outputs: [5], attributes: []}, + Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Sub", inputs: [5,6], outputs: [7], attributes: []} + ] + } + opset_import: [OperatorSetIdProto { domain: }], +} diff --git a/test/onnx/expect/TestOperators.test_rsub.expect 
b/test/onnx/expect/TestOperators.test_rsub.expect index 49fa976a5e1bd6..7f2e5284e0cef8 100644 --- a/test/onnx/expect/TestOperators.test_rsub.expect +++ b/test/onnx/expect/TestOperators.test_rsub.expect @@ -8,8 +8,8 @@ graph { attribute { name: "value" t { - data_type: DOUBLE - raw_data: "\000\000\000\000\000\000\360?" + data_type: INT64 + raw_data: "\001\000\000\000\000\000\000\000" } type: TENSOR } diff --git a/test/test_jit.py b/test/test_jit.py index d91b5b4653d57c..b77846908f72ab 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -5331,6 +5331,22 @@ def forward(self, x): mte, (torch.zeros(1, 2, 3),), None, verbose=False, example_outputs=outputs, export_raw_ir=True)) + def test_onnx_export_script_non_alpha_add_sub(self): + class ModuleToExport(torch.jit.ScriptModule): + def __init__(self): + super(ModuleToExport, self).__init__() + + @torch.jit.script_method + def forward(self, x): + bs = x.size(0) + 1 + return bs - 1 + + mte = ModuleToExport() + outputs = torch.LongTensor([mte(torch.rand(3, 4))]) + self.assertExpected(torch.onnx.export_to_pretty_string( + mte, (torch.rand(3, 4),), None, verbose=False, + example_outputs=outputs)) + def test_onnx_export_script_module_if(self): class ModuleToExport(torch.jit.ScriptModule): def __init__(self): diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index d5b586c3846d6f..3f334304705ded 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -192,25 +192,22 @@ def unused(g): return g.op("prim::Undefined") -@parse_args('v', 'v', 't') -def add(g, self, other, alpha): - if _scalar(alpha) != 1: +def add(g, self, other, alpha=None): + # default alpha arg is to allow no-alpha add (aten add st overload no alpha) + if alpha and _scalar(_maybe_get_scalar(alpha)) != 1: return _unimplemented("add", "alpha != 1") # See Note [Pointwise by scalar] other = _maybe_get_scalar(other) return g.op("Add", self, _if_scalar_type_as(g, other, self)) -@parse_args('v', 'v', 't') -def sub(g, self, other, alpha): - if _scalar(alpha) != 1: +def sub(g, self, other, alpha=None): + # default alpha arg is to allow no-alpha sub (aten sub st overload no alpha) + if alpha and _scalar(_maybe_get_scalar(alpha)) != 1: return _unimplemented("sub", "alpha != 1") # See Note [Pointwise by scalar]. Note that self or other may be scalars. 
other = _maybe_get_scalar(other) - self = _maybe_get_scalar(self) - self = _if_scalar_type_as(g, self, other) - other = _if_scalar_type_as(g, other, self) - return g.op("Sub", self, other) + return g.op("Sub", self, _if_scalar_type_as(g, other, self)) def mul(g, self, other): From 8ad846fda55f8cd04e832440c794ace04daf40fa Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Tue, 18 Sep 2018 14:00:52 -0700 Subject: [PATCH 232/237] Don't build Detectron ops with NO_CAFFE2_OPS=1 (#11799) Summary: cc apaszke Pull Request resolved: https://github.com/pytorch/pytorch/pull/11799 Differential Revision: D9922745 Pulled By: orionr fbshipit-source-id: b88724b7c2919aabc00d98658e8e563233e01c85 --- modules/detectron/CMakeLists.txt | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt index 1791ca27a98590..971baa41adebbe 100644 --- a/modules/detectron/CMakeLists.txt +++ b/modules/detectron/CMakeLists.txt @@ -1,18 +1,20 @@ file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) -# Note(ilijar): Since Detectron ops currently have no -# CPU implementation, we only build GPU ops for now. -if (USE_CUDA) - CUDA_ADD_LIBRARY( - caffe2_detectron_ops_gpu SHARED - ${Detectron_CPU_SRCS} - ${Detectron_GPU_SRCS}) +if (BUILD_CAFFE2_OPS) + # Note(ilijar): Since Detectron ops currently have no + # CPU implementation, we only build GPU ops for now. + if (USE_CUDA) + CUDA_ADD_LIBRARY( + caffe2_detectron_ops_gpu SHARED + ${Detectron_CPU_SRCS} + ${Detectron_GPU_SRCS}) - target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) - install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) -elseif(NOT IOS_PLATFORM) - add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) - target_link_libraries(caffe2_detectron_ops caffe2) - install(TARGETS caffe2_detectron_ops DESTINATION lib) + target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) + install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) + elseif(NOT IOS_PLATFORM) + add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) + target_link_libraries(caffe2_detectron_ops caffe2) + install(TARGETS caffe2_detectron_ops DESTINATION lib) + endif() endif() From e585f2fb480f1bdf2a570a0f049ab3e4f45ca3a1 Mon Sep 17 00:00:00 2001 From: sven Date: Tue, 18 Sep 2018 14:47:54 -0700 Subject: [PATCH 233/237] Polish CPP docs, Minor Python Docs Fixes (#11722) Differential Revision: D9919120 Pulled By: goldsborough fbshipit-source-id: bf14cbe4ab79524495957cb749828046af864aab --- .gitignore | 5 +- .travis.yml | 2 +- docs/Makefile | 2 +- docs/cpp/Doxyfile | 2032 ----------------- docs/cpp/Makefile | 14 +- docs/cpp/source/Doxyfile | 132 ++ docs/cpp/{ => source}/building.rst | 0 docs/cpp/{ => source}/check-doxygen.sh | 8 +- docs/cpp/{ => source}/conf.py | 56 +- docs/cpp/{ => source}/contributing.rst | 0 docs/cpp/{ => source}/examples.rst | 0 docs/cpp/{ => source}/index.rst | 0 docs/source/conf.py | 26 +- torch/csrc/api/include/torch/nn/module.h | 3 + .../api/include/torch/nn/modules/functional.h | 18 +- 15 files changed, 223 insertions(+), 2075 deletions(-) delete mode 100644 docs/cpp/Doxyfile create mode 100644 docs/cpp/source/Doxyfile rename docs/cpp/{ => source}/building.rst (100%) rename docs/cpp/{ => source}/check-doxygen.sh (60%) rename docs/cpp/{ => source}/conf.py (81%) rename docs/cpp/{ => source}/contributing.rst (100%) rename docs/cpp/{ => source}/examples.rst (100%) rename docs/cpp/{ 
=> source}/index.rst (100%) diff --git a/.gitignore b/.gitignore index b600e7515b9134..a56ae2ab81df84 100644 --- a/.gitignore +++ b/.gitignore @@ -25,9 +25,8 @@ aten/src/ATen/cuda/CUDAConfig.h build/ dist/ docs/src/**/* -docs/cpp/xml/ -docs/cpp/html/ -docs/cpp/api/ +docs/cpp/build +docs/cpp/source/api test/.coverage test/cpp/api/mnist test/custom_operator/model.pt diff --git a/.travis.yml b/.travis.yml index be45e69f67cb2f..77d430ee8917a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,4 +28,4 @@ matrix: script: mypy @mypy-files.txt - env: CPP_DOC_CHECK install: sudo apt-get install -y doxygen - script: cd docs/cpp && ./check-doxygen.sh + script: cd docs/cpp/source && ./check-doxygen.sh diff --git a/docs/Makefile b/docs/Makefile index 4a56c12ca22d89..59c2397bb023e1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -39,4 +39,4 @@ html-stable: clean: @echo "Removing everything under 'build'.." - @rm -r $(BUILDDIR)/html/ $(BUILDDIR)/doctrees + @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile deleted file mode 100644 index 37abadc83ff7ca..00000000000000 --- a/docs/cpp/Doxyfile +++ /dev/null @@ -1,2032 +0,0 @@ -# Doxyfile 1.8.14 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See -# https://www.gnu.org/software/libiconv/ for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "PyTorch" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. 
- -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = build - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. 
Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = YES - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. - -ALIASES = "rst=\verbatim embed:rst:leading-asterisk" -ALIASES += "endrst=\endverbatim" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. 
-# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. 
file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. 
- -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = NO - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. 
- -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. 
Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -GENERATE_LATEX = NO - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = YES - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. 
-# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = NO - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = ../../torch/csrc/api/include \ - ../../torch/csrc/api/src \ - ../../torch/csrc/jit/custom_operator.h \ - ../../torch/csrc/jit/import.h \ - ../../torch/csrc/jit/ivalue.h \ - ../../torch/csrc/jit/script/module.h \ - ../../aten/src/ATen/ATen.h \ - ../../aten/src/ATen/Backend.h \ - ../../aten/src/ATen/Device.h \ - ../../aten/src/ATen/DeviceGuard.h \ - ../../aten/src/ATen/Layout.h \ - ../../aten/src/ATen/OptionsGuard.h \ - ../../aten/src/ATen/Scalar.h \ - ../../aten/src/ATen/TensorOptions.h \ - ../../aten/src/ATen/core/ArrayRef.h \ - ../../aten/src/ATen/core/DeviceType.h \ - ../../aten/src/ATen/core/Error.h \ - ../../aten/src/ATen/core/Half.h \ - ../../aten/src/ATen/core/ScalarType.h \ - ../../aten/src/ATen/cuda/CUDAGuard.h \ - ../../aten/src/ATen/cuda/CUDAStream.h \ - ../../aten/src/ATen/cuda/CUDAContext.h \ - ../../aten/src/ATen/cudnn/Descriptors.h \ - ../../aten/src/ATen/cudnn/Handles.h \ - ../../aten/src/ATen/cudnn/Types.h \ - ../../aten/src/ATen/cudnn/Utils.h \ - ../../aten/src/ATen/mkl/Descriptors.h \ - ../../build/aten/src/ATen/Tensor.h \ - ../../build/aten/src/ATen/Functions.h \ - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. 
Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. -# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, -# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. - -FILE_PATTERNS = *.h - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. 
- -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# -# -# where is the value of the INPUT_FILTER tag, and is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. 
- -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see https://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = NO - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# https://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. 
-
-HTML_DYNAMIC_SECTIONS = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# https://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/