
Commit d341b66

kit1980 authored and pytorchmergebot committed
Revert [dynamo] support group=None when rewriting collectives (#12018) (#120677)
This reverts commit 298c686.

Pull Request resolved: #120677
Approved by: https://github.com/yifuwang, https://github.com/huydhn
1 parent fdae936 commit d341b66

14 files changed: +20 -96 lines changed
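The PR being reverted taught Dynamo's collective rewrite (the `traceable_collective_remaps` machinery referenced in the functions.py hunk below) to accept `group=None` or an omitted `group`; after this revert, only calls that pass an explicit process group are rewritten. A minimal sketch of the two call shapes, assuming a single-rank "gloo" group initialized via env:// on localhost (none of this setup comes from the commit itself):

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

def allreduce_explicit(t):
    # Explicit process group: the form the rewrite still handles.
    dist.all_reduce(t, group=dist.group.WORLD)
    return t

def allreduce_default(t):
    # group omitted (i.e. None, the default WORLD group): the form whose
    # Dynamo rewrite support is removed by this revert.
    dist.all_reduce(t)
    return t

print(allreduce_explicit(torch.ones(2)), allreduce_default(torch.ones(2)))

# Invoking this wrapper is what exercises the Dynamo rewrite; it is only
# constructed here, since running it needs a suitable distributed setup.
compiled = torch.compile(allreduce_explicit, fullgraph=True)

dist.destroy_process_group()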

benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ mobilenet_v3_large,pass,0
-moco,pass,5
+moco,pass,11

benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ mobilenet_v3_large,pass,7
-moco,pass,11
+moco,pass,17

benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ mobilenet_v3_large,pass,0
-moco,pass,5
+moco,pass,11

benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv

Lines changed: 1 addition & 1 deletion
@@ -178,7 +178,7 @@ mobilenet_v3_large,pass,7
-moco,pass,11
+moco,pass,17

benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ mobilenet_v3_large,pass,0
-moco,pass,5
+moco,pass,11

benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv

Lines changed: 1 addition & 1 deletion
@@ -178,7 +178,7 @@ mobilenet_v3_large,pass,7
-moco,pass,11
+moco,pass,17

benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ mobilenet_v3_large,pass,0
-moco,pass,5
+moco,pass,11

benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ mobilenet_v3_large,pass,7
-moco,pass,11
+moco,pass,17

benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ mobilenet_v3_large,pass,0
-moco,pass,5
+moco,pass,11

benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ mobilenet_v3_large,pass,7
-moco,pass,11
+moco,pass,17

test/distributed/test_inductor_collectives.py

Lines changed: 7 additions & 32 deletions
@@ -22,11 +22,7 @@
     run_with_both_funcol_impls_with_arg,
     skip_if_lt_x_gpu,
 )
-from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
-    parametrize,
-    requires_cuda,
-)
+from torch.testing._internal.common_utils import instantiate_parametrized_tests, requires_cuda
 from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
 from torch.utils._triton import has_triton
 from torch._inductor.utils import run_and_get_triton_code
@@ -829,43 +825,22 @@ def func(inp, out, *, pg):
         assert same(outputs, correct_outputs)

     @run_with_both_funcol_impls
-    @parametrize(
-        "pg_mode",
-        [
-            "kwargs",
-            "kwargs_none",
-            "unspecified",
-        ]
-    )
-    def test_dynamo_rewrite_dist_allreduce(self, pg_mode):
-
-        def func(tensor, *args, **kwargs):
+    def test_dynamo_rewrite_dist_allreduce(self):
+
+        def func(tensor, pg):
             torch.distributed.all_reduce(
                 tensor,
-                *args,
-                **kwargs,
+                group=pg
             )

         counter = CompileCounter()
         compiled = torch.compile(func, backend=counter, fullgraph=True)

-        args = []
-        kwargs = {}
-
-        # TODO(yifu): test positional and positional_none
-        # once explicit reduce op is supported
-        if pg_mode == "kwargs":
-            kwargs["group"] = GroupMember.WORLD
-        elif pg_mode == "kwargs_none":
-            kwargs["group"] = None
-        else:
-            assert pg_mode == "unspecified"
-
         inputs_compiled = torch.ones(2, device=self.device)
         inputs_eager = torch.ones(2, device=self.device)

-        compiled(inputs_compiled, *args, **kwargs)
-        func(inputs_eager, *args, **kwargs)
+        compiled(inputs_compiled, GroupMember.WORLD)
+        func(inputs_eager, GroupMember.WORLD)

         assert counter.frame_count == 1
         # should test more precisely, but the 3 is supposed to be (all_reduce, wait, copy_)
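For reference, the dropped "pg_mode" parametrization covered three ways of passing the process group; a sketch of those call shapes outside the test harness (the GroupMember import path is the conventional one and is assumed here, not shown in this diff):

import torch.distributed as dist
from torch.distributed.distributed_c10d import GroupMember

def allreduce_kwargs(t):
    dist.all_reduce(t, group=GroupMember.WORLD)  # pg_mode == "kwargs"

def allreduce_kwargs_none(t):
    dist.all_reduce(t, group=None)               # pg_mode == "kwargs_none"

def allreduce_unspecified(t):
    dist.all_reduce(t)                           # pg_mode == "unspecified"

Only the first, explicit-group shape remains under test after the revert.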

torch/_dynamo/variables/distributed.py

Lines changed: 0 additions & 29 deletions
@@ -6,8 +6,6 @@
 import torch
 from .. import variables
 from ..exc import unimplemented
-from ..guards import GuardBuilder, install_guard
-from ..source import AttrSource, GlobalSource
 from ..utils import istype
 from .base import VariableTracker
 from .constant import ConstantVariable
@@ -257,30 +255,3 @@ def is_process_group(value):
         from torch.testing._internal.distributed.fake_pg import FakeProcessGroup

         return istype(value, (ProcessGroup, FakeProcessGroup))
-
-    @staticmethod
-    def get_global_pg_variable():
-        """
-        Make a ProcessGroupVariable from torch.distributed.group.WORLD and
-        intall guards.
-        """
-        import torch.distributed as dist
-
-        source = AttrSource(
-            AttrSource(
-                base=AttrSource(
-                    base=GlobalSource(global_name="torch"),
-                    member="distributed",
-                    get_static=False,
-                ),
-                member="group",
-                get_static=False,
-            ),
-            member="WORLD",
-            get_static=False,
-        )
-        install_guard(source.make_guard(GuardBuilder.ID_MATCH))
-        return ProcessGroupVariable(
-            dist.group.WORLD,
-            source=source,
-        )
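The deleted helper built a guarded reference to torch.distributed.group.WORLD so that a missing or None group could be substituted with the default group. The eager-mode fact it leaned on is that all_reduce with group=None (or with group omitted) targets the WORLD group. A small check of that equivalence, assuming a single-rank "gloo" group (the setup details are illustrative, not from this commit):

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

a, b = torch.ones(2), torch.ones(2)
dist.all_reduce(a)                          # group omitted -> default group
dist.all_reduce(b, group=dist.group.WORLD)  # explicit WORLD group
assert torch.equal(a, b)

dist.destroy_process_group()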

torch/_dynamo/variables/functions.py

Lines changed: 0 additions & 12 deletions
@@ -17,7 +17,6 @@
 from ..utils import check_constant_args, get_first_attr, identity, istype, make_cell
 from .base import MutableLocal, typestr, VariableTracker
 from .constant import ConstantVariable
-from .distributed import ProcessGroupVariable

 if TYPE_CHECKING:
     from torch._guards import Source
@@ -687,21 +686,10 @@ def call_function(
         # call_function must check any unsupported arguments and graph-break.
         # It's safe to assume args/kwargs from orig_fn map 1:1 to args/kwargs of remapped_fn,
         # since that's the contract for putting a mapping in `traceable_collective_remaps`
-
-        # Merge args into kwargs so positional and keyword args
-        # can be processed the same way.
-        signature = inspect.signature(self.fn)
-        kwargs = dict(signature.bind(*args, **kwargs).arguments)
-        args = ()
-
         if "async_op" in kwargs and kwargs["async_op"].as_python_constant():
             unimplemented(
                 f"CollectiveFunctionRewriteVariable can't support async_op=True for {self.fn}"
             )
-
-        if kwargs.get("group") is None or kwargs["group"].value is None:
-            kwargs["group"] = ProcessGroupVariable.get_global_pg_variable()
-
         return self.replacement_var.call_function(tx, args, kwargs)
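The first deleted block normalized positional arguments into kwargs with inspect.signature(...).bind(...). A standalone illustration of that idiom (the simplified all_reduce signature below is a stand-in for demonstration, not the real one):

import inspect

def all_reduce(tensor, op="SUM", group=None, async_op=False):
    """Stand-in signature used only to demonstrate the binding idiom."""

sig = inspect.signature(all_reduce)
print(dict(sig.bind("t0", group="my_pg").arguments))
# -> {'tensor': 't0', 'group': 'my_pg'}
print(dict(sig.bind("t0", "SUM", None).arguments))
# -> {'tensor': 't0', 'op': 'SUM', 'group': None}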

torch/distributed/_functional_collectives.py

Lines changed: 3 additions & 13 deletions
@@ -1035,20 +1035,10 @@ def all_gather_inplace(
     assert (
         not async_op
     ), "Can't remap async version of inplace op to functional collective"
-    assert all(
-        t.size(0) == tensor.size(0) for t in tensor_list
-    ), "Remapping variable size all_gather is not yet supported"
-
     output = all_gather_tensor(tensor, 0, group, tag)
-
-    # Use aten.slice as instead of aten.split because the latter causes
-    # tensor.shape(0) to be unnecessarily baked in when it's a SymInt.
-    output_splits = []
-    offset = 0
-    for t in tensor_list:
-        output_splits.append(output[offset : offset + t.size(0)])
-        offset += t.size(0)
-    for dst, src in zip(tensor_list, output_splits):
+    for dst, src in zip(
+        tensor_list, output.split([t.size(0) for t in tensor_list], dim=0)
+    ):
         dst.copy_(src)
     return tensor_list
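The restored code relies on Tensor.split accepting a list of per-chunk sizes, which yields the same views as the deleted manual-offset slicing (the slicing was originally preferred to avoid baking a SymInt size into the graph, per the removed comment). A small CPU-only check with made-up shapes:

import torch

output = torch.arange(10, dtype=torch.float32).unsqueeze(1)  # shape [10, 1]
tensor_list = [torch.empty(3, 1), torch.empty(3, 1), torch.empty(4, 1)]

# Deleted approach: explicit offsets and slicing.
splits_slice, offset = [], 0
for t in tensor_list:
    splits_slice.append(output[offset : offset + t.size(0)])
    offset += t.size(0)

# Restored approach: one split() call with per-destination sizes.
splits_split = output.split([t.size(0) for t in tensor_list], dim=0)

for a, b in zip(splits_slice, splits_split):
    assert torch.equal(a, b)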
