Commit bdbdd45 (parent: 4d51ae2)

a2a dispatch and combine configurable separately

3 files changed: +37 -19 lines

torchtitan/config/job_config.py

Lines changed: 9 additions & 3 deletions
@@ -399,10 +399,16 @@ class Parallelism:
     Note that this is still an experimental feature.
     """
 
-    expert_parallel_a2a_impl: Literal["default", "mxfp8"] = "default"
+    expert_parallel_a2a_dispatch_impl: Literal["default", "mxfp8"] = "default"
     """
-    MXFP8 all-to-all removes the need for device-to-host sync and optimizes network bandwidth usage
-    by using dynamic MXFP8 quantization on the all-to-all inputs, then dequantizes the outputs.
+    MXFP8 all-to-all optimizes network bandwidth usage by using dynamic MXFP8 quantization on the all-to-all
+    inputs, then dequantizing the outputs.
+    """
+
+    expert_parallel_a2a_combine_impl: Literal["default", "mxfp8"] = "default"
+    """
+    MXFP8 all-to-all optimizes network bandwidth usage by using dynamic MXFP8 quantization on the all-to-all
+    inputs, then dequantizing the outputs.
     """
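
A brief usage sketch (not part of the commit): the two fields can now be set independently in Python, assuming Parallelism is a plain dataclass whose other fields all have defaults.

from torchtitan.config.job_config import Parallelism

# Quantize the dispatch all-to-all with MXFP8, keep the combine unquantized.
parallelism = Parallelism(
    expert_parallel_a2a_dispatch_impl="mxfp8",
    expert_parallel_a2a_combine_impl="default",
)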

torchtitan/distributed/expert_parallel.py

Lines changed: 14 additions & 12 deletions
@@ -22,7 +22,6 @@
 )
 from torch.distributed.tensor.parallel import ParallelStyle
 
-from torchtitan.tools.logging import logger
 from torchtitan.tools.utils import _round_up
 
 
@@ -90,18 +89,19 @@ class ExpertParallel(ParallelStyle):
         a2a_impl (str): The implementation of all-to-all. Default is "default". Options are ["default","mxfp8"].
     """
 
-    def __init__(self, a2a_impl: str = "default"):
+    def __init__(
+        self, a2a_dispatch_impl: str = "default", a2a_combine_impl: str = "default"
+    ):
         super().__init__()
         self.input_splits = None
         self.output_splits = None
-        self.a2a_func = self._get_a2a_func(a2a_impl)
+        self.a2a_dispatch_func = self._get_a2a_func(a2a_dispatch_impl)
+        self.a2a_combine_func = self._get_a2a_func(a2a_combine_impl)
 
     def _get_a2a_func(self, a2a_impl: str):
         if a2a_impl == "default":
-            logger.info("Using default all-to-all implementation")
             return all_to_all_single_autograd
         elif a2a_impl == "mxfp8":
-            logger.info("Using mxfp8 all-to-all implementation")
             from torchao.prototype.moe_training.kernels.mxfp8.comms import (
                 to_mxfp8_a2a_dequant,
             )
@@ -143,6 +143,13 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         self.input_splits = input_splits.tolist()
         self.output_splits = output_splits.tolist()
 
+        routed_input = self.a2a_dispatch_func(
+            routed_input,
+            self.output_splits,
+            self.input_splits,
+            device_mesh.get_group(),
+        )
+
         # NOTE: After this all-to-all, the routed input is put on proper EP rank.
         # However, the num_tokens_per_expert_group is not of the final target format
         # [#tokens for local expert 0, #tokens for local expert 1, ...]
@@ -152,12 +159,7 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         # We need to perform another shuffle to get the correct format -- this is done via the function
         # generate_permute_indices in moe.py, which also does padding to make sure the number of tokens
         # each expert gets locally is a multiple of ALIGN_SIZE_M.
-        routed_input = self.a2a_func(
-            routed_input,
-            self.output_splits,
-            self.input_splits,
-            device_mesh.get_group(),
-        )
+
         return routed_input, num_tokens_per_expert_group
 
     @staticmethod
@@ -170,7 +172,7 @@ def _partition_fn(name, mod, device_mesh):
     # performing all-to-all combine on the output
     def _token_combine(self, mod, routed_output, device_mesh):
         # For a2a combine, input splits and output splits are opposite of a2a dispatch.
-        routed_output = self.a2a_func(
+        routed_output = self.a2a_combine_func(
             routed_output,
             self.input_splits,
             self.output_splits,
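
To summarize the shape of this change, here is a minimal, self-contained sketch with hypothetical names (not torchtitan code): each direction resolves its own all-to-all callable, and the combine call passes the split lists in the opposite order from dispatch, as the comment in the diff notes.

from typing import Callable, Literal

A2AImpl = Literal["default", "mxfp8"]

def default_a2a(tokens, output_splits, input_splits, group):
    """Stand-in for the unquantized all-to-all (all_to_all_single_autograd in the diff)."""
    raise NotImplementedError

def mxfp8_a2a(tokens, output_splits, input_splits, group):
    """Stand-in for the quantize -> all-to-all -> dequantize path (to_mxfp8_a2a_dequant in the diff)."""
    raise NotImplementedError

_IMPLS: dict[A2AImpl, Callable] = {"default": default_a2a, "mxfp8": mxfp8_a2a}

class SplitA2A:
    """One all-to-all callable per direction, resolved independently from config."""

    def __init__(self, dispatch_impl: A2AImpl = "default", combine_impl: A2AImpl = "default"):
        self.dispatch_func = _IMPLS[dispatch_impl]
        self.combine_func = _IMPLS[combine_impl]

    def dispatch(self, tokens, input_splits, output_splits, group):
        # Dispatch: send input_splits[i] tokens to rank i, receive output_splits[i] from rank i.
        return self.dispatch_func(tokens, output_splits, input_splits, group)

    def combine(self, tokens, input_splits, output_splits, group):
        # Combine reverses the dispatch, so the split lists swap roles.
        return self.combine_func(tokens, input_splits, output_splits, group)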

torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 14 additions & 4 deletions
@@ -107,7 +107,8 @@ def parallelize_llama(
                 else None
             ),
             etp_enabled=parallel_dims.etp_enabled,
-            a2a_impl=job_config.parallelism.expert_parallel_a2a_impl,
+            a2a_dispatch_impl=job_config.parallelism.expert_parallel_a2a_dispatch_impl,
+            a2a_combine_impl=job_config.parallelism.expert_parallel_a2a_combine_impl,
         )
 
     model_compile_enabled = (
@@ -439,8 +440,11 @@ def apply_moe_ep_tp(
     ep_mesh: DeviceMesh | None,
     ep_tp_mesh: DeviceMesh | None,
     etp_enabled: bool,
-    a2a_impl: str = "default",
+    a2a_dispatch_impl: str = "default",
+    a2a_combine_impl: str = "default",
 ):
+    logger.info(f"Using all-to-all dispatch: {a2a_dispatch_impl}")
+    logger.info(f"Using all-to-all combine: {a2a_combine_impl}")
     for transformer_block in model.layers.values():
         if not transformer_block.moe_enabled:
             continue
@@ -489,13 +493,19 @@ def apply_moe_ep_tp(
         elif tp_mesh is None:
             experts_mesh = ep_mesh
             # input / output sharding on the batch / tokens dim
-            experts_plan = ExpertParallel(a2a_impl=a2a_impl)
+            experts_plan = ExpertParallel(
+                a2a_dispatch_impl=a2a_dispatch_impl,
+                a2a_combine_impl=a2a_combine_impl,
+            )
         elif etp_enabled:
             experts_mesh = ep_tp_mesh
             experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh)
         else:
             experts_mesh = ep_mesh
-            experts_plan = ExpertParallel(a2a_impl=a2a_impl)
+            experts_plan = ExpertParallel(
+                a2a_dispatch_impl=a2a_dispatch_impl,
+                a2a_combine_impl=a2a_combine_impl,
+            )
 
         parallelize_module(
             module=transformer_block.moe.experts,
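
End to end, the two job-config fields flow into the ExpertParallel plan roughly as below. This is a sketch only, assuming an already-initialized expert-parallel DeviceMesh (`ep_mesh`), a loaded `job_config`, and a `transformer_block` with MoE experts in scope.

from torch.distributed.tensor.parallel import parallelize_module

from torchtitan.distributed.expert_parallel import ExpertParallel

# Build one plan with independently chosen dispatch/combine all-to-all implementations...
experts_plan = ExpertParallel(
    a2a_dispatch_impl=job_config.parallelism.expert_parallel_a2a_dispatch_impl,
    a2a_combine_impl=job_config.parallelism.expert_parallel_a2a_combine_impl,
)

# ...and apply it to the MoE experts over the expert-parallel mesh.
parallelize_module(
    module=transformer_block.moe.experts,
    device_mesh=ep_mesh,
    parallelize_plan=experts_plan,
)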
