
Commit d7600ec

Update limits
Signed-off-by: ilmarkov <[email protected]>
1 parent 3db307d commit d7600ec

File tree: 6 files changed, +156 -93 lines changed

benchmarks/kernels/benchmark_fused_collective.py

Lines changed: 55 additions & 49 deletions
@@ -64,12 +64,13 @@
 FP8_DTYPE = current_platform.fp8_dtype()
 MiB = 1024 * 1024
 
-# FlashInfer max sizes per world size (from collective_fusion.py)
+# FlashInfer max sizes per world size
+# Enable 64MB for 2, 4, 8 world sizes to verify large input sizes
+# use --disable-oneshot to disable oneshot mode for very large input sizes
 _FI_MAX_SIZES = {
     2: 64 * MiB,  # 64MB
-    4: 32 * MiB,  # 32MB
-    6: 32 * MiB,  # 32MB
-    8: 32 * MiB,  # 32MB
+    4: 64 * MiB,  # 64MB
+    8: 64 * MiB,  # 64MB
 }
 
 # Global workspace tensor for FlashInfer
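Note: the benchmark now admits inputs up to 64 MiB for TP 2, 4, and 8. A minimal sketch of how a candidate (num_tokens, hidden_size) shape compares against these caps; the helper name fits_flashinfer and the element_size argument are illustrative, not part of the benchmark itself:

MiB = 1024 * 1024
_FI_MAX_SIZES = {2: 64 * MiB, 4: 64 * MiB, 8: 64 * MiB}

def fits_flashinfer(world_size, num_tokens, hidden_size, element_size=2):
    # Input bytes = tokens * hidden dim * bytes per element (2 for bf16).
    size = num_tokens * hidden_size * element_size
    limit = _FI_MAX_SIZES.get(world_size)
    return limit is not None and size <= limit

print(fits_flashinfer(8, 4096, 8192))  # True: exactly 64 MiB, right at the cap
print(fits_flashinfer(8, 8192, 8192))  # False: 128 MiB exceeds the cap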
@@ -186,7 +187,7 @@ def flashinfer_fused_allreduce_rmsnorm(
         allreduce_out=None,
         quant_out=None,
         scale_out=None,
-        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
         scale_factor=None,
         use_oneshot=use_oneshot,
         **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
@@ -228,7 +229,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
         allreduce_out=None,
         quant_out=quant_out,
         scale_out=None,
-        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
         scale_factor=scale_factor,
         use_oneshot=use_oneshot,
         **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
@@ -271,7 +272,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
         allreduce_out=None,
         quant_out=quant_out,
         scale_out=output_scale,
-        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
         scale_factor=input_global_scale,
         use_oneshot=use_oneshot,
         **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
@@ -579,6 +580,7 @@ def run_benchmarks(
     use_residual: bool,
     allreduce_params: Optional[FlashInferFusedAllReduceParams],
     quant_mode: str = "all",
+    disable_oneshot: bool = False,
 ):
     """Run all benchmarks for given configuration.
 
@@ -638,17 +640,18 @@ def run_benchmarks(
     # FlashInfer Fused AllReduce + RMSNorm Oneshot
     if flashinfer_comm is not None and allreduce_params is not None:
         try:
-            time_ms = benchmark_operation(
-                flashinfer_fused_allreduce_rmsnorm,
-                input_tensor,
-                residual=residual,
-                norm_out=norm_out,
-                rms_gamma=rms_gamma,
-                rms_eps=rms_eps,
-                allreduce_params=allreduce_params,
-                use_oneshot=True,
-            )
-            results["flashinfer_fused_allreduce_rmsnorm_oneshot"] = time_ms
+            if not disable_oneshot:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm,
+                    input_tensor,
+                    residual=residual,
+                    norm_out=norm_out,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    allreduce_params=allreduce_params,
+                    use_oneshot=True,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_oneshot"] = time_ms
         except Exception as e:
             logger.error("FlashInfer Fused AllReduce+RMSNorm Oneshot failed: %s", e)
             results["flashinfer_fused_allreduce_rmsnorm_oneshot"] = float("inf")
@@ -712,21 +715,22 @@ def run_benchmarks(
     # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
     if flashinfer_comm is not None and allreduce_params is not None:
         try:
-            time_ms = benchmark_operation(
-                flashinfer_fused_allreduce_rmsnorm_fp8_quant,
-                input_tensor,
-                norm_out=norm_out,
-                residual=residual,
-                rms_gamma=rms_gamma,
-                rms_eps=rms_eps,
-                scale_factor=scale_fp8,
-                quant_out=quant_out_fp8,
-                allreduce_params=allreduce_params,
-                use_oneshot=True,
-            )
-            results["flashinfer_fused_allreduce_rmsnorm_fp8_quant_oneshot"] = (
-                time_ms
-            )
+            if not disable_oneshot:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm_fp8_quant,
+                    input_tensor,
+                    norm_out=norm_out,
+                    residual=residual,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    scale_factor=scale_fp8,
+                    quant_out=quant_out_fp8,
+                    allreduce_params=allreduce_params,
+                    use_oneshot=True,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp8_quant_oneshot"] = (
+                    time_ms
+                )
         except Exception as e:
             logger.error(
                 "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
@@ -802,22 +806,23 @@ def run_benchmarks(
     # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
     if flashinfer_comm is not None and allreduce_params is not None:
         try:
-            time_ms = benchmark_operation(
-                flashinfer_fused_allreduce_rmsnorm_fp4_quant,
-                input_tensor,
-                residual=residual,
-                norm_out=norm_out,
-                rms_gamma=rms_gamma,
-                rms_eps=rms_eps,
-                input_global_scale=scale_fp4,
-                allreduce_params=allreduce_params,
-                quant_out=fp4_quant_out,
-                output_scale=fp4_output_scale,
-                use_oneshot=True,
-            )
-            results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_oneshot"] = (
-                time_ms
-            )
+            if not disable_oneshot:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                    input_tensor,
+                    residual=residual,
+                    norm_out=norm_out,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    input_global_scale=scale_fp4,
+                    allreduce_params=allreduce_params,
+                    quant_out=fp4_quant_out,
+                    output_scale=fp4_output_scale,
+                    use_oneshot=True,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_oneshot"] = (
+                    time_ms
+                )
         except Exception as e:
             logger.error(
                 "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
@@ -1224,6 +1229,7 @@ def main():
                 use_residual,
                 allreduce_params,
                 quant_mode=quant_mode,
+                disable_oneshot=args.disable_oneshot,
             )
 
             # Store results for markdown export
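For reference, a minimal argparse stub showing how a --disable-oneshot flag would map onto the args.disable_oneshot attribute used above; the real benchmark defines many more options, so this is only an assumption about the flag wiring:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--disable-oneshot", action="store_true",
                    help="Skip one-shot FlashInfer variants for very large inputs")
args = parser.parse_args(["--disable-oneshot"])
print(args.disable_oneshot)  # True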

vllm/compilation/collective_fusion.py

Lines changed: 33 additions & 11 deletions
@@ -20,7 +20,8 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape)
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils import (_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES,
+                        direct_register_custom_op, flashinfer_max_size)
 from .inductor_pass import enable_fake_mode
 from .vllm_inductor_pass import VllmInductorPass
 
@@ -439,6 +440,23 @@ def call_trtllm_fused_allreduce_norm(
     scale_out: Optional[torch.Tensor] = None,
     scale_factor: Optional[torch.Tensor] = None,
 ) -> None:
+    num_tokens, hidden_size = allreduce_in.shape
+    element_size = allreduce_in.element_size()
+    current_tensor_size = num_tokens * hidden_size * element_size
+    max_tensor_size = max_token_num * hidden_size * element_size
+    assert current_tensor_size <= max_tensor_size, \
+        f"Current tensor size {current_tensor_size} is larger than " \
+        f"max token num {max_token_num} * hidden size {hidden_size} * " \
+        f"element size {element_size}"
+    device_capability = current_platform.get_device_capability(
+    ).as_version_str()
+    max_sizes = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES.get(device_capability, {})
+    # Get one shot input size limit for the current world size
+    max_one_shot_size = max_sizes.get(world_size, None)
+    # Use one shot if no max size is specified
+    use_oneshot = max_one_shot_size is None or \
+        current_tensor_size <= max_one_shot_size
+
     assert (
         _FI_WORKSPACE_TENSOR
         is not None), "Flashinfer must be enabled when using flashinfer"
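A condensed sketch of the one-shot dispatch added above. The per-capability table values here are hypothetical; the real _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES lives in vllm.utils and is not shown in this diff:

MiB = 1024 * 1024

_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES = {
    "9.0": {2: 32 * MiB, 4: 4 * MiB, 8: 1 * MiB},  # hypothetical values
}

def pick_use_oneshot(device_capability, world_size, tensor_bytes):
    # Default to one-shot when no cap is registered, otherwise compare sizes,
    # mirroring the use_oneshot expression in the diff.
    max_sizes = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES.get(device_capability, {})
    max_one_shot = max_sizes.get(world_size)
    return max_one_shot is None or tensor_bytes <= max_one_shot

print(pick_use_oneshot("9.0", 8, 512 * 1024))  # True: under the 1 MiB cap
print(pick_use_oneshot("9.0", 8, 8 * MiB))     # False: falls back to two-shot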
@@ -465,7 +483,7 @@ def call_trtllm_fused_allreduce_norm(
             hidden_dim=allreduce_in.shape[-1],
             workspace_ptrs=_FI_WORKSPACE_TENSOR,
             launch_with_pdl=launch_with_pdl,
-            use_oneshot=True,
+            use_oneshot=use_oneshot,
             trigger_completion_at_end=trigger_completion_at_end,
             fp32_acc=fp32_acc,
             pattern_code=pattern_code,
@@ -1458,24 +1476,28 @@ def __init__(self, config: VllmConfig):
                 "Flashinfer is not installed or comm module not found, "
                 "skipping allreduce fusion pass")
             return
-        # Check if the world size is supported
-        if self.tp_size not in _FI_MAX_SIZES:
+        max_size = flashinfer_max_size(self.tp_size, config)
+        if max_size is None:
+            # Flashinfer doesn't support current world size
             logger.warning(
                 "Flashinfer allreduce fusion is not "
                 "supported for world size %s",
                 self.tp_size,
             )
             return
-        max_num_token = min(
-            _FI_MAX_SIZES.get(self.tp_size, _DEFAULT_FI_MAX_SIZE) //
-            (self.hidden_dim * self.tp_size * (4 if use_fp32_lamport else 2)),
-            config.compilation_config.pass_config.
-            fi_allreduce_fusion_max_token_num)
+        element_size = 4 if use_fp32_lamport else 2
+        max_token_num = (max_size //
+                         (self.hidden_dim * element_size))
+        # take the min to save workspace size and we'll never use more
+        # than max_num_batched_tokens anyways
+        max_token_num = min(max_token_num,
+                            config.scheduler_config.max_num_batched_tokens)
+
         self.ipc_handles, workspace_tensor = (
             flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
                 tp_rank=rank,
                 tp_size=self.tp_size,
-                max_token_num=max_num_token,
+                max_token_num=max_token_num,
                 hidden_dim=self.hidden_dim,
                 group=self.group,
                 use_fp32_lamport=use_fp32_lamport,
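Rough numbers for the workspace sizing above, assuming a 32 MiB per-world-size cap and an 8192 hidden size (both figures are for illustration only):

MiB = 1024 * 1024
max_size = 32 * MiB                 # what flashinfer_max_size() might return
hidden_dim = 8192
use_fp32_lamport = False
element_size = 4 if use_fp32_lamport else 2

max_token_num = max_size // (hidden_dim * element_size)
print(max_token_num)                # 2048 tokens fit in the workspace

max_num_batched_tokens = 8192       # scheduler budget, illustrative
max_token_num = min(max_token_num, max_num_batched_tokens)
print(max_token_num)                # still 2048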
@@ -1487,7 +1509,7 @@ def __init__(self, config: VllmConfig):
             rank=rank,
             world_size=self.tp_size,
             use_fp32_lamport=use_fp32_lamport,
-            max_token_num=max_num_token,
+            max_token_num=max_token_num,
         )
         is_custom_ops = ("+rms_norm" in config.compilation_config.custom_ops,
                          "+quant_fp8" in config.compilation_config.custom_ops)

vllm/config/__init__.py

Lines changed: 11 additions & 11 deletions
@@ -49,10 +49,8 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-from vllm.utils import (_DEFAULT_FI_ALLREDUCE_MAX_INPUT_SIZE,
-                        _FI_ALLREDUCE_MAX_INPUT_SIZES,
-                        DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
-                        LazyLoader, common_broadcastable_dtype, random_uuid)
+from vllm.utils import (LayerBlockType, LazyLoader, common_broadcastable_dtype,
+                        flashinfer_max_size, random_uuid)
 
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
@@ -3879,13 +3877,15 @@ def _set_compile_ranges(self):
         # Add the compile ranges for flashinfer
         if compilation_config.pass_config.enable_fi_allreduce_fusion:
             tp_size = self.parallel_config.tensor_parallel_size
-            max_size = _FI_ALLREDUCE_MAX_INPUT_SIZES.get(
-                tp_size, _DEFAULT_FI_ALLREDUCE_MAX_INPUT_SIZE)
-            max_token_num = max_size // (self.model_config.get_hidden_size() *
-                                         self.model_config.dtype.itemsize)
-            # We add 1 because the bounds checks in the compiler are exclusive
-            # and we want to include the max_token_num in the compile range
-            computed_compile_ranges_split_points.append(max_token_num + 1)
+            max_size = flashinfer_max_size(tp_size, self)
+            if max_size is not None:
+                max_token_num = max_size // (
+                    self.model_config.get_hidden_size() *
+                    self.model_config.dtype.itemsize)
+                # We add 1 because the bounds checks in the compiler are
+                # exclusive and we want to include the max_token_num in the
+                # compile range
+                computed_compile_ranges_split_points.append(max_token_num + 1)
 
         if compilation_config.compile_ranges_split_points is not None:
             for x in compilation_config.compile_ranges_split_points:
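Worked example of the compile-range split point, using an assumed 32 MiB threshold and a 4096-wide bf16 model:

max_size = 32 * 1024 * 1024   # assumed flashinfer_max_size() result
hidden_size = 4096
itemsize = 2                  # bf16 element width in bytes

max_token_num = max_size // (hidden_size * itemsize)
split_point = max_token_num + 1    # +1: compiler bounds checks are exclusive
print(max_token_num, split_point)  # 4096 4097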

vllm/config/compilation.py

Lines changed: 8 additions & 2 deletions
@@ -87,8 +87,14 @@ class PassConfig:
     """Whether to enable async TP."""
     enable_fi_allreduce_fusion: bool = False
     """Whether to enable flashinfer allreduce fusion."""
-    fi_allreduce_fusion_max_token_num: int = 16384
-    """Max number of tokens to used in flashinfer allreduce fusion."""
+    fi_allreduce_fusion_max_size_mb: dict[int,
+                                          float] = field(default_factory=dict)
+    """The thresholds of the communicated tensor sizes under which
+    vllm should use flashinfer fused allreduce. Specified as a
+    dictionary mapping each world size to the threshold in MB
+    { <world size>: <max size in mb> }
+    Unspecified world sizes will fallback to
+    { 2: 32, 4: 32, 8: 2 }"""
 
     # TODO(luka) better pass enabling system.
 
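A hedged sketch of setting the new knob programmatically; the exact construction path may differ, so treat the imports and constructor call as illustrative:

from vllm.config import CompilationConfig, PassConfig

# World sizes left out of the dict fall back to {2: 32, 4: 32, 8: 2} (MB).
compilation_config = CompilationConfig(
    pass_config=PassConfig(
        enable_fi_allreduce_fusion=True,
        fi_allreduce_fusion_max_size_mb={2: 64.0, 4: 32.0, 8: 2.0},
    ),
)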

vllm/envs.py

Lines changed: 0 additions & 11 deletions
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-import json
 import os
 import sys
 import tempfile
@@ -1059,16 +1058,6 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
     lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
 
-    # Specifies the thresholds of the communicated tensor sizes under which
-    # vllm should use flashinfer fused allreduce. The variable should be a
-    # JSON with the following format:
-    # { <world size>: <max size in mb> }
-    # Unspecified world sizes will fallback to
-    # { 2: 64, 4: 1, <everything else>: 0.5 }
-    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
-    lambda: json.loads(os.getenv(
-        "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")),
-
     # MoE routing strategy selector.
     # See `RoutingSimulator.get_available_strategies()` # for available
     # strategies.