
Commit ca9f59e

Fix and add compile ranges test
Signed-off-by: ilmarkov <[email protected]>
1 parent d7600ec commit ca9f59e

File tree: 4 files changed, +94 -31 lines changed
Lines changed: 86 additions & 0 deletions (new file)
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch import nn
+from torch.library import Library
+
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+from vllm.forward_context import set_forward_context
+from vllm.utils import direct_register_custom_op
+
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
+BATCH_SIZE = 64
+MLP_SIZE = 128
+
+
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    out.copy_(q)
+    out += k
+    out += v
+
+
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
+@support_torch_compile
+class TestModel(nn.Module):
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = '',
+                 **kwargs) -> None:
+        super().__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + x
+        attn_output = torch.empty_like(x)
+        torch.ops.silly.attention(x, x, x, attn_output)
+        x = attn_output
+        x = x * 3
+        return x
+
+
+@torch.inference_mode
+def run_model(vllm_config: VllmConfig, model: nn.Module,
+              batch_sizes: list[int]):
+    with set_forward_context({}, vllm_config=vllm_config):
+        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
+        for batch_size in batch_sizes:
+            model(torch.randn(batch_size, MLP_SIZE).cuda())
+
+
+def test_compile_ranges():
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        compile_ranges_split_points=[8, 32],
+    ))
+
+    with set_current_vllm_config(vllm_config):
+        model = TestModel(vllm_config=vllm_config, prefix='').eval().cuda()
+        batch_sizes = [1, 16, 48]
+        # TestModel has support_torch_compile
+        with compilation_counter.expect(
+                num_graphs_seen=1,
+                num_piecewise_graphs_seen=1,
+                num_backend_compilations=4,
+                # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        ):
+            run_model(vllm_config, model, batch_sizes)
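
Aside (illustration, not part of this commit): with compile_ranges_split_points=[8, 32], the test drives one batch size into each bucket the split points induce, 1, 16, and 48, after the initial warm-up at BATCH_SIZE=64. Below is a minimal sketch of such a bucketing helper, assuming right-exclusive boundaries; compile_range_index is a hypothetical name, not a vLLM API, and vLLM's exact boundary semantics may differ.

# Illustration only: map a batch size to a compile-range bucket given split
# points such as [8, 32]. Assumes right-exclusive boundaries; not a vLLM API.
import bisect


def compile_range_index(batch_size: int, split_points: list[int]) -> int:
    # bisect_right([8, 32], 1) -> 0, 16 -> 1, 48 -> 2
    return bisect.bisect_right(split_points, batch_size)


if __name__ == "__main__":
    for size in (1, 16, 48, 64):  # sizes exercised by the test above
        print(size, "-> bucket", compile_range_index(size, [8, 32]))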

tests/compile/test_fusion_all_reduce.py

Lines changed: 4 additions & 4 deletions
@@ -27,7 +27,7 @@
 from .backend import TestBackend


-def finisher(hidden_states):
+def maybe_dummy_quant(hidden_states):
     custom_ops = get_current_vllm_config().compilation_config.custom_ops
     if not custom_ops or "+quant_fp8" not in custom_ops:
         # Hack: use dynamic fp8 quantization to
@@ -53,7 +53,7 @@ def forward(self, hidden_states, residual):

         hidden_states = self.norm(all_reduce)

-        hidden_states = finisher(hidden_states)
+        hidden_states = maybe_dummy_quant(hidden_states)

         return hidden_states

@@ -80,7 +80,7 @@ def forward(self, hidden_states, residual):
         # Hack: use dynamic fp8 quantization to
         # suppress torch.compile optimizations
         # that prevent pattern matching
-        hidden_states = finisher(hidden_states)
+        hidden_states = maybe_dummy_quant(hidden_states)
         return hidden_states, residual

     def ops_in_model_after(self):
@@ -122,7 +122,7 @@ def forward(self, hidden_states, residual):
         all_reduce = tensor_model_parallel_all_reduce(view)
         norm_output, residual_output = self.norm(all_reduce, residual)
         output, _ = self.quant_fp8(norm_output, self.scale)
-        hidden_states = finisher(output.to(hidden_states.dtype))
+        hidden_states = maybe_dummy_quant(output.to(hidden_states.dtype))
         return hidden_states, residual_output

     def ops_in_model_after(self):

vllm/compilation/collective_fusion.py

Lines changed: 0 additions & 26 deletions
@@ -10,7 +10,6 @@
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch.distributed._symmetric_memory import enable_symm_mem_for_group

-import vllm.envs as envs
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
@@ -398,31 +397,6 @@ def __call__(self, graph: fx.Graph):
 if flashinfer_comm is not None:
     _FI_WORKSPACE_TENSOR = None

-    MiB = 1024 * 1024
-    # Max size of the input tensor per world size
-    # to use flashinfer fused allreduce
-    _FI_MAX_SIZES = {
-        2: 64 * MiB,  # 64MB
-        4: MiB,  # 1MB
-        6: MiB // 2,  # 512KB
-        8: MiB // 2,  # 512KB
-    }
-
-    try:
-        _FI_MAX_SIZES.update({
-            int(k): int(float(v) * MiB)
-            for k, v in
-            envs.VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB.items()
-        })
-    except Exception as e:
-        raise ValueError(
-            "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: "
-            + str(e)) from e
-
-    # opt for a more conservative default value
-    # when world size is not in _FI_MAX_SIZES
-    _DEFAULT_FI_MAX_SIZE = MiB // 2
-
     def call_trtllm_fused_allreduce_norm(
             allreduce_in: torch.Tensor,
             residual: torch.Tensor,
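
Aside (illustration, not part of this commit): the block removed above implemented a per-world-size input-size cap for the flashinfer fused allreduce, with MiB overrides parsed from VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB and a conservative 512KB fallback. A compact sketch of that pattern follows; max_fused_allreduce_bytes is a hypothetical helper, not a vLLM API.

# Illustration only: per-world-size byte threshold with optional MiB overrides
# and a conservative fallback, mirroring the block removed above.
MiB = 1024 * 1024
_DEFAULT_MAX_SIZE = MiB // 2  # 512KB when the world size is unknown


def max_fused_allreduce_bytes(world_size: int,
                              overrides_mb: dict[str, str] | None = None) -> int:
    max_sizes = {2: 64 * MiB, 4: MiB, 6: MiB // 2, 8: MiB // 2}
    if overrides_mb:
        # e.g. {"4": "2"} raises the world-size-4 threshold to 2 MiB
        max_sizes.update(
            {int(k): int(float(v) * MiB) for k, v in overrides_mb.items()})
    return max_sizes.get(world_size, _DEFAULT_MAX_SIZE)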

vllm/utils/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -72,7 +72,6 @@

 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
-from vllm.platforms import current_platform
 from vllm.ray.lazy_utils import is_in_ray_actor

 if TYPE_CHECKING:
@@ -128,6 +127,10 @@ def flashinfer_max_size(world_size: int, config: VllmConfig) -> Optional[int]:
     allreduce fusion for the given world size. Falls back to
     conservative defaults if the world size is not specified in config.
     """
+
+    # import here to avoid circular dependencies
+    from vllm.platforms import current_platform
+
     device_capability = current_platform.get_device_capability(
     ).as_version_str()
     max_sizes = _FI_ALLREDUCE_MAX_INPUT_SIZES.get(device_capability, {})
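
Aside (illustration, not part of this commit): the hunk above moves the current_platform import from module scope into the function body, presumably because importing vllm.platforms eagerly would create an import cycle with vllm.utils. A minimal sketch of that deferred-import pattern, with generic names:

# Illustration only: a lazy, function-local import breaks an import-time cycle
# at the cost of resolving the module on first call.
def get_capability_key() -> str:
    # Imported here so that importing this module never pulls in
    # vllm.platforms eagerly.
    from vllm.platforms import current_platform
    return current_platform.get_device_capability().as_version_str()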
