Commit d69b094

tdoublep authored and epwalsh committed
[V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (vllm-project#23716)
Signed-off-by: Thomas Parnell <[email protected]>
1 parent 778ead5 commit d69b094

File tree

2 files changed (+11, -8 lines changed)


docs/usage/v1_guide.md

Lines changed: 6 additions & 4 deletions
@@ -107,14 +107,16 @@ to enable simultaneous generation and embedding using the same engine instance i
 #### Mamba Models
 
 Models using selective state-space mechanisms instead of standard transformer attention are supported.
-Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1.
+Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported.
+Please note that prefix caching is not yet supported for these models.
 
 Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
-`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that
-these models currently require disabling prefix caching in V1.
+`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
+Please note that prefix caching is not yet supported for these models.
 
 Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
-Please note that these models currently require disabling prefix caching and enforcing eager mode in V1.
+Please note that prefix caching is not yet supported for these models.
+It is also necessary to enforce eager mode for these models in V1.
 
 #### Encoder-Decoder Models
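The updated guide leaves one manual step for the MiniMax-style hybrids: eager mode still needs to be enforced in V1. As a hedged illustration only (the model name below is a placeholder, not taken from this commit), enforcing eager mode through the offline `LLM` entrypoint might look like this:

```python
from vllm import LLM, SamplingParams

# Illustrative sketch: load a hybrid model with eager mode enforced, as the
# updated guide recommends for V1. The model name is a placeholder. Prefix
# caching no longer has to be disabled by hand, since this commit turns it
# off automatically for hybrid or mamba-based models.
llm = LLM(
    model="MiniMaxAI/MiniMax-Text-01",  # placeholder hybrid model
    enforce_eager=True,                 # per the guide, still needed for these models
)

outputs = llm.generate(["Hybrid attention/state-space models"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```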

vllm/model_executor/models/config.py

Lines changed: 5 additions & 4 deletions
@@ -292,12 +292,13 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             return
 
         model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
         compilation_config = vllm_config.compilation_config
 
-        model_cls, _ = ModelRegistry.resolve_model_cls(
-            model_config.architecture,
-            model_config=model_config,
-        )
+        # TODO(tdoublep): remove once prefix caching is enabled
+        cache_config.enable_prefix_caching = False
+        logger.info("Hybrid or mamba-based model detected: disabling prefix "
+                    "caching since it is not yet supported.")
 
         # TODO(tdoublep): remove as full cuda graph support is added
         FCG_NOT_SUPPORTED_MODELS = [
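In user-facing terms, the effect of this change is that the prefix-caching knob no longer has to be set by hand for these models. A rough sketch, using a placeholder Mamba checkpoint that is not referenced in this commit:

```python
from vllm import LLM

# Before this commit, the V1 guide asked users to disable prefix caching
# explicitly when running Mamba-based models.
llm = LLM(
    model="state-spaces/mamba-2.8b-hf",  # placeholder Mamba model
    enable_prefix_caching=False,
)

# After this commit, the config-verification step shown in the diff above
# disables prefix caching automatically and logs why, so the plain call works.
llm = LLM(model="state-spaces/mamba-2.8b-hf")
```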

0 commit comments
