 from torch import nn
 from transformers import FalconH1Config

+from vllm import envs
 from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
...
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsV0Only)
+from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -85,6 +85,7 @@ def __init__(
         config: FalconH1Config,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
@@ -107,6 +108,8 @@ def __init__(
             activation=config.hidden_act,
             quant_config=quant_config,
             use_rms_norm=config.mamba_rms_norm,
+            prefix=f"{prefix}.mixer",
+            chunk_size=config.mamba_chunk_size,
         )
         # n_groups is overridden later by `MambaMixer2`
         self.groups_time_state_size = self.mamba.n_groups * config.mamba_d_state
@@ -316,18 +319,26 @@ def __init__(
         prefix: str = "",
     ) -> None:
         super().__init__()
+
         # Instantiate the attention branch
         self.self_attn = FalconH1AttentionDecoderLayer(
             config=config,
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=prefix,
         )
+
+        # In V1, every attention/SSM layer must have
+        # a distinct index in its prefix
+        ssm_layer_idx = config.num_hidden_layers + layer_idx
+        ssm_prefix = prefix.split(".")[0] + f".{ssm_layer_idx}"
+
         # Instantiate the SSM branch
         self.mamba = FalconH1SSMDecoderLayer(
             config=config,
             cache_config=cache_config,
             quant_config=quant_config,
+            prefix=ssm_prefix,
         )
         self.ssm_out_multiplier = config.ssm_out_multiplier
         self.ssm_in_multiplier = config.ssm_in_multiplier
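
A quick worked example of the prefix arithmetic above (illustrative only; the exact prefix string is produced by the model's layer construction and is an assumption here): with num_hidden_layers = 36 and layer_idx = 3, the attention branch keeps the ordinary per-layer prefix while the SSM branch is registered under an index shifted past the last attention layer, so the two branches never share a layer index in V1.

    num_hidden_layers = 36                    # hypothetical Falcon-H1 config value
    layer_idx = 3
    prefix = f"model.layers.{layer_idx}"      # hypothetical per-layer prefix

    ssm_layer_idx = num_hidden_layers + layer_idx            # 36 + 3 = 39
    ssm_prefix = prefix.split(".")[0] + f".{ssm_layer_idx}"  # -> "model.39"

    print(prefix, ssm_prefix)                 # model.layers.3 model.39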
@@ -452,10 +463,16 @@ def forward(
         # proper continuous batching computation including
         # chunked prefill
         attn_metadata = get_forward_context().attn_metadata
-        mamba2_metadata = prepare_mamba2_metadata(
-            chunk_size=self.config.mamba_chunk_size,
-            attn_metadata=attn_metadata,
-        )
+
+        if not envs.VLLM_USE_V1:
+            mamba2_metadata = prepare_mamba2_metadata(
+                chunk_size=self.config.mamba_chunk_size,
+                attn_metadata=attn_metadata,
+            )
+        else:
+            # In V1, mamba2_metadata is taken from the forward context
+            mamba2_metadata = None
+
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds * self.embedding_multiplier
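
Since the branch above keys off envs.VLLM_USE_V1, the code path can be selected with the VLLM_USE_V1 environment variable before the engine is built. A minimal usage sketch, assuming a Falcon-H1 checkpoint id (placeholder, not part of this diff):

    import os
    os.environ["VLLM_USE_V1"] = "1"  # "0" keeps the V0 path that calls prepare_mamba2_metadata

    from vllm import LLM, SamplingParams

    llm = LLM(model="tiiuae/Falcon-H1-1B-Instruct")  # placeholder model id
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)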
@@ -468,7 +485,9 @@ def forward(
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
-            layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i)
+            layer_mamba_cache_params = None
+            if mamba_cache_params:
+                layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i)
             hidden_states = layer(
                 positions=positions,
                 hidden_states=hidden_states,
@@ -484,7 +503,7 @@ def forward(
 class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                          IsHybrid, SupportsV0Only):
+                          IsHybrid):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -558,15 +577,19 @@ def forward(
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ):
-        if self.mamba_cache is None:
-            self.mamba_cache = MambaCacheManager(
-                self.vllm_config,
-                self.lm_head.weight.dtype
-                if hasattr(self.lm_head, 'weight') else torch.bfloat16,
-                self.config.num_hidden_layers,
-                *self._get_mamba_cache_shape(),
-            )
-        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+
+        mamba_cache_params = None
+        if not envs.VLLM_USE_V1:
+            if self.mamba_cache is None:
+                self.mamba_cache = MambaCacheManager(
+                    self.vllm_config,
+                    self.lm_head.weight.dtype if hasattr(
+                        self.lm_head, 'weight') else torch.bfloat16,
+                    self.config.num_hidden_layers,
+                    *self._get_mamba_cache_shape(),
+                )
+            mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+
         hidden_states = self.model(
             input_ids,
             positions,
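
The dtype fallback inside the V0-only block above covers the case where lm_head carries no weight on the current pipeline-parallel rank (e.g. it is a PPMissingLayer placeholder). A minimal sketch of that pattern, using a stand-in placeholder class (an assumption for illustration, not the vLLM class):

    import torch
    from torch import nn

    class _MissingLMHead(nn.Module):
        # stand-in for an lm_head that lives on another PP rank
        pass

    def mamba_cache_dtype(lm_head: nn.Module) -> torch.dtype:
        # mirrors the expression used when building MambaCacheManager
        return lm_head.weight.dtype if hasattr(lm_head, "weight") else torch.bfloat16

    print(mamba_cache_dtype(nn.Linear(8, 8)))   # the real weight's dtype (float32 by default)
    print(mamba_cache_dtype(_MissingLMHead()))  # torch.bfloat16 fallback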