Merged
33 commits
a3b89cc
v1: Add Whisper encoder-decoder model support
russellb Jul 2, 2025
e689784
prevent voxtral from being detected as encoder-decoder
russellb Aug 29, 2025
a0aeea2
Drop whisper-small from tests
russellb Aug 30, 2025
d2b51e1
Use correct number of encoder tokens in attention metadata
russellb Aug 30, 2025
a57ee42
Force spawn multiproc method in whisper example
russellb Aug 31, 2025
44193e5
Warn if Whisper is used without spawn
russellb Aug 31, 2025
21e5a90
Run whisper test with spawn multiproc method
russellb Aug 31, 2025
8ecc6c7
whisper: simplify encoder attention by using pytorch
russellb Aug 15, 2025
35b2125
Limit whisper encoder concurrency
russellb Sep 4, 2025
2423682
Drop EncoderAttention abstractions no longer needed
russellb Sep 4, 2025
dec14fe
remove debug logs
russellb Sep 4, 2025
b9b228b
encoder-decoder does not use encoder-cache yet
russellb Sep 4, 2025
be9add9
Simplify and reduce duplication in TorchAttention
russellb Sep 5, 2025
7474c67
Use existing MultiHeadAttention instead of new TorchAttention
russellb Sep 6, 2025
bf4c7c1
Remove some unnecessary variables that were not used
russellb Sep 8, 2025
f99f5d7
Remove unused TorchAttention
russellb Sep 9, 2025
5c858d5
Replace a slow Python loop with torch
russellb Sep 9, 2025
11b9b9e
Move max_seq_len override into CrossAttentionBuilder
russellb Sep 9, 2025
a875bfc
Move seq_lens / seq_lens_cpu overrides into CrossAttentionBuilder
russellb Sep 9, 2025
8c7176f
Revert unnecessary code move to reduce diff size
russellb Sep 9, 2025
7304b3e
further simplification of slot_mappings for cross attn
russellb Sep 9, 2025
83fd244
move slot mapping calculation back into CrossAttentionBuilder
russellb Sep 9, 2025
04cf403
improve how we get the number of blocks needed
russellb Sep 9, 2025
431db03
make python loop more efficient
russellb Sep 9, 2025
279c0b0
remove leftover docstring addition
russellb Sep 10, 2025
d466655
Restore TODO that was accidentally removed
russellb Sep 10, 2025
bc7277e
remove variables no longer needed
russellb Sep 10, 2025
1c28542
Ensure AttentionMetadataBuilder subclasses call parent constructor
russellb Sep 10, 2025
4b31447
remove old param from docstring
russellb Sep 10, 2025
8305707
move MultiHeadAttention changes into whisper.py
russellb Sep 10, 2025
3c10b1f
use vllm.utils.cdiv
russellb Sep 10, 2025
7444d37
minor refactoring
russellb Sep 10, 2025
4afbdcb
revert unnecessary changes left over
russellb Sep 10, 2025
6 changes: 3 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -321,7 +321,6 @@ steps:
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
@@ -644,7 +643,7 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -818,7 +817,8 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
2 changes: 2 additions & 0 deletions examples/offline_inference/encoder_decoder.py
@@ -5,6 +5,8 @@
encoder/decoder models, specifically BART and mBART.

This script is refactored to allow model selection via command-line arguments.

NOTE: This example is not yet supported in V1.
"""

import argparse
3 changes: 3 additions & 0 deletions examples/offline_inference/encoder_decoder_multimodal.py
@@ -5,6 +5,7 @@
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""

import os
import time
from collections.abc import Sequence
from dataclasses import asdict
@@ -130,6 +131,8 @@ def run_mllama():


def run_whisper():
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

engine_args = EngineArgs(
model="openai/whisper-large-v3-turbo",
max_model_len=448,
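For context on the `run_whisper()` change above, here is a minimal offline sketch of the same pattern: select the spawn multiprocessing method before the engine is built, then transcribe through the implicit prompt format. The `AudioAsset` helper and the prompt layout are assumptions borrowed from the existing example script, not additions made in this PR.

```python
import os

# Whisper is known to hang with forked workers, so select spawn before
# constructing anything that starts worker processes.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset  # assumed helper used by the examples

llm = LLM(
    model="openai/whisper-large-v3-turbo",
    max_model_len=448,  # decoder context length used in the example above
    limit_mm_per_prompt={"audio": 1},
)

prompt = {
    "prompt": "<|startoftranscript|>",
    "multi_modal_data": {
        "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
    },
}

outputs = llm.generate([prompt], SamplingParams(temperature=0, max_tokens=200))
print(outputs[0].outputs[0].text)
```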
1 change: 1 addition & 0 deletions tests/encoder_decoder/test_e2e_correctness.py
@@ -63,6 +63,7 @@ def clear_cache():
current_platform.is_cpu(),
reason="CPU backend is not currently supported with encoder/decoder models"
)
@pytest.mark.skip(reason="bart not supported in V1")
def test_encoder_decoder_e2e(
hf_runner,
vllm_runner,
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_encoder_decoder.py
@@ -30,6 +30,7 @@ async def client(server):

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="bart is not yet supported in V1")
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
2 changes: 2 additions & 0 deletions tests/models/language/generation/test_bart.py
@@ -178,6 +178,7 @@ def run_test(
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.skip(reason="bart not supported in V1")
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:

@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
@pytest.mark.skip(reason="bart not supported in V1")
def test_models_distributed(hf_runner, vllm_runner,
example_encoder_decoder_prompts,
distributed_executor_backend, model, dtype,
3 changes: 1 addition & 2 deletions tests/models/multimodal/generation/test_whisper.py
@@ -122,8 +122,7 @@ def run_test(


@pytest.mark.core_model
@pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model) -> None:
run_test(
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,6 +31,7 @@

ARCH_TO_SKIP = {
"MolmoForCausalLM": "incompatible requirements",
"Florence2ForConditionalGeneration": "not supported in V1",
}
ARCH_NEEDS_EXTRAS = [
"InternVLChatModel",
6 changes: 6 additions & 0 deletions tests/models/test_initialization.py
@@ -68,6 +68,12 @@ def _initialize_kv_caches_v1(self, vllm_config):
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
if model_arch == "Florence2ForConditionalGeneration":
# An encoder-decoder model that's V0-only. Just skip it
# since V0 is about to be removed.
pytest.skip("Skipping Florence2ForConditionalGeneration")
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
LLM(
model_info.default,
tokenizer=model_info.tokenizer,
1 change: 0 additions & 1 deletion tests/v1/test_oracle.py
@@ -10,7 +10,6 @@
from vllm.engine.async_llm_engine import AsyncLLMEngine

UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
]

160 changes: 160 additions & 0 deletions vllm/attention/layers/cross_attention.py
@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
from copy import copy
from typing import Optional

import numpy as np
import torch
from transformers import CacheConfig

from vllm import envs
from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadata, AttentionType)
from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils import cdiv
from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
subclass_attention_backend)
from vllm.v1.kv_cache_interface import CrossAttentionSpec

logger = init_logger(__name__)


def _get_max_encoder_len(vllm_config: VllmConfig) -> int:
return MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(
vllm_config.model_config)


def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,
block_table_tensor: torch.Tensor,
kv_cache_spec: CrossAttentionSpec,
device: torch.device) -> torch.Tensor:
"""Get cross-attention slot mappings."""

block_size = kv_cache_spec.block_size
slot_mappings = []

# Find indices with non-zero encoder sequence lengths
# The majority of parallel requests will be running the
# decoder, so this list should be relatively small.
active_indices = np.nonzero(encoder_seq_lens)[0]

for req_index in active_indices:
encoder_seq_len = encoder_seq_lens[req_index].item()

# Calculate the number of blocks needed for this request
num_blocks_needed = cdiv(encoder_seq_len, block_size)

# Get the block IDs for this request from the tensor
req_block_ids = block_table_tensor[req_index]

# Get only the blocks we need (first num_blocks_needed blocks)
needed_block_ids = req_block_ids[:num_blocks_needed]

# All needed blocks are allocated
i_values = torch.arange(encoder_seq_len,
dtype=torch.int64,
device=device)
block_indices = i_values // block_size
block_offsets = i_values % block_size
block_numbers = needed_block_ids[block_indices]
slot_mapping = block_numbers * block_size + block_offsets

slot_mappings.append(slot_mapping)

if slot_mappings:
return torch.cat(slot_mappings)
else:
return torch.empty(0, dtype=torch.int64, device=device)


@functools.lru_cache
def create_cross_attention_backend(
underlying_attn_backend: AttentionBackend, ) -> type[AttentionBackend]:
prefix = "CrossAttention_"
underlying_builder = underlying_attn_backend.get_builder_cls()

class CrossAttentionBuilder(underlying_builder): # type: ignore

def build(self,
common_prefix_len: int,
common_attn_metadata: CommonAttentionMetadata,
fast_build: bool = False) -> AttentionMetadata:
new_metadata = copy(common_attn_metadata)
new_metadata.causal = False
max_encoder_len = _get_max_encoder_len(self.vllm_config)
new_metadata.max_seq_len = max_encoder_len

new_metadata.seq_lens = torch.full(
(new_metadata.num_reqs, ),
max_encoder_len,
dtype=torch.int32,
device=self.device,
)
new_metadata.seq_lens_cpu = torch.full(
(new_metadata.num_reqs, ),
max_encoder_len,
dtype=torch.int32,
device="cpu",
)
new_metadata.slot_mapping = _get_cross_slot_mapping(
new_metadata.encoder_seq_lens, new_metadata.block_table_tensor,
self.kv_cache_spec, self.device)
return super().build(common_prefix_len, new_metadata, fast_build)

attn_backend = subclass_attention_backend(
name_prefix=prefix,
attention_backend_cls=underlying_attn_backend,
builder_cls=CrossAttentionBuilder)

return attn_backend


class CrossAttention(Attention):
"""
Cross-attention for encoder-decoder models.
Handles attention between decoder queries and encoder keys/values.
"""

def __init__(self,
num_heads: int,
head_size: int,
scale: float,
cache_config: Optional[CacheConfig] = None,
attn_type: Optional[str] = None,
**kwargs):
dtype = torch.get_default_dtype()

if cache_config is not None:
kv_cache_dtype = cache_config.cache_dtype
block_size = cache_config.block_size
else:
kv_cache_dtype = "auto"
block_size = 16

if envs.VLLM_USE_V1:
underlying_attn_backend = get_attn_backend(head_size, dtype,
kv_cache_dtype,
block_size)

attn_backend = create_cross_attention_backend(
underlying_attn_backend)
else:
# in v0 cross attention is handled inside the backends
attn_backend = None

if attn_type is not None:
assert attn_type == AttentionType.ENCODER_DECODER, (
"CrossAttention only supports AttentionType.ENCODER_DECODER")

super().__init__(num_heads=num_heads,
head_size=head_size,
scale=scale,
cache_config=cache_config,
attn_backend=attn_backend,
attn_type=AttentionType.ENCODER_DECODER,
**kwargs)
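As a sanity check on the slot-mapping arithmetic in `_get_cross_slot_mapping` above, here is a standalone sketch on invented toy inputs: each encoder token of a request with a non-zero encoder length lands at `block_id * block_size + offset`, and decode-only requests contribute nothing.

```python
import numpy as np
import torch

block_size = 4
encoder_seq_lens = np.array([6, 0, 3])        # request 1 is decode-only
block_table = torch.tensor([[10, 11, 0],      # toy block IDs per request
                            [0, 0, 0],
                            [12, 0, 0]])

slot_mappings = []
for req_index in np.nonzero(encoder_seq_lens)[0]:
    seq_len = int(encoder_seq_lens[req_index])
    num_blocks = -(-seq_len // block_size)    # cdiv
    needed = block_table[req_index, :num_blocks]
    i = torch.arange(seq_len, dtype=torch.int64)
    slot_mappings.append(needed[i // block_size] * block_size + i % block_size)

print(torch.cat(slot_mappings))
# tensor([40, 41, 42, 43, 44, 45, 48, 49, 50])
```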
37 changes: 28 additions & 9 deletions vllm/config/__init__.py
@@ -8,6 +8,7 @@
import hashlib
import inspect
import json
import os
import textwrap
import warnings
from collections.abc import Mapping
@@ -41,6 +42,7 @@
from vllm.config.utils import ConfigType, config
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.platforms import current_platform
from vllm.transformers_utils.config import (
ConfigFormat, get_config, get_hf_image_processor_config,
@@ -3512,16 +3514,33 @@ def __post_init__(self):

disable_chunked_prefill_reasons: list[str] = []

if self.model_config and self.model_config.pooler_config:
pooling_type = self.model_config.pooler_config.pooling_type
if pooling_type is None or pooling_type.lower() != "last":
disable_chunked_prefill_reasons.append(
"Only \"last\" pooling supports chunked "
"prefill and prefix caching; disabling both.")
elif not getattr(self.model_config.hf_config, "is_causal", True):
if self.model_config:
if self.model_config.pooler_config:
pooling_type = self.model_config.pooler_config.pooling_type
if pooling_type is None or pooling_type.lower() != "last":
disable_chunked_prefill_reasons.append(
"Only \"last\" pooling supports chunked "
"prefill and prefix caching; disabling both.")
elif self.model_config.is_encoder_decoder:
self.scheduler_config.max_num_encoder_input_tokens = \
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
logger.debug(
"Encoder-decoder model detected: setting "
"`max_num_encoder_input_tokens` to encoder length (%s)",
self.scheduler_config.max_num_encoder_input_tokens)
self.scheduler_config.disable_chunked_mm_input = True
disable_chunked_prefill_reasons.append(
"Only models using causal attention supports chunked "
"prefill and prefix caching; disabling both.")
"Encoder-decoder models do not support chunked prefill nor"
" prefix caching; disabling both.")
if (self.model_config.architecture
== "WhisperForConditionalGeneration"
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
!= "spawn"):
logger.warning(
"Whisper is known to have issues with "
"forked workers. If startup is hanging, "
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
"to 'spawn'.")

if disable_chunked_prefill_reasons:
for reason in disable_chunked_prefill_reasons:
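The encoder-decoder branch above pins the scheduler's `max_num_encoder_input_tokens` to the model's fixed encoder length. As a rough illustration only (the 1500-position figure is the usual Whisper encoder output for a 30-second window, and the 16-slot block size is the fallback used in `cross_attention.py`; neither value is asserted by this diff), that length also bounds the per-request cross-attention KV allocation:

```python
from vllm.utils import cdiv

max_encoder_len = 1500  # assumed Whisper encoder length for one 30 s window
block_size = 16         # fallback block size from CrossAttention above

# Blocks needed to hold one request's cross-attention KV cache.
print(cdiv(max_encoder_len, block_size))  # -> 94
```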
1 change: 0 additions & 1 deletion vllm/model_executor/models/voxtral.py
@@ -600,7 +600,6 @@ def __init__(
self.whisper_encoder = WhisperEncoder(vllm_config=vllm_config,
prefix=maybe_prefix(
prefix, "whisper_encoder"),
is_standalone_encoder=True,
init_in_fp32=True)
mel_filters = mel_filter_bank(
num_frequency_bins=1 + self.config.window_size // 2,