16 changes: 8 additions & 8 deletions docs/source/contributing/model/multimodal.md
@@ -860,8 +860,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
)
```

To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
with different `full` and `feature` attributes:
To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:

```python
hf_config = self.info.get_hf_config()
@@ -879,9 +879,9 @@ def get_replacement_fuyu(item_idx: int):
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows

return PromptUpdateDetails(
full=image_tokens + [bos_token_id],
features=image_tokens,
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID,
)
```

@@ -914,9 +914,9 @@ def _get_prompt_updates(
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows

return PromptUpdateDetails(
full=image_tokens + [bos_token_id],
features=image_tokens,
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID,
)

return [
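The two hunks above replace the explicit `full`/`features` pair with `PromptUpdateDetails.select_token_id`, which marks only the positions holding the embed token id as embedding targets. A minimal sketch of that assumed behaviour — a hypothetical helper with illustrative token ids, not the actual vLLM implementation:

```python
# Hypothetical sketch of the assumed select_token_id semantics; the real logic
# lives in vllm.multimodal.processing.PromptUpdateDetails.
def embed_mask(full_token_ids: list[int], embed_token_id: int) -> list[bool]:
    # True at positions that should be overwritten by vision embeddings.
    return [tok == embed_token_id for tok in full_token_ids]

# Illustrative ids: two image tokens, one newline, one trailing BOS.
_IMAGE_TOKEN_ID, _NEWLINE_TOKEN_ID, bos_token_id = 71011, 71019, 1
full = [_IMAGE_TOKEN_ID] * 2 + [_NEWLINE_TOKEN_ID, bos_token_id]
assert embed_mask(full, _IMAGE_TOKEN_ID) == [True, True, False, False]
```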
3 changes: 0 additions & 3 deletions docs/source/models/supported_models.md
@@ -989,9 +989,6 @@ See [this page](#generative-models) for more information on how to use generativ
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.

:::{important}
To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers`.

Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
:::
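For reference, the pan-and-scan option from the note above can also be set in offline inference. A minimal sketch, assuming the `mm_processor_kwargs` argument of `vllm.LLM` and an illustrative Gemma3 checkpoint name:

```python
# Minimal offline-inference sketch; the model name is illustrative, and per the
# note above the option is only honored on V0.
from vllm import LLM

llm = LLM(
    model="google/gemma-3-4b-it",
    mm_processor_kwargs={"do_pan_and_scan": True},
)
```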
2 changes: 1 addition & 1 deletion examples/offline_inference/audio_language.py
@@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=5,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
)

5 changes: 4 additions & 1 deletion tests/models/decoder_only/audio_language/test_ultravox.py
@@ -55,7 +55,10 @@ def server(request, audio_assets):
for key, value in request.param.items()
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
yield remote_server


2 changes: 1 addition & 1 deletion tests/models/decoder_only/vision_language/test_models.py
@@ -167,7 +167,7 @@
"cherry_blossom": "<image>What is the season?", # noqa: E501
}),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
max_model_len=8192,
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}
26 changes: 10 additions & 16 deletions tests/models/decoder_only/vision_language/test_pixtral.py
@@ -176,6 +176,8 @@ def test_chat(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
@@ -198,22 +200,14 @@


@large_gpu_test(min_gb=48)
@pytest.mark.parametrize(
"prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{
"offset": 11,
"length": 494
}]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{
"offset": 11,
"length": 266
}, {
"offset": 277,
"length": 1056
}, {
"offset": 1333,
"length": 418
}])])
@pytest.mark.parametrize("prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]),
[PlaceholderRange(offset=11, length=494)]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [
PlaceholderRange(offset=11, length=266),
PlaceholderRange(offset=277, length=1056),
PlaceholderRange(offset=1333, length=418)
])])
def test_multi_modal_placeholders(vllm_runner, prompt,
expected_ranges: list[PlaceholderRange],
monkeypatch) -> None:
4 changes: 2 additions & 2 deletions tests/models/multimodal/processing/test_llava_next.py
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
first_placeholder = image_placeholders[0]

# NOTE: There is a BOS token
assert first_placeholder["offset"] == 1
assert first_placeholder["length"] == (
assert first_placeholder.offset == 1
assert first_placeholder.length == (
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs

except Exception as exc:
4 changes: 2 additions & 2 deletions tests/models/multimodal/processing/test_llava_onevision.py
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(

first_placeholder = image_placeholders[0]

assert first_placeholder["offset"] == 0
assert first_placeholder["length"] == len(
assert first_placeholder.offset == 0
assert first_placeholder.length == len(
processed_inputs["prompt_token_ids"]) // num_imgs
except Exception as exc:
failed_size_excs.append((image_size, exc))
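The two test hunks above switch from dict-style lookups (`first_placeholder["offset"]`) to attribute access, i.e. placeholder entries are now objects rather than plain dicts. A stand-in sketch of the shape these assertions assume; the real class is `vllm.multimodal.inputs.PlaceholderRange`:

```python
# Hypothetical stand-in used only to illustrate the attribute-style access the
# tests now expect; not the actual vLLM definition.
from dataclasses import dataclass

@dataclass
class PlaceholderRange:
    offset: int  # index of the first placeholder token in the prompt
    length: int  # number of placeholder tokens

first_placeholder = PlaceholderRange(offset=1, length=576)  # illustrative values
assert first_placeholder.offset == 1
assert first_placeholder.length == 576
```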
4 changes: 3 additions & 1 deletion tests/models/registry.py
@@ -277,7 +277,9 @@ def check_available_online(
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible."), # noqa: E501
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
trust_remote_code=True),
9 changes: 9 additions & 0 deletions tests/multimodal/test_processing.py
@@ -785,6 +785,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=6,
tokens=[32000, 32000],
is_embed=None,
),
],
"pattern_4": [
@@ -793,6 +794,7 @@
item_idx=0,
start_idx=3,
tokens=[32000],
is_embed=None,
),
],
}
@@ -807,12 +809,14 @@
item_idx=0,
start_idx=1,
tokens=[32000, 32000],
is_embed=None,
),
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=5,
tokens=[32000, 32000],
is_embed=None,
),
],
"pattern_3": [
@@ -821,6 +825,7 @@
item_idx=0,
start_idx=7,
tokens=[1550, 918, 1550],
is_embed=None,
),
],
# No match for pattern_4 as it has lower priority than pattern_1
@@ -835,12 +840,14 @@
item_idx=0,
start_idx=1,
tokens=[32000, 32000],
is_embed=None,
),
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=3,
tokens=[32000, 32000],
is_embed=None,
),
],
"pattern_4": [
@@ -849,6 +856,7 @@
item_idx=0,
start_idx=5,
tokens=[32000],
is_embed=None,
),
],
"pattern_3": [
Expand All @@ -857,6 +865,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=6,
tokens=[1550, 918, 1550],
is_embed=None,
),
],
}
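Each expected `PlaceholderFeaturesInfo` above now carries `is_embed=None`. Under the assumed semantics, `None` means every placeholder token is an embedding position, while a boolean mask restricts embeddings to selected positions; a hedged illustration of that reading:

```python
# Assumed interpretation of is_embed, for illustration only.
def effective_embed_mask(tokens: list[int],
                         is_embed: list[bool] | None) -> list[bool]:
    # None -> all placeholder tokens receive embeddings.
    return is_embed if is_embed is not None else [True] * len(tokens)

assert effective_embed_mask([32000, 32000], None) == [True, True]
```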
46 changes: 17 additions & 29 deletions tests/v1/core/test_kv_cache_utils.py
@@ -3,7 +3,7 @@
import pytest
import torch

from vllm.multimodal.inputs import MultiModalKwargs
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.utils import sha256
# disable yapf here as it formats differently than isort such that both fail
@@ -158,13 +158,10 @@ def test_generate_block_hash_extra_keys():
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(20)],
mm_positions=[{
"offset": 0,
"length": 5
}, {
"offset": 10,
"length": 5
}],
mm_positions=[
PlaceholderRange(offset=0, length=5),
PlaceholderRange(offset=10, length=5),
],
mm_hashes=["hash1", "hash2"],
)

@@ -222,13 +219,10 @@ def test_hash_request_tokens(hash_fn):
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)

@@ -253,25 +247,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1 = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)
request2 = make_request(
request_id=1,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash3", "hash2"],
)
block_size = 3