Merged
Commits
47 commits
ae47519
init phi4mm multimodal processor
Isotr0py Mar 25, 2025
1a3e9c5
remove unused func
Isotr0py Mar 25, 2025
41e00f6
make image inference work
Isotr0py Mar 26, 2025
373d0a8
image work
Isotr0py Mar 27, 2025
a3f9725
fix multi images
Isotr0py Mar 27, 2025
83ce87c
init v1
Isotr0py Mar 27, 2025
20fa915
v1 image work
Isotr0py Mar 27, 2025
6feca07
make audio run
Isotr0py Mar 29, 2025
70478c8
fix
Isotr0py Mar 30, 2025
fbe07ff
fix audio correctness
Isotr0py Mar 30, 2025
49fb233
fix multi audios
Isotr0py Mar 30, 2025
51dde9c
fix resampling
Isotr0py Mar 30, 2025
f071581
fix resampling
Isotr0py Mar 30, 2025
d665855
fix audio diff
Isotr0py Mar 30, 2025
f63d7c2
unpad audio features
Isotr0py Apr 1, 2025
83c08fc
fix v1 audio
Isotr0py Apr 1, 2025
1b9f027
clean legacy code
Isotr0py Apr 1, 2025
e44b883
Merge remote-tracking branch 'upstream/main' into phi-4-mm-refactor
Isotr0py Apr 1, 2025
76f8b8e
clean up
Isotr0py Apr 1, 2025
7cd6f4a
Merge branch 'vllm-project:main' into phi-4-mm-refactor
Isotr0py Apr 2, 2025
fb6b659
clean up
Isotr0py Apr 2, 2025
9fab0e4
minor refactor
Isotr0py Apr 2, 2025
660cfd7
minor fix
Isotr0py Apr 2, 2025
341a8f9
code format
Isotr0py Apr 2, 2025
335a29e
refactor audio resample
Isotr0py Apr 2, 2025
e755e6b
minor refactor audio encoder
Isotr0py Apr 2, 2025
5714c18
increase test max_model_len
Isotr0py Apr 3, 2025
d3dd9e0
add processor tests
Isotr0py Apr 3, 2025
5a505b8
revert unnecessary changes
Isotr0py Apr 3, 2025
a020575
Merge branch 'vllm-project:main' into phi-4-mm-refactor
Isotr0py Apr 3, 2025
11bb0a9
Merge branch 'vllm-project:main' into phi-4-mm-refactor
Isotr0py Apr 6, 2025
b40b458
add scipy to doc requirement
Isotr0py Apr 6, 2025
4f3049d
fix doc build
Isotr0py Apr 6, 2025
af3a239
Merge branch 'vllm-project:main' into phi-4-mm-refactor
Isotr0py Apr 8, 2025
6cce3fe
init vision speech test
Isotr0py Apr 8, 2025
a54dae3
make vision speech test passed
Isotr0py Apr 8, 2025
516d9da
fix ultravox test import
Isotr0py Apr 9, 2025
faa14d5
Fix online inference
DarkLight1337 Apr 9, 2025
d01cafa
Merge branch 'main' into phi-4-mm-refactor
DarkLight1337 Apr 10, 2025
0f6859e
Merge branch 'main' into phi-4-mm-refactor
DarkLight1337 Apr 10, 2025
3b284e5
Merge branch 'main' into phi-4-mm-refactor
Isotr0py Apr 11, 2025
5ddf574
expose dynamic_hd
Isotr0py Apr 14, 2025
3e2aae1
Merge branch 'main' into phi-4-mm-refactor
Isotr0py Apr 14, 2025
bf8a340
Merge branch 'main' into phi-4-mm-refactor
DarkLight1337 Apr 15, 2025
e9724c8
reduce max_model_len in example to fit single gpu
Isotr0py Apr 15, 2025
9dfbf86
Merge branch 'main' into phi-4-mm-refactor
Isotr0py Apr 19, 2025
11be486
update profiler and fix ultravox tests
Isotr0py Apr 19, 2025
2 changes: 1 addition & 1 deletion docs/source/models/supported_models.md
@@ -1004,7 +1004,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `microsoft/Phi-4-multimodal-instruct`, etc.
* ✅︎
*
- *
+ * ✅︎
- * `PixtralForConditionalGeneration`
* Pixtral
* T + I<sup>+</sup>
2 changes: 1 addition & 1 deletion examples/offline_inference/audio_language.py
@@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=4096,
+ max_model_len=12800,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
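For context, a minimal offline-inference sketch of how engine args like these are consumed. The asset name, prompt wording, and the omission of the speech LoRA (which the real example enables via enable_lora/max_lora_rank) are illustrative assumptions, not part of the PR:

```python
from dataclasses import asdict

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset

# Mirror of the configuration above; the larger max_model_len leaves room for
# the audio placeholder tokens plus the generated text.
engine_args = EngineArgs(
    model="microsoft/Phi-4-multimodal-instruct",
    trust_remote_code=True,
    max_model_len=12800,
    max_num_seqs=2,
    limit_mm_per_prompt={"audio": 1},
    # The real example also loads the model's speech LoRA adapter
    # (enable_lora=True, max_lora_rank=320); omitted here for brevity.
)
llm = LLM(**asdict(engine_args))

audio, sr = AudioAsset("mary_had_lamb").audio_and_sample_rate
prompt = "<|user|><|audio_1|>Transcribe the audio clip.<|end|><|assistant|>"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"audio": [(audio, sr)]}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```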
5 changes: 4 additions & 1 deletion examples/offline_inference/vision_language.py
@@ -814,10 +814,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=4096,
+ max_model_len=5120,
max_num_seqs=2,
max_num_batched_tokens=12800,
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1},
)

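As the new comment notes, mm_processor_kwargs can also be supplied per request rather than only at engine construction. A hedged sketch, assuming `llm` was built from engine args like the ones above:

```python
from vllm import SamplingParams
from vllm.assets.image import ImageAsset

# dynamic_hd bounds how many crops the processor tiles the image into, so a
# lower per-request value means fewer image placeholder tokens for this call.
image = ImageAsset("stop_sign").pil_image
outputs = llm.generate(
    {
        "prompt": "<|user|><|image_1|>What does the sign say?<|end|><|assistant|>",
        "multi_modal_data": {"image": image},
        "mm_processor_kwargs": {"dynamic_hd": 4},
    },
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```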
4 changes: 3 additions & 1 deletion examples/offline_inference/vision_language_multi_image.py
@@ -503,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=10000,
+ max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 4},
)

placeholders = "".join(f"<|image_{i}|>"
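Phi-4-multimodal numbers its image placeholders from 1, so the (truncated) join above expands as in this small sketch; the URLs are placeholders for illustration only:

```python
image_urls = [
    "https://example.com/first.jpg",   # illustrative URLs only
    "https://example.com/second.jpg",
]
placeholders = "".join(f"<|image_{i}|>"
                       for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>{placeholders}What is the difference between the images?<|end|><|assistant|>"
# -> "<|user|><|image_1|><|image_2|>What is the difference ...<|end|><|assistant|>"
```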
1 change: 1 addition & 0 deletions requirements/docs.txt
@@ -18,6 +18,7 @@ transformers
mistral_common >= 1.5.4
aiohttp
starlette
scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
27 changes: 18 additions & 9 deletions tests/models/decoder_only/audio_language/test_ultravox.py
@@ -1,14 +1,14 @@
# SPDX-License-Identifier: Apache-2.0

import json
- from typing import Optional
+ from typing import Any, Optional

import numpy as np
import pytest
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer

- from vllm.multimodal.audio import resample_audio
+ from vllm.multimodal.audio import resample_audio_librosa
from vllm.sequence import SampleLogprobs

from ....conftest import HfRunner, VllmRunner
@@ -43,6 +43,18 @@ def audio(request):
return AudioAsset(request.param)


def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
"""Convert kwargs to CLI args."""
args = []
for key, value in params_kwargs.items():
if isinstance(value, bool):
if value:
args.append(f"--{key.replace('_','-')}")
else:
args.append(f"--{key.replace('_','-')}={value}")
return args
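A quick illustration of the new helper with hypothetical kwargs, assuming the function above is in scope: True booleans become bare flags, False booleans are dropped, and everything else becomes a --key=value pair.

```python
# Hypothetical kwargs, purely for illustration.
example_kwargs = {
    "enforce_eager": True,       # True boolean -> bare flag
    "disable_log_stats": False,  # False boolean -> omitted entirely
    "max_num_seqs": 2,           # anything else -> --key=value
}
assert params_kwargs_to_cli_args(example_kwargs) == [
    "--enforce-eager",
    "--max-num-seqs=2",
]
```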


@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
@@ -52,10 +64,7 @@ def server(request, audio_assets):
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
- ] + [
- f"--{key.replace('_','-')}={value}"
- for key, value in request.param.items()
- ]
+ ] + params_kwargs_to_cli_args(request.param)

with RemoteOpenAIServer(MODEL_NAME,
args,
@@ -136,9 +145,9 @@ def run_test(
[hf_prompt],
max_tokens,
num_logprobs=num_logprobs,
- audios=[(resample_audio(audio[0],
- orig_sr=audio[1],
- target_sr=16000), 16000)])
+ audios=[(resample_audio_librosa(audio[0],
+ orig_sr=audio[1],
+ target_sr=16000), 16000)])
for _, hf_prompt, audio in prompts_and_audios
]

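The resampler keeps its keyword sample-rate arguments under the new librosa-specific name (presumably to leave room for a scipy-based variant, which would also explain the scipy docs requirement added above). A small self-contained sketch:

```python
import numpy as np

from vllm.multimodal.audio import resample_audio_librosa

# Downsample one second of a 440 Hz tone from 44.1 kHz to the 16 kHz rate the
# audio encoder expects, using the same keyword arguments as the test above.
orig_sr, target_sr = 44100, 16000
t = np.linspace(0, 1, orig_sr, endpoint=False)
tone = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

resampled = resample_audio_librosa(tone, orig_sr=orig_sr, target_sr=target_sr)
print(resampled.shape)  # roughly (16000,)
```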
6 changes: 3 additions & 3 deletions tests/models/decoder_only/vision_language/test_phi4mm.py
@@ -181,7 +181,7 @@ def patch_hf_processor(*args,
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [4096])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@@ -225,7 +225,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [10000])
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@@ -258,7 +258,7 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,

@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [10000])
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -274,6 +274,7 @@ def _test_processing_correctness_mistral(
"nvidia/NVLM-D-72B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"Qwen/Qwen-VL-Chat",
59 changes: 59 additions & 0 deletions tests/models/multimodal/processing/test_phi4mm.py
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for phi4mm's multimodal preprocessing kwargs."""
import pytest

from vllm.multimodal import MULTIMODAL_REGISTRY

from ....conftest import _ImageAssets
from ...utils import build_model_context


@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"dynamic_hd": 4}, 1329),
({"dynamic_hd": 16}, 4433),
# the default num_crops of phi-4-multimodal is 36
({}, 9585),
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly."""
# Avoid initializing CUDA early
from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID

ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"

image_size = ctx.get_hf_config(
).embd_layer["image_embd_layer"]["crop_size"]
dummy_image_size = (image_size * 7, image_size * 7)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}

processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)

# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(
_IMAGE_PLACEHOLDER_TOKEN_ID)
assert img_tok_count == expected_toks_per_img * num_imgs
7 changes: 2 additions & 5 deletions vllm/entrypoints/chat_utils.py
@@ -482,11 +482,8 @@ def _placeholder_str(self, modality: ModalityStr,
if modality in ("image", "image_embeds"):
if model_type == "chatglm":
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer
if model_type in ("phi3_v", "phi4mm"):
return f"<|image_{current_count}|>"
if model_type == "phi4mm":
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"):
return "(<image>./</image>)"
if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
@@ -522,7 +519,7 @@ def _placeholder_str(self, modality: ModalityStr,
if model_type == "ultravox":
return "<|audio|>"
if model_type == "phi4mm":
return "<|endoftext11|>" # 200011 (see vocab.json in hf model)
return f"<|audio_{current_count}|>"
if model_type in ("qwen2_audio", "qwen2_5_omni"):
return (f"Audio {current_count}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
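With numbered placeholders, online serving no longer needs the special-cased <|endoftext10|>/<|endoftext11|> strings. A hedged sketch of an OpenAI-compatible request against a locally served model; the server address and image URL are illustrative:

```python
from openai import OpenAI

# Assumes a server started along the lines of:
#   vllm serve microsoft/Phi-4-multimodal-instruct --trust-remote-code ...
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="microsoft/Phi-4-multimodal-instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": "https://example.com/cherry_blossom.jpg"}},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    }],
    max_tokens=64,
)
# The chat template now expands the image content part into "<|image_1|>".
print(response.choices[0].message.content)
```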
2 changes: 1 addition & 1 deletion vllm/model_executor/models/phi3v.py
@@ -327,7 +327,7 @@ def get_num_image_tokens(
*,
image_width: int,
image_height: int,
- processor: Optional[ProcessorMixin],
+ processor: Optional[ProcessorMixin] = None,
) -> int:
if processor is None:
processor = self.get_hf_processor()
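With the new default, callers can query image token counts without threading an HF processor through. A rough sketch; the repo-relative test import, trust_remote_code handling, and example dimensions are assumptions:

```python
from tests.models.utils import build_model_context  # test helper, path assumed
from vllm.model_executor.models.phi3v import Phi3VProcessingInfo

ctx = build_model_context("microsoft/Phi-3.5-vision-instruct")
info = Phi3VProcessingInfo(ctx)

# `processor` can now be omitted; get_hf_processor() is called lazily instead.
num_tokens = info.get_num_image_tokens(image_width=1344, image_height=1008)
print(num_tokens)
```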