From c97653d8a37b10cf645b2b2f01f58773e32229ee Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Wed, 2 Jul 2025 10:56:07 -0700 Subject: [PATCH 1/3] Remove global media connector. Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Chenheli Hua --- .../vision_language_embedding.py | 8 ++- .../vision_language_multi_image.py | 65 ++++++++++--------- tests/entrypoints/openai/test_audio.py | 7 +- tests/entrypoints/openai/test_video.py | 7 +- tests/entrypoints/openai/test_vision.py | 6 +- .../openai/test_vision_embedding.py | 6 +- tests/v1/tpu/test_multimodal.py | 7 +- vllm/multimodal/utils.py | 9 --- 8 files changed, 64 insertions(+), 51 deletions(-) diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 9451825f0b73..75199127a9c5 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -15,9 +15,11 @@ from PIL.Image import Image from vllm import LLM, EngineArgs -from vllm.multimodal.utils import fetch_image +from vllm.multimodal.utils import MediaConnector from vllm.utils import FlexibleArgumentParser +OFFLINE_MEDIA_CONNECTOR = MediaConnector() + class TextQuery(TypedDict): modality: Literal["text"] @@ -114,7 +116,7 @@ def get_query(modality: QueryModality): if modality == "image": return ImageQuery( modality="image", - image=fetch_image( + image=OFFLINE_MEDIA_CONNECTOR.fetch_image( "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 ), ) @@ -123,7 +125,7 @@ def get_query(modality: QueryModality): return TextImageQuery( modality="text+image", text="A cat standing in the snow.", - image=fetch_image( + image=OFFLINE_MEDIA_CONNECTOR.fetch_image( "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 ), ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 13af8e904194..e7f25d849559 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -17,7 +17,7 @@ from vllm import LLM, EngineArgs, SamplingParams from vllm.lora.request import LoRARequest -from vllm.multimodal.utils import fetch_image +from vllm.multimodal.utils import MediaConnector from vllm.utils import FlexibleArgumentParser QUESTION = "What is the content of each image?" 
@@ -35,6 +35,7 @@ "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", ] +OFFLINE_MEDIA_CONNECTOR = MediaConnector() class ModelRequestData(NamedTuple): @@ -70,7 +71,7 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -103,7 +104,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -126,7 +127,7 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -160,7 +161,7 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -193,7 +194,7 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -221,7 +222,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -249,7 +250,7 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -285,7 +286,7 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -319,7 +320,7 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -352,7 +353,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -385,7 +386,7 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + 
image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -419,7 +420,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -451,7 +452,7 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - image_data = [fetch_image(url) for url in image_urls] + image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -491,7 +492,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -513,7 +514,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -533,7 +534,7 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -563,7 +564,7 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -593,7 +594,7 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -615,7 +616,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -648,7 +649,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], ) @@ -680,7 +681,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], lora_requests=[LoRARequest("vision", 1, vision_lora_path)], ) @@ -723,7 +724,7 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], + image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], chat_template=chat_template, ) @@ -768,7 +769,7 @@ def load_qwen2_vl(question: str, image_urls: 
list[str]) -> ModelRequestData: ) if smart_resize is None: - image_data = [fetch_image(url) for url in image_urls] + image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] else: def post_process_image(image: Image) -> Image: @@ -778,7 +779,10 @@ def post_process_image(image: Image) -> Image: ) return image.resize((resized_width, resized_height)) - image_data = [post_process_image(fetch_image(url)) for url in image_urls] + image_data = [ + post_process_image(OFFLINE_MEDIA_CONNECTOR.fetch_image(url)) + for url in image_urls + ] return ModelRequestData( engine_args=engine_args, @@ -826,7 +830,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) if smart_resize is None: - image_data = [fetch_image(url) for url in image_urls] + image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] else: def post_process_image(image: Image) -> Image: @@ -836,7 +840,10 @@ def post_process_image(image: Image) -> Image: ) return image.resize((resized_width, resized_height)) - image_data = [post_process_image(fetch_image(url)) for url in image_urls] + image_data = [ + post_process_image(OFFLINE_MEDIA_CONNECTOR.fetch_image(url)) + for url in image_urls + ] return ModelRequestData( engine_args=engine_args, @@ -856,7 +863,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: ) prompt = f"USER: {'' * len(image_urls)}\n{question}\n ASSISTANT:" - image_data = [fetch_image(url) for url in image_urls] + image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -882,7 +889,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData: f"<|vision_end|>{question}<|im_end|>\n" "<|im_start|>assistant\n" ) - image_data = [fetch_image(url) for url in image_urls] + image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] return ModelRequestData( engine_args=engine_args, diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index d67c05ab3e8d..4d94fdf4d44d 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -8,7 +8,7 @@ import pytest_asyncio from vllm.assets.audio import AudioAsset -from vllm.multimodal.utils import encode_audio_base64, fetch_audio +from vllm.multimodal.utils import MediaConnector, encode_audio_base64 from ...utils import RemoteOpenAIServer @@ -19,6 +19,8 @@ ] MAXIMUM_AUDIOS = 2 +TEST_MEDIA_CONNECTOR = MediaConnector() + @pytest.fixture(scope="module") def server(): @@ -46,7 +48,8 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_audio() -> dict[str, str]: return { - audio_url: encode_audio_base64(*fetch_audio(audio_url)) + audio_url: + encode_audio_base64(*TEST_MEDIA_CONNECTOR.fetch_audio(audio_url)) for audio_url in TEST_AUDIO_URLS } diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index b68e08556ee9..3d75b8d0b339 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -7,7 +7,7 @@ import pytest import pytest_asyncio -from vllm.multimodal.utils import encode_video_base64, fetch_video +from vllm.multimodal.utils import MediaConnector, encode_video_base64 from ...utils import RemoteOpenAIServer @@ -21,6 +21,8 @@ "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4", ] +TEST_MEDIA_CONNECTOR = MediaConnector() + @pytest.fixture(scope="module") def server(): @@ -50,7 
+52,8 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_video() -> dict[str, str]: return { - video_url: encode_video_base64(fetch_video(video_url)[0]) + video_url: + encode_video_base64(TEST_MEDIA_CONNECTOR.fetch_video(video_url)[0]) for video_url in TEST_VIDEO_URLS } diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index fd613842f986..f07dad6d7e03 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -10,12 +10,13 @@ from PIL import Image from transformers import AutoProcessor -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import MediaConnector, encode_image_base64 from ...utils import RemoteOpenAIServer MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MAXIMUM_IMAGES = 2 +TEST_MEDIA_CONNECTOR = MediaConnector() # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ @@ -73,7 +74,8 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) + image_url: + encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url)) for image_url in TEST_IMAGE_URLS } diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index fe982e286ae4..79b77a7c77b4 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -9,12 +9,13 @@ from transformers import AutoProcessor from vllm.entrypoints.openai.protocol import EmbeddingResponse -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import MediaConnector, encode_image_base64 from ...utils import VLLM_PATH, RemoteOpenAIServer MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 +TEST_MEDIA_CONNECTOR = MediaConnector() vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" assert vlm2vec_jinja_path.exists() @@ -52,7 +53,8 @@ def server(): @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) + image_url: + encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url)) for image_url in TEST_IMAGE_URLS } diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index a61773a4f611..2afb4d41d3e2 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -5,12 +5,14 @@ import pytest from vllm import envs -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import MediaConnector, encode_image_base64 from vllm.platforms import current_platform from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS from ...utils import RemoteOpenAIServer +TEST_MEDIA_CONNECTOR = MediaConnector() + if not envs.VLLM_USE_V1: pytest.skip( "Skipping V1 tests. 
Rerun with `VLLM_USE_V1=1` to test.", @@ -21,7 +23,8 @@ @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) + image_url: + encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url)) for image_url in TEST_IMAGE_URLS } diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 2f2be59a1f42..a317ede401c0 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -277,15 +277,6 @@ def fetch_image_embedding( return image_embedding_io.load_base64("", data) -global_media_connector = MediaConnector() -"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector] -instance used by vLLM.""" - -fetch_audio = global_media_connector.fetch_audio -fetch_image = global_media_connector.fetch_image -fetch_video = global_media_connector.fetch_video - - def encode_audio_base64( audio: np.ndarray, sampling_rate: float, From efd7aa4cdb820d9b8b5064e3c0f5b068284f9444 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Thu, 3 Jul 2025 10:47:14 -0700 Subject: [PATCH 2/3] Address comments. Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Chenheli Hua --- .../vision_language_embedding.py | 8 +-- .../vision_language_multi_image.py | 65 +++++++++---------- tests/entrypoints/openai/test_audio.py | 7 +- tests/entrypoints/openai/test_video.py | 7 +- tests/entrypoints/openai/test_vision.py | 6 +- .../openai/test_vision_embedding.py | 6 +- tests/v1/tpu/test_multimodal.py | 7 +- vllm/multimodal/utils.py | 24 +++++++ 8 files changed, 66 insertions(+), 64 deletions(-) diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 75199127a9c5..9451825f0b73 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -15,11 +15,9 @@ from PIL.Image import Image from vllm import LLM, EngineArgs -from vllm.multimodal.utils import MediaConnector +from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser -OFFLINE_MEDIA_CONNECTOR = MediaConnector() - class TextQuery(TypedDict): modality: Literal["text"] @@ -116,7 +114,7 @@ def get_query(modality: QueryModality): if modality == "image": return ImageQuery( modality="image", - image=OFFLINE_MEDIA_CONNECTOR.fetch_image( + image=fetch_image( "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 ), ) @@ -125,7 +123,7 @@ def get_query(modality: QueryModality): return TextImageQuery( modality="text+image", text="A cat standing in the snow.", - image=OFFLINE_MEDIA_CONNECTOR.fetch_image( + image=fetch_image( "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 ), ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index e7f25d849559..13af8e904194 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -17,7 +17,7 @@ from vllm import LLM, EngineArgs, SamplingParams from vllm.lora.request import LoRARequest -from vllm.multimodal.utils import MediaConnector +from vllm.multimodal.utils import fetch_image from vllm.utils import FlexibleArgumentParser QUESTION = "What is the content of each image?" 
@@ -35,7 +35,6 @@ "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", ] -OFFLINE_MEDIA_CONNECTOR = MediaConnector() class ModelRequestData(NamedTuple): @@ -71,7 +70,7 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -104,7 +103,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -127,7 +126,7 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -161,7 +160,7 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -194,7 +193,7 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -222,7 +221,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -250,7 +249,7 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -286,7 +285,7 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -320,7 +319,7 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -353,7 +352,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -386,7 +385,7 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + 
image_data=[fetch_image(url) for url in image_urls], ) @@ -420,7 +419,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -452,7 +451,7 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] + image_data = [fetch_image(url) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -492,7 +491,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -514,7 +513,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -534,7 +533,7 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -564,7 +563,7 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -594,7 +593,7 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -616,7 +615,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -649,7 +648,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], ) @@ -681,7 +680,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: return ModelRequestData( engine_args=engine_args, prompt=prompt, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], lora_requests=[LoRARequest("vision", 1, vision_lora_path)], ) @@ -724,7 +723,7 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: engine_args=engine_args, prompt=prompt, stop_token_ids=stop_token_ids, - image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls], + image_data=[fetch_image(url) for url in image_urls], chat_template=chat_template, ) @@ -769,7 +768,7 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: 
) if smart_resize is None: - image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] + image_data = [fetch_image(url) for url in image_urls] else: def post_process_image(image: Image) -> Image: @@ -779,10 +778,7 @@ def post_process_image(image: Image) -> Image: ) return image.resize((resized_width, resized_height)) - image_data = [ - post_process_image(OFFLINE_MEDIA_CONNECTOR.fetch_image(url)) - for url in image_urls - ] + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -830,7 +826,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) if smart_resize is None: - image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] + image_data = [fetch_image(url) for url in image_urls] else: def post_process_image(image: Image) -> Image: @@ -840,10 +836,7 @@ def post_process_image(image: Image) -> Image: ) return image.resize((resized_width, resized_height)) - image_data = [ - post_process_image(OFFLINE_MEDIA_CONNECTOR.fetch_image(url)) - for url in image_urls - ] + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -863,7 +856,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: ) prompt = f"USER: {'' * len(image_urls)}\n{question}\n ASSISTANT:" - image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] + image_data = [fetch_image(url) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -889,7 +882,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData: f"<|vision_end|>{question}<|im_end|>\n" "<|im_start|>assistant\n" ) - image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls] + image_data = [fetch_image(url) for url in image_urls] return ModelRequestData( engine_args=engine_args, diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 4d94fdf4d44d..d67c05ab3e8d 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -8,7 +8,7 @@ import pytest_asyncio from vllm.assets.audio import AudioAsset -from vllm.multimodal.utils import MediaConnector, encode_audio_base64 +from vllm.multimodal.utils import encode_audio_base64, fetch_audio from ...utils import RemoteOpenAIServer @@ -19,8 +19,6 @@ ] MAXIMUM_AUDIOS = 2 -TEST_MEDIA_CONNECTOR = MediaConnector() - @pytest.fixture(scope="module") def server(): @@ -48,8 +46,7 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_audio() -> dict[str, str]: return { - audio_url: - encode_audio_base64(*TEST_MEDIA_CONNECTOR.fetch_audio(audio_url)) + audio_url: encode_audio_base64(*fetch_audio(audio_url)) for audio_url in TEST_AUDIO_URLS } diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 3d75b8d0b339..b68e08556ee9 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -7,7 +7,7 @@ import pytest import pytest_asyncio -from vllm.multimodal.utils import MediaConnector, encode_video_base64 +from vllm.multimodal.utils import encode_video_base64, fetch_video from ...utils import RemoteOpenAIServer @@ -21,8 +21,6 @@ "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4", ] -TEST_MEDIA_CONNECTOR = MediaConnector() - @pytest.fixture(scope="module") def server(): @@ -52,8 +50,7 @@ async def client(server): 
@pytest.fixture(scope="session") def base64_encoded_video() -> dict[str, str]: return { - video_url: - encode_video_base64(TEST_MEDIA_CONNECTOR.fetch_video(video_url)[0]) + video_url: encode_video_base64(fetch_video(video_url)[0]) for video_url in TEST_VIDEO_URLS } diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index f07dad6d7e03..fd613842f986 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -10,13 +10,12 @@ from PIL import Image from transformers import AutoProcessor -from vllm.multimodal.utils import MediaConnector, encode_image_base64 +from vllm.multimodal.utils import encode_image_base64, fetch_image from ...utils import RemoteOpenAIServer MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MAXIMUM_IMAGES = 2 -TEST_MEDIA_CONNECTOR = MediaConnector() # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ @@ -74,8 +73,7 @@ async def client(server): @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: return { - image_url: - encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url)) + image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS } diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 79b77a7c77b4..fe982e286ae4 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -9,13 +9,12 @@ from transformers import AutoProcessor from vllm.entrypoints.openai.protocol import EmbeddingResponse -from vllm.multimodal.utils import MediaConnector, encode_image_base64 +from vllm.multimodal.utils import encode_image_base64, fetch_image from ...utils import VLLM_PATH, RemoteOpenAIServer MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 -TEST_MEDIA_CONNECTOR = MediaConnector() vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" assert vlm2vec_jinja_path.exists() @@ -53,8 +52,7 @@ def server(): @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: return { - image_url: - encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url)) + image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS } diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index 2afb4d41d3e2..a61773a4f611 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -5,14 +5,12 @@ import pytest from vllm import envs -from vllm.multimodal.utils import MediaConnector, encode_image_base64 +from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.platforms import current_platform from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS from ...utils import RemoteOpenAIServer -TEST_MEDIA_CONNECTOR = MediaConnector() - if not envs.VLLM_USE_V1: pytest.skip( "Skipping V1 tests. 
Rerun with `VLLM_USE_V1=1` to test.", @@ -23,8 +21,7 @@ @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: return { - image_url: - encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url)) + image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS } diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 88daf1a9d817..5aca30d0b5d6 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -432,3 +432,27 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor, dim=0) vision_embeddings = vision_embeddings[:num_chunks, ...] return vision_embeddings + + +def fetch_audio( + audio_url: str, + media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, +) -> tuple[np.ndarray, Union[int, float]]: + media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + return media_connector.fetch_audio(audio_url) + + +def fetch_image( + image_url: str, + media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, +) -> Image.Image: + media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + return media_connector.fetch_image(image_url) + + +def fetch_video( + video_url: str, + media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, +) -> tuple[npt.NDArray, dict[str, Any]]: + media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + return media_connector.fetch_video(video_url) From 51b4c772b65907b740e89184721900826a666ffb Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Fri, 4 Jul 2025 08:01:47 -0700 Subject: [PATCH 3/3] Add docstring. Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Chenheli Hua --- vllm/multimodal/utils.py | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 5aca30d0b5d6..6a2f7d159a69 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -43,6 +43,15 @@ def __init__( *, allowed_local_media_path: str = "", ) -> None: + """ + Args: + media_io_kwargs: Additional args passed to process media + inputs, keyed by modalities. For example, + to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'` + connection: HTTP connection client to download media contents. + allowed_local_media_path: A local directory to load media files from. + """ super().__init__() self.media_io_kwargs: dict[str, dict[ @@ -436,23 +445,47 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor, def fetch_audio( audio_url: str, - media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, + audio_io_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[np.ndarray, Union[int, float]]: + """ + Args: + audio_url: URL of the audio file to fetch. + audio_io_kwargs: Additional kwargs passed to handle audio IO. + """ + media_io_kwargs = None if not audio_io_kwargs else { + "audio": audio_io_kwargs + } media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) return media_connector.fetch_audio(audio_url) def fetch_image( image_url: str, - media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, + image_io_kwargs: Optional[dict[str, Any]] = None, ) -> Image.Image: + """ + Args: + image_url: URL of the image file to fetch. + image_io_kwargs: Additional kwargs passed to handle image IO. 
+ """ + media_io_kwargs = None if not image_io_kwargs else { + "image": image_io_kwargs + } media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) return media_connector.fetch_image(image_url) def fetch_video( video_url: str, - media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None, + video_io_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[npt.NDArray, dict[str, Any]]: + """ + Args: + video_url: URL of the video file to fetch. + video_io_kwargs: Additional kwargs passed to handle video IO. + """ + media_io_kwargs = None if not video_io_kwargs else { + "video": video_io_kwargs + } media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) return media_connector.fetch_video(video_url)