
Commit c97653d

Remove global media connector.
Signed-off-by: Chenheli Hua <[email protected]>
1 parent c1909e7 commit c97653d
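
For downstream callers, the change is mechanical: the module-level fetch_image / fetch_audio / fetch_video helpers in vllm.multimodal.utils (previously bound to a process-global connector) are replaced by per-module MediaConnector instances. A minimal sketch of the new pattern, assuming a vLLM build that includes this commit (the URL is one of those used in the diff below):

    from vllm.multimodal.utils import MediaConnector

    # Each module now owns its connector instead of sharing a global one.
    connector = MediaConnector()
    image = connector.fetch_image(
        "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg"
    )  # a PIL.Image.Image, usable directly as multi-modal input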

File tree

8 files changed (+64, -51 lines)


examples/offline_inference/vision_language_embedding.py

Lines changed: 5 additions & 3 deletions
@@ -15,9 +15,11 @@
 from PIL.Image import Image

 from vllm import LLM, EngineArgs
-from vllm.multimodal.utils import fetch_image
+from vllm.multimodal.utils import MediaConnector
 from vllm.utils import FlexibleArgumentParser

+OFFLINE_MEDIA_CONNECTOR = MediaConnector()
+

 class TextQuery(TypedDict):
     modality: Literal["text"]

@@ -114,7 +116,7 @@ def get_query(modality: QueryModality):
     if modality == "image":
         return ImageQuery(
             modality="image",
-            image=fetch_image(
+            image=OFFLINE_MEDIA_CONNECTOR.fetch_image(
                 "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501
             ),
         )

@@ -123,7 +125,7 @@ def get_query(modality: QueryModality):
         return TextImageQuery(
             modality="text+image",
             text="A cat standing in the snow.",
-            image=fetch_image(
+            image=OFFLINE_MEDIA_CONNECTOR.fetch_image(
                 "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501
             ),
         )

examples/offline_inference/vision_language_multi_image.py

Lines changed: 36 additions & 29 deletions
@@ -17,7 +17,7 @@

 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.utils import fetch_image
+from vllm.multimodal.utils import MediaConnector
 from vllm.utils import FlexibleArgumentParser

 QUESTION = "What is the content of each image?"

@@ -35,6 +35,7 @@
     "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
     "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
 ]
+OFFLINE_MEDIA_CONNECTOR = MediaConnector()


 class ModelRequestData(NamedTuple):

@@ -70,7 +71,7 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
         engine_args=engine_args,
         prompt=prompt,
         stop_token_ids=stop_token_ids,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -103,7 +104,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -126,7 +127,7 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -160,7 +161,7 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -193,7 +194,7 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
         engine_args=engine_args,
         prompt=prompt,
         stop_token_ids=stop_token_ids,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -221,7 +222,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -249,7 +250,7 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -285,7 +286,7 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
         engine_args=engine_args,
         prompt=prompt,
         stop_token_ids=stop_token_ids,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -319,7 +320,7 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -352,7 +353,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -385,7 +386,7 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -419,7 +420,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -451,7 +452,7 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         messages, tokenize=False, add_generation_prompt=True
     )

-    image_data = [fetch_image(url) for url in image_urls]
+    image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls]

     return ModelRequestData(
         engine_args=engine_args,

@@ -491,7 +492,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -513,7 +514,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -533,7 +534,7 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -563,7 +564,7 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -593,7 +594,7 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -615,7 +616,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -648,7 +649,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
     )


@@ -680,7 +681,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
         lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
     )


@@ -723,7 +724,7 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
         engine_args=engine_args,
         prompt=prompt,
         stop_token_ids=stop_token_ids,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=[OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls],
         chat_template=chat_template,
     )


@@ -768,7 +769,7 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )

     if smart_resize is None:
-        image_data = [fetch_image(url) for url in image_urls]
+        image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls]
     else:

         def post_process_image(image: Image) -> Image:

@@ -778,7 +779,10 @@ def post_process_image(image: Image) -> Image:
             )
             return image.resize((resized_width, resized_height))

-        image_data = [post_process_image(fetch_image(url)) for url in image_urls]
+        image_data = [
+            post_process_image(OFFLINE_MEDIA_CONNECTOR.fetch_image(url))
+            for url in image_urls
+        ]

     return ModelRequestData(
         engine_args=engine_args,

@@ -826,7 +830,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )

     if smart_resize is None:
-        image_data = [fetch_image(url) for url in image_urls]
+        image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls]
     else:

         def post_process_image(image: Image) -> Image:

@@ -836,7 +840,10 @@ def post_process_image(image: Image) -> Image:
             )
             return image.resize((resized_width, resized_height))

-        image_data = [post_process_image(fetch_image(url)) for url in image_urls]
+        image_data = [
+            post_process_image(OFFLINE_MEDIA_CONNECTOR.fetch_image(url))
+            for url in image_urls
+        ]

     return ModelRequestData(
         engine_args=engine_args,

@@ -856,7 +863,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     )

     prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:"
-    image_data = [fetch_image(url) for url in image_urls]
+    image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls]

     return ModelRequestData(
         engine_args=engine_args,

@@ -882,7 +889,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
         f"<|vision_end|>{question}<|im_end|>\n"
         "<|im_start|>assistant\n"
     )
-    image_data = [fetch_image(url) for url in image_urls]
+    image_data = [OFFLINE_MEDIA_CONNECTOR.fetch_image(url) for url in image_urls]

     return ModelRequestData(
         engine_args=engine_args,

tests/entrypoints/openai/test_audio.py

Lines changed: 5 additions & 2 deletions
@@ -8,7 +8,7 @@
 import pytest_asyncio

 from vllm.assets.audio import AudioAsset
-from vllm.multimodal.utils import encode_audio_base64, fetch_audio
+from vllm.multimodal.utils import MediaConnector, encode_audio_base64

 from ...utils import RemoteOpenAIServer

@@ -19,6 +19,8 @@
 ]
 MAXIMUM_AUDIOS = 2

+TEST_MEDIA_CONNECTOR = MediaConnector()
+

 @pytest.fixture(scope="module")
 def server():

@@ -46,7 +48,8 @@ async def client(server):
 @pytest.fixture(scope="session")
 def base64_encoded_audio() -> dict[str, str]:
     return {
-        audio_url: encode_audio_base64(*fetch_audio(audio_url))
+        audio_url:
+        encode_audio_base64(*TEST_MEDIA_CONNECTOR.fetch_audio(audio_url))
         for audio_url in TEST_AUDIO_URLS
     }
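
One detail in the fixture above: fetch_audio returns an (audio, sampling_rate) pair, and the star-unpacking passes both values to encode_audio_base64. Spelled out without unpacking (a sketch; audio_url stands for any entry of TEST_AUDIO_URLS):

    # Equivalent to encode_audio_base64(*TEST_MEDIA_CONNECTOR.fetch_audio(audio_url))
    audio, sampling_rate = TEST_MEDIA_CONNECTOR.fetch_audio(audio_url)
    audio_b64 = encode_audio_base64(audio, sampling_rate)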

tests/entrypoints/openai/test_video.py

Lines changed: 5 additions & 2 deletions
@@ -7,7 +7,7 @@
 import pytest
 import pytest_asyncio

-from vllm.multimodal.utils import encode_video_base64, fetch_video
+from vllm.multimodal.utils import MediaConnector, encode_video_base64

 from ...utils import RemoteOpenAIServer

@@ -21,6 +21,8 @@
     "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
 ]

+TEST_MEDIA_CONNECTOR = MediaConnector()
+

 @pytest.fixture(scope="module")
 def server():

@@ -50,7 +52,8 @@ async def client(server):
 @pytest.fixture(scope="session")
 def base64_encoded_video() -> dict[str, str]:
     return {
-        video_url: encode_video_base64(fetch_video(video_url)[0])
+        video_url:
+        encode_video_base64(TEST_MEDIA_CONNECTOR.fetch_video(video_url)[0])
         for video_url in TEST_VIDEO_URLS
     }
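
The video fixture mirrors the audio one, except that fetch_video returns a tuple and the [0] indexing keeps only its first element (the decoded frames) for encoding. Equivalently (a sketch; video_url stands for any entry of TEST_VIDEO_URLS):

    # Only the frames are encoded; the rest of the returned tuple is dropped.
    frames = TEST_MEDIA_CONNECTOR.fetch_video(video_url)[0]
    video_b64 = encode_video_base64(frames)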

tests/entrypoints/openai/test_vision.py

Lines changed: 4 additions & 2 deletions
@@ -10,12 +10,13 @@
 from PIL import Image
 from transformers import AutoProcessor

-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import MediaConnector, encode_image_base64

 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
 MAXIMUM_IMAGES = 2
+TEST_MEDIA_CONNECTOR = MediaConnector()

 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [

@@ -73,7 +74,8 @@ async def client(server):
 @pytest.fixture(scope="session")
 def base64_encoded_image() -> dict[str, str]:
     return {
-        image_url: encode_image_base64(fetch_image(image_url))
+        image_url:
+        encode_image_base64(TEST_MEDIA_CONNECTOR.fetch_image(image_url))
         for image_url in TEST_IMAGE_URLS
     }
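
All three test fixtures share one shape: fetch each media item once per session, base64-encode it, and key the result by URL. For context, one illustrative way a value like base64_encoded_image[url] is consumed in an OpenAI-compatible chat request (the data-URL framing follows the OpenAI vision convention; the actual request bodies live in test code this diff does not touch):

    # Hypothetical consumer of the fixture, not part of this commit.
    image_b64 = base64_encoded_image[TEST_IMAGE_URLS[0]]
    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            {"type": "text", "text": "What is in this image?"},
        ],
    }]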
