17
17
18
18
from vllm import LLM , EngineArgs , SamplingParams
19
19
from vllm .lora .request import LoRARequest
20
- from vllm .multimodal .utils import fetch_image
20
+ from vllm .multimodal .utils import MediaConnector
21
21
from vllm .utils import FlexibleArgumentParser
22
22
23
23
QUESTION = "What is the content of each image?"
35
35
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg" ,
36
36
"https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg" ,
37
37
]
38
+ OFFLINE_MEDIA_CONNECTOR = MediaConnector ()
38
39
39
40
40
41
class ModelRequestData (NamedTuple ):
@@ -70,7 +71,7 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
70
71
engine_args = engine_args ,
71
72
prompt = prompt ,
72
73
stop_token_ids = stop_token_ids ,
73
- image_data = [fetch_image (url ) for url in image_urls ],
74
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
74
75
)
75
76
76
77
@@ -103,7 +104,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
103
104
return ModelRequestData (
104
105
engine_args = engine_args ,
105
106
prompt = prompt ,
106
- image_data = [fetch_image (url ) for url in image_urls ],
107
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
107
108
)
108
109
109
110
@@ -126,7 +127,7 @@ def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
126
127
return ModelRequestData (
127
128
engine_args = engine_args ,
128
129
prompt = prompt ,
129
- image_data = [fetch_image (url ) for url in image_urls ],
130
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
130
131
)
131
132
132
133
@@ -160,7 +161,7 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
160
161
return ModelRequestData (
161
162
engine_args = engine_args ,
162
163
prompt = prompt ,
163
- image_data = [fetch_image (url ) for url in image_urls ],
164
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
164
165
)
165
166
166
167
@@ -193,7 +194,7 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
193
194
engine_args = engine_args ,
194
195
prompt = prompt ,
195
196
stop_token_ids = stop_token_ids ,
196
- image_data = [fetch_image (url ) for url in image_urls ],
197
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
197
198
)
198
199
199
200
@@ -221,7 +222,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
221
222
return ModelRequestData (
222
223
engine_args = engine_args ,
223
224
prompt = prompt ,
224
- image_data = [fetch_image (url ) for url in image_urls ],
225
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
225
226
)
226
227
227
228
@@ -249,7 +250,7 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
249
250
return ModelRequestData (
250
251
engine_args = engine_args ,
251
252
prompt = prompt ,
252
- image_data = [fetch_image (url ) for url in image_urls ],
253
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
253
254
)
254
255
255
256
@@ -285,7 +286,7 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
285
286
engine_args = engine_args ,
286
287
prompt = prompt ,
287
288
stop_token_ids = stop_token_ids ,
288
- image_data = [fetch_image (url ) for url in image_urls ],
289
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
289
290
)
290
291
291
292
@@ -319,7 +320,7 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
319
320
return ModelRequestData (
320
321
engine_args = engine_args ,
321
322
prompt = prompt ,
322
- image_data = [fetch_image (url ) for url in image_urls ],
323
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
323
324
)
324
325
325
326
@@ -352,7 +353,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
352
353
return ModelRequestData (
353
354
engine_args = engine_args ,
354
355
prompt = prompt ,
355
- image_data = [fetch_image (url ) for url in image_urls ],
356
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
356
357
)
357
358
358
359
@@ -385,7 +386,7 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
385
386
return ModelRequestData (
386
387
engine_args = engine_args ,
387
388
prompt = prompt ,
388
- image_data = [fetch_image (url ) for url in image_urls ],
389
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
389
390
)
390
391
391
392
@@ -419,7 +420,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
419
420
return ModelRequestData (
420
421
engine_args = engine_args ,
421
422
prompt = prompt ,
422
- image_data = [fetch_image (url ) for url in image_urls ],
423
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
423
424
)
424
425
425
426
@@ -451,7 +452,7 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
451
452
messages , tokenize = False , add_generation_prompt = True
452
453
)
453
454
454
- image_data = [fetch_image (url ) for url in image_urls ]
455
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ]
455
456
456
457
return ModelRequestData (
457
458
engine_args = engine_args ,
@@ -491,7 +492,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
491
492
return ModelRequestData (
492
493
engine_args = engine_args ,
493
494
prompt = prompt ,
494
- image_data = [fetch_image (url ) for url in image_urls ],
495
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
495
496
)
496
497
497
498
@@ -513,7 +514,7 @@ def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
513
514
return ModelRequestData (
514
515
engine_args = engine_args ,
515
516
prompt = prompt ,
516
- image_data = [fetch_image (url ) for url in image_urls ],
517
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
517
518
)
518
519
519
520
@@ -533,7 +534,7 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
533
534
return ModelRequestData (
534
535
engine_args = engine_args ,
535
536
prompt = prompt ,
536
- image_data = [fetch_image (url ) for url in image_urls ],
537
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
537
538
)
538
539
539
540
@@ -563,7 +564,7 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
563
564
return ModelRequestData (
564
565
engine_args = engine_args ,
565
566
prompt = prompt ,
566
- image_data = [fetch_image (url ) for url in image_urls ],
567
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
567
568
)
568
569
569
570
@@ -593,7 +594,7 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
593
594
return ModelRequestData (
594
595
engine_args = engine_args ,
595
596
prompt = prompt ,
596
- image_data = [fetch_image (url ) for url in image_urls ],
597
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
597
598
)
598
599
599
600
@@ -615,7 +616,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
615
616
return ModelRequestData (
616
617
engine_args = engine_args ,
617
618
prompt = prompt ,
618
- image_data = [fetch_image (url ) for url in image_urls ],
619
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
619
620
)
620
621
621
622
@@ -648,7 +649,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
648
649
return ModelRequestData (
649
650
engine_args = engine_args ,
650
651
prompt = prompt ,
651
- image_data = [fetch_image (url ) for url in image_urls ],
652
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
652
653
)
653
654
654
655
@@ -680,7 +681,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
680
681
return ModelRequestData (
681
682
engine_args = engine_args ,
682
683
prompt = prompt ,
683
- image_data = [fetch_image (url ) for url in image_urls ],
684
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
684
685
lora_requests = [LoRARequest ("vision" , 1 , vision_lora_path )],
685
686
)
686
687
@@ -723,7 +724,7 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
723
724
engine_args = engine_args ,
724
725
prompt = prompt ,
725
726
stop_token_ids = stop_token_ids ,
726
- image_data = [fetch_image (url ) for url in image_urls ],
727
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ],
727
728
chat_template = chat_template ,
728
729
)
729
730
@@ -768,7 +769,7 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
768
769
)
769
770
770
771
if smart_resize is None :
771
- image_data = [fetch_image (url ) for url in image_urls ]
772
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ]
772
773
else :
773
774
774
775
def post_process_image (image : Image ) -> Image :
@@ -778,7 +779,10 @@ def post_process_image(image: Image) -> Image:
778
779
)
779
780
return image .resize ((resized_width , resized_height ))
780
781
781
- image_data = [post_process_image (fetch_image (url )) for url in image_urls ]
782
+ image_data = [
783
+ post_process_image (OFFLINE_MEDIA_CONNECTOR .fetch_image (url ))
784
+ for url in image_urls
785
+ ]
782
786
783
787
return ModelRequestData (
784
788
engine_args = engine_args ,
@@ -826,7 +830,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
826
830
)
827
831
828
832
if smart_resize is None :
829
- image_data = [fetch_image (url ) for url in image_urls ]
833
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ]
830
834
else :
831
835
832
836
def post_process_image (image : Image ) -> Image :
@@ -836,7 +840,10 @@ def post_process_image(image: Image) -> Image:
836
840
)
837
841
return image .resize ((resized_width , resized_height ))
838
842
839
- image_data = [post_process_image (fetch_image (url )) for url in image_urls ]
843
+ image_data = [
844
+ post_process_image (OFFLINE_MEDIA_CONNECTOR .fetch_image (url ))
845
+ for url in image_urls
846
+ ]
840
847
841
848
return ModelRequestData (
842
849
engine_args = engine_args ,
@@ -856,7 +863,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
856
863
)
857
864
858
865
prompt = f"USER: { '<image>' * len (image_urls )} \n { question } \n ASSISTANT:"
859
- image_data = [fetch_image (url ) for url in image_urls ]
866
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ]
860
867
861
868
return ModelRequestData (
862
869
engine_args = engine_args ,
@@ -882,7 +889,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
882
889
f"<|vision_end|>{ question } <|im_end|>\n "
883
890
"<|im_start|>assistant\n "
884
891
)
885
- image_data = [fetch_image (url ) for url in image_urls ]
892
+ image_data = [OFFLINE_MEDIA_CONNECTOR . fetch_image (url ) for url in image_urls ]
886
893
887
894
return ModelRequestData (
888
895
engine_args = engine_args ,
0 commit comments