
Commit 6ff5186

[Bugfix] Fix deepseek-vl2 inference with more than 2 images (#13818)
1 parent fa82074 commit 6ff5186

File tree

vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/h2ovl.py

2 files changed: +46 -10 lines changed

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 42 additions & 8 deletions
@@ -25,7 +25,8 @@
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
@@ -138,18 +139,24 @@ def get_hf_processor(self, **kwargs: object):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_num_image_tokens(self, *, image_width: int,
-                             image_height: int) -> int:
+    def get_num_image_tokens(self,
+                             *,
+                             image_width: int,
+                             image_height: int,
+                             cropping: bool = True) -> int:
         hf_processor = self.get_hf_processor()
         image_size = hf_processor.image_size
         patch_size = hf_processor.patch_size
         downsample_ratio = hf_processor.downsample_ratio
 
-        best_width, best_height = hf_processor.select_best_resolution(
-            (image_width, image_height))
+        if cropping:
+            best_width, best_height = hf_processor.select_best_resolution(
+                (image_width, image_height))
+            num_width_tiles, num_height_tiles = (best_width // image_size,
+                                                 best_height // image_size)
+        else:
+            num_width_tiles = num_height_tiles = 1
 
-        num_width_tiles, num_height_tiles = (best_width // image_size,
-                                             best_height // image_size)
         h = w = math.ceil((image_size // patch_size) / downsample_ratio)
 
         global_views_tokens = h * (w + 1)
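
The behavioral core of the fix: with cropping disabled, the tile grid is pinned to 1x1, so an image's token count no longer depends on its resolution. A minimal sketch of that effect, assuming a hypothetical tile size of 384 and a round-up tile fit (select_best_resolution's actual policy may differ):

import math

def select_tiles(image_width: int, image_height: int,
                 image_size: int = 384, cropping: bool = True):
    if cropping:
        # Cropping enabled (<= 2 images): the tile grid tracks the image
        # resolution, approximated here by rounding up to whole tiles.
        return (math.ceil(image_width / image_size),
                math.ceil(image_height / image_size))
    # Cropping disabled (> 2 images): every image collapses to a single
    # global tile, so all resolutions produce the same token count.
    return 1, 1

print(select_tiles(1024, 768))                  # (3, 2)
print(select_tiles(1024, 768, cropping=False))  # (1, 1)
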
@@ -169,10 +176,12 @@ def get_mm_max_tokens_per_item(
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> Mapping[str, int]:
+        num_images = mm_counts.get("image", 0)
         max_image_size = self.get_image_size_with_most_features()
         max_image_tokens = self.get_num_image_tokens(
             image_height=max_image_size.height,
-            image_width=max_image_size.width)
+            image_width=max_image_size.width,
+            cropping=num_images <= 2)
 
         return {"image": max_image_tokens}
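
To make the profiling budget concrete, a worked instance of the visible formula with assumed constants (the real DeepSeek-VL2 processor values may differ):

import math

image_size, patch_size, downsample_ratio = 384, 16, 2  # assumed values

h = w = math.ceil((image_size // patch_size) / downsample_ratio)
global_views_tokens = h * (w + 1)
print(h, global_views_tokens)  # 12 156 under these assumptions

With more than two images allowed per prompt, profiling therefore budgets each image at the fixed single-tile count rather than the worst-case cropped count, matching what the processor emits at runtime.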

@@ -207,6 +216,30 @@ def get_dummy_processor_inputs(
 class DeepseekVL2MultiModalProcessor(
         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
 
+    def __init__(
+            self,
+            info: DeepseekVL2ProcessingInfo,
+            dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]",
+            *,
+            cache: Optional[ProcessingCache] = None,
+            enable_sanity_checks: bool = True) -> None:
+        super().__init__(
+            info,
+            dummy_inputs,
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
+        if self.cache is not None and mm_limit["image"] > 2:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with processing cache which is supposed
+            # to be invariant of how many images are passed per prompt
+            self.cache = None
+            logger.warning_once(
+                f"{type(self).__name__} does not support processing cache with "
+                "image limit larger than 2.")
+
     def _call_hf_processor(
         self,
         prompt: str,
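
The new constructor exists to guard vLLM's per-item processing cache. As the inline comment says, the cache assumes an item's processed output is invariant of the rest of the prompt, which no longer holds once a third image flips the cropping behavior. A toy illustration of the staleness (not vLLM's actual cache internals):

cache = {}

def process(image_key: str, num_images: int):
    # A per-item cache keys on the image alone...
    if image_key in cache:
        return cache[image_key]
    # ...but the result also depends on batch-level context.
    mode = "cropped" if num_images <= 2 else "uncropped"
    cache[image_key] = (mode, image_key)
    return cache[image_key]

print(process("img_A", num_images=1))  # ('cropped', 'img_A')
print(process("img_A", num_images=3))  # stale hit: still 'cropped'
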
@@ -271,6 +304,7 @@ def get_replacement_deepseek_vl2(item_idx: int):
             num_image_tokens = self.info.get_num_image_tokens(
                 image_width=image_size.width,
                 image_height=image_size.height,
+                cropping=len(images) <= 2,
             )
             return [image_token_id] * num_image_tokens
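
Downstream, each image placeholder in the prompt is expanded to num_image_tokens copies of the image token id, so the replacement length now agrees with the cropping decision the HF processor actually made. A sketch with a hypothetical token id:

IMAGE_TOKEN_ID = 128815  # hypothetical id for the image placeholder

def get_replacement(num_image_tokens: int) -> list[int]:
    # Mirrors the visible return: one id repeated per image token.
    return [IMAGE_TOKEN_ID] * num_image_tokens

# With > 2 images cropping is off, so every image expands to the same,
# resolution-independent run of placeholder ids.
print(len(get_replacement(156)))  # 156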

vllm/model_executor/models/h2ovl.py

Lines changed: 4 additions & 2 deletions
@@ -477,13 +477,15 @@ def __init__(self,
             enable_sanity_checks=enable_sanity_checks,
         )
 
-        if self.cache is not None:
+        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
+        if self.cache is not None and mm_limit["image"] >= 2:
             # The processor output depends on the number of images passed,
             # making it incompatible with processing cache which is supposed
             # to be invariant of how many images are passed per prompt
             self.cache = None
             logger.warning_once(
-                f"{type(self).__name__} does not support processing cache.")
+                f"{type(self).__name__} does not support processing cache with "
+                "multi-image support enabled.")
 
     def _get_prompt_replacements(
         self,
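
Note the thresholds differ by model: DeepSeek-VL2's processor output only diverges beyond two images (the cache is dropped when the limit is > 2), while H2OVL's already diverges once a second image appears (limit >= 2). Both apply the same guard pattern, roughly:

from typing import Optional

def maybe_disable_cache(cache: Optional[object],
                        image_limit: int,
                        max_cache_safe_images: int) -> Optional[object]:
    # Keep the per-item cache only while the processor output cannot
    # depend on how many images share the prompt.
    if cache is not None and image_limit > max_cache_safe_images:
        return None  # fall back to uncached processing
    return cache

# deepseek-vl2 tolerates up to 2 images; h2ovl only 1.
assert maybe_disable_cache(object(), 2, max_cache_safe_images=2) is not None
assert maybe_disable_cache(object(), 2, max_cache_safe_images=1) is None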
