 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement)
+                                        BaseProcessingInfo, ProcessingCache,
+                                        PromptReplacement)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
@@ -138,18 +139,24 @@ def get_hf_processor(self, **kwargs: object):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
-    def get_num_image_tokens(self, *, image_width: int,
-                             image_height: int) -> int:
+    def get_num_image_tokens(self,
+                             *,
+                             image_width: int,
+                             image_height: int,
+                             cropping: bool = True) -> int:
         hf_processor = self.get_hf_processor()
         image_size = hf_processor.image_size
         patch_size = hf_processor.patch_size
         downsample_ratio = hf_processor.downsample_ratio
 
-        best_width, best_height = hf_processor.select_best_resolution(
-            (image_width, image_height))
+        if cropping:
+            best_width, best_height = hf_processor.select_best_resolution(
+                (image_width, image_height))
+            num_width_tiles, num_height_tiles = (best_width // image_size,
+                                                 best_height // image_size)
+        else:
+            num_width_tiles = num_height_tiles = 1
 
-        num_width_tiles, num_height_tiles = (best_width // image_size,
-                                             best_height // image_size)
         h = w = math.ceil((image_size // patch_size) / downsample_ratio)
 
         global_views_tokens = h * (w + 1)
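For intuition, here is a self-contained sketch of the token arithmetic this hunk implements. The default constants, the tile-selection stand-in for `select_best_resolution`, and the local-views term (the hunk is truncated before it) are illustrative assumptions, not values read from the real HF processor:

```python
import math

def count_image_tokens(image_width: int,
                       image_height: int,
                       cropping: bool = True,
                       image_size: int = 384,
                       patch_size: int = 16,
                       downsample_ratio: int = 2) -> int:
    if cropping:
        # Stand-in for hf_processor.select_best_resolution(): pick the
        # tile grid closest to the input resolution (illustrative only).
        num_width_tiles = max(1, round(image_width / image_size))
        num_height_tiles = max(1, round(image_height / image_size))
    else:
        # No cropping: a single global view, independent of input size.
        num_width_tiles = num_height_tiles = 1

    h = w = math.ceil((image_size // patch_size) / downsample_ratio)
    global_views_tokens = h * (w + 1)
    # Assumed local-views term; the hunk above cuts off before it.
    local_views_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
    return global_views_tokens + local_views_tokens + 1

# With cropping disabled, the count is constant regardless of resolution:
assert count_image_tokens(1024, 768, cropping=False) == \
       count_image_tokens(512, 512, cropping=False)
```

This is the property the rest of the commit relies on: with `cropping=False`, every image contributes the same number of placeholder tokens no matter its resolution.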
@@ -169,10 +176,12 @@ def get_mm_max_tokens_per_item(
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> Mapping[str, int]:
+        num_images = mm_counts.get("image", 0)
         max_image_size = self.get_image_size_with_most_features()
         max_image_tokens = self.get_num_image_tokens(
             image_height=max_image_size.height,
-            image_width=max_image_size.width)
+            image_width=max_image_size.width,
+            cropping=num_images <= 2)
 
         return {"image": max_image_tokens}
 
@@ -207,6 +216,30 @@ def get_dummy_processor_inputs(
 class DeepseekVL2MultiModalProcessor(
         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
 
+    def __init__(
+        self,
+        info: DeepseekVL2ProcessingInfo,
+        dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]",
+        *,
+        cache: Optional[ProcessingCache] = None,
+        enable_sanity_checks: bool = True) -> None:
+        super().__init__(
+            info,
+            dummy_inputs,
+            cache=cache,
+            enable_sanity_checks=enable_sanity_checks,
+        )
+
+        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
+        if self.cache is not None and mm_limit["image"] > 2:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with processing cache which is supposed
+            # to be invariant of how many images are passed per prompt
+            self.cache = None
+            logger.warning_once(
+                f"{type(self).__name__} does not support processing cache with "
+                "image limit larger than 2.")
+
     def _call_hf_processor(
         self,
         prompt: str,
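The `__init__` guard above exists because the processing cache keys each image individually, while this processor's output also depends on how many images share the prompt (via `cropping`). A toy sketch, not vLLM code, of the stale-hit failure the guard prevents (the token counts are illustrative only):

```python
cache: dict[bytes, int] = {}

def process_one(image_bytes: bytes, num_images_in_prompt: int) -> int:
    # Cache key covers only the image itself, not the prompt-level count.
    if image_bytes in cache:
        return cache[image_bytes]
    cropping = num_images_in_prompt <= 2
    tokens = 469 if cropping else 313  # illustrative counts only
    cache[image_bytes] = tokens
    return tokens

img = b"same image"
a = process_one(img, num_images_in_prompt=1)  # cached with cropping=True
b = process_one(img, num_images_in_prompt=3)  # stale hit: cropping ignored
assert a == b  # wrong result the __init__ guard above avoids
```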
@@ -271,6 +304,7 @@ def get_replacement_deepseek_vl2(item_idx: int):
                 num_image_tokens = self.info.get_num_image_tokens(
                     image_width=image_size.width,
                     image_height=image_size.height,
+                    cropping=len(images) <= 2,
                 )
             return [image_token_id] * num_image_tokens
 
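End to end, the new behavior shows up when the per-prompt image limit exceeds 2: processing still works, but the processor opts out of the cache and logs the warning once. A hypothetical offline-inference configuration that would trigger it (the checkpoint, `hf_overrides`, and limit value are just an example setup, mirroring vLLM's vision-language examples):

```python
from vllm import LLM

# limit of 3 images per prompt > 2 => processing cache is disabled
# and the warning from __init__ is emitted once.
llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    limit_mm_per_prompt={"image": 3},
)
```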