From 06af65ad7ce3ba99b15755faf80b3c45e66aaa5c Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 7 Mar 2025 08:08:18 +0000
Subject: [PATCH 1/4] Simplify accessor for modality limit

Signed-off-by: DarkLight1337
---
 vllm/config.py                                 |  9 +++++++++
 vllm/model_executor/models/deepseek_vl2.py     |  4 ++--
 vllm/model_executor/models/h2ovl.py            |  4 ++--
 vllm/model_executor/models/llava_next_video.py |  2 +-
 vllm/model_executor/models/llava_onevision.py  |  4 ++--
 vllm/model_executor/models/minicpmo.py         |  6 +++---
 vllm/model_executor/models/minicpmv.py         |  4 ++--
 vllm/model_executor/models/pixtral.py          |  2 +-
 vllm/model_executor/models/qwen2_vl.py         |  4 ++--
 vllm/multimodal/processing.py                  |  4 ++--
 vllm/multimodal/profiling.py                   |  4 +---
 vllm/multimodal/registry.py                    |  2 +-
 12 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 3eca09b232a7..5f61c2d592c9 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2405,6 +2405,15 @@ def compute_hash(self) -> str:
         hash_str = hashlib.md5(str(factors).encode()).hexdigest()
         return hash_str
 
+    def get_limit_per_prompt(self, modality: str) -> int:
+        """
+        Get the maximum number of input items allowed per prompt
+        for the given modality.
+
+        If not set by the user, this defaults to `1`.
+        """
+        return self.limit_per_prompt.get(modality, 1)
+
     # TODO: Add configs to init vision tower or not.
 
 
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 532400b3b425..a8f7deb02379 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -230,8 +230,8 @@ def __init__(
             enable_sanity_checks=enable_sanity_checks,
         )
 
-        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
-        if self.cache is not None and mm_limit["image"] > 2:
+        if (self.cache is not None and
+                info.ctx.get_mm_config().get_limit_per_prompt("image") > 2):
             # The processor output depends on the number of images passed,
             # making it incompatible with processing cache which is supposed
             # to be invariant of how many images are passed per prompt
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index d336d7521a27..abfa4d79c02e 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -478,8 +478,8 @@ def __init__(self,
             enable_sanity_checks=enable_sanity_checks,
         )
 
-        mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
-        if self.cache is not None and mm_limit["image"] >= 2:
+        if (self.cache is not None and
+                info.ctx.get_mm_config().get_limit_per_prompt("image") >= 2):
             # The processor output depends on the number of images passed,
             # making it incompatible with processing cache which is supposed
             # to be invariant of how many images are passed per prompt
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 508b47d13519..d974c3d22409 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -133,7 +133,7 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
     def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
-        max_videos = mm_config.limit_per_prompt.get("video", 1)
+        max_videos = mm_config.get_limit_per_prompt("video")
 
         max_total_frames = self._get_max_video_frames(seq_len)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index e87ef24ce2ca..f41f45e3e409 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -206,8 +206,8 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
     def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
-        max_images = mm_config.limit_per_prompt.get("image", 1)
-        max_videos = mm_config.limit_per_prompt.get("video", 1)
+        max_images = mm_config.get_limit_per_prompt("image")
+        max_videos = mm_config.get_limit_per_prompt("video")
 
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index f35c230c0cea..db38b5d342f8 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -201,9 +201,9 @@ def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
 
     def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
-        max_images = mm_config.limit_per_prompt.get("image", 1)
-        max_videos = mm_config.limit_per_prompt.get("video", 1)
-        max_audios = mm_config.limit_per_prompt.get("audio", 1)
+        max_images = mm_config.get_limit_per_prompt("image")
+        max_videos = mm_config.get_limit_per_prompt("video")
+        max_audios = mm_config.get_limit_per_prompt("audios")
 
         # count tokens
         # which are not in get_max_image_tokens
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index cf103edd0bcc..48f0c09cdfb3 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -446,8 +446,8 @@ def get_max_video_frames(self, max_tokens: int) -> int:
 
     def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
-        max_images = mm_config.limit_per_prompt.get("image", 1)
-        max_videos = mm_config.limit_per_prompt.get("video", 1)
+        max_images = mm_config.get_limit_per_prompt("image")
+        max_videos = mm_config.get_limit_per_prompt("video")
 
         # count tokens
         # which are not in get_max_image_tokens
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 8acc07ac353a..f17f9fb8e0c7 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -68,7 +68,7 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
     image_token_id = mm_encoder.special_ids.img
 
     mm_config = ctx.get_mm_config()
-    num_images = mm_config.limit_per_prompt.get("image", 1)
+    num_images = mm_config.get_limit_per_prompt("image")
 
     # dummy size
     size = 256
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 523b53d5ee41..ac3d154dd881 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -911,8 +911,8 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
 
     def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
-        max_images = mm_config.limit_per_prompt.get("image", 1)
-        max_videos = mm_config.limit_per_prompt.get("video", 1)
+        max_images = mm_config.get_limit_per_prompt("image")
+        max_videos = mm_config.get_limit_per_prompt("video")
 
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 3f13cd8582fe..ba8a458e84c8 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -984,10 +984,10 @@ def _to_mm_items(
         before passing them to :meth:`_get_hf_mm_data`.
         """
         mm_items = self.data_parser.parse_mm_data(mm_data)
+        mm_config = self.info.ctx.get_mm_config()
 
-        mm_limits = self.info.ctx.get_mm_config().limit_per_prompt
         for modality, items in mm_items.items():
-            limit = mm_limits.get(modality, 1)
+            limit = mm_config.get_limit_per_prompt(modality)
             if len(items) > limit:
                 raise ValueError(
                     f"You set {modality}={limit} (or defaulted to 1) in "
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 3178b0f8c3e6..57f045dae6bd 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -110,12 +110,10 @@ def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
 
     def get_mm_limits(self) -> Mapping[str, int]:
         mm_config = self.processing_info.ctx.get_mm_config()
-        mm_limit_per_prompt = mm_config.limit_per_prompt
-
         supported_mm_limits = self.processing_info.get_supported_mm_limits()
 
         mm_limits = {
-            modality: mm_limit_per_prompt.get(modality, 1)
+            modality: mm_config.get_limit_per_prompt(modality)
             for modality in supported_mm_limits
         }
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index a9eb250cb877..4987cdc4a2e8 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -355,7 +355,7 @@ def init_mm_limits_per_prompt(
             # TODO: Automatically determine the limits based on budget
             # once more models support multi-image inputs
             limits_per_plugin = {
-                key: config_limits_per_plugin.get(key, 1)
+                key: multimodal_config.get_limit_per_prompt(key)
                 for key in self._plugins
             }
 

From 7fc9233f143bc953b78b319a42addb951bf64bc2 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 7 Mar 2025 08:09:58 +0000
Subject: [PATCH 2/4] Typo

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/minicpmo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index db38b5d342f8..bf6c38d27963 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -203,7 +203,7 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.get_limit_per_prompt("image")
         max_videos = mm_config.get_limit_per_prompt("video")
-        max_audios = mm_config.get_limit_per_prompt("audios")
+        max_audios = mm_config.get_limit_per_prompt("audio")
 
         # count tokens
         # which are not in get_max_image_tokens

From 7fc07acd2fbc624c3681a2fd9255e43dfaeccc21 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 7 Mar 2025 08:17:17 +0000
Subject: [PATCH 3/4] Only disable cache at inference time

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/deepseek_vl2.py | 54 ++++++++++-----------
 vllm/model_executor/models/h2ovl.py        | 56 ++++++++++------------
 2 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index a8f7deb02379..3d0ac16002e9 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -14,7 +14,6 @@
 from transformers import BatchFeature
 
 from vllm.config import VllmConfig
-from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -25,8 +24,8 @@
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, ProcessingCache,
-                                        PromptReplacement, PromptUpdate)
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
@@ -42,8 +41,6 @@
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
 
-logger = init_logger(__name__)
-
 # The image token id may be various
 _IMAGE_TOKEN = "<image>"
 
@@ -216,30 +213,6 @@ def get_dummy_processor_inputs(
 class DeepseekVL2MultiModalProcessor(
         BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]):
 
-    def __init__(
-        self,
-        info: DeepseekVL2ProcessingInfo,
-        dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]",
-        *,
-        cache: Optional[ProcessingCache] = None,
-        enable_sanity_checks: bool = True) -> None:
-        super().__init__(
-            info,
-            dummy_inputs,
-            cache=cache,
-            enable_sanity_checks=enable_sanity_checks,
-        )
-
-        if (self.cache is not None and
-                info.ctx.get_mm_config().get_limit_per_prompt("image") > 2):
-            # The processor output depends on the number of images passed,
-            # making it incompatible with processing cache which is supposed
-            # to be invariant of how many images are passed per prompt
-            self.cache = None
-            logger.warning_once(
-                f"{type(self).__name__} does not support processing cache with "
-                "image limit larger than 2.")
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -316,6 +289,29 @@ def get_replacement_deepseek_vl2(item_idx: int):
             )
         ]
 
+    def _cached_apply_hf_processor(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> tuple[list[int], MultiModalKwargs, bool]:
+        if mm_data_items.get_count("image") > 2:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with processing cache which is supposed
+            # to be invariant of how many images are passed per prompt
+            return self._apply_hf_processor_main(
+                prompt=prompt,
+                mm_items=mm_data_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                enable_hf_prompt_update=True,
+            )
+
+        return super()._cached_apply_hf_processor(
+            prompt=prompt,
+            mm_data_items=mm_data_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     DeepseekVL2MultiModalProcessor,
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index abfa4d79c02e..a81922fbcc5f 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -8,21 +8,19 @@
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Mapping, Sequence
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 from PIL import Image
 from transformers import PretrainedConfig
 
-from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
-from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
+                                        PromptUpdateDetails)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .intern_vit import InternVisionModel
@@ -32,8 +30,6 @@
                        InternVLMultiModalProcessor, build_transform,
                        find_closest_aspect_ratio, get_internvl_target_ratios)
 
-logger = init_logger(__name__)
-
 
 def resolve_h2ovl_min_max_num(
     *,
@@ -465,29 +461,6 @@ def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int:
 class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
                                ):
 
-    def __init__(self,
-                 info: H2OVLProcessingInfo,
-                 dummy_inputs: "BaseDummyInputsBuilder[H2OVLProcessingInfo]",
-                 *,
-                 cache: Optional[ProcessingCache] = None,
-                 enable_sanity_checks: bool = True) -> None:
-        super().__init__(
-            info,
-            dummy_inputs,
-            cache=cache,
-            enable_sanity_checks=enable_sanity_checks,
-        )
-
-        if (self.cache is not None and
-                info.ctx.get_mm_config().get_limit_per_prompt("image") >= 2):
-            # The processor output depends on the number of images passed,
-            # making it incompatible with processing cache which is supposed
-            # to be invariant of how many images are passed per prompt
-            self.cache = None
-            logger.warning_once(
-                f"{type(self).__name__} does not support processing cache with "
-                "multi-image support enabled.")
-
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -543,6 +516,29 @@ def get_replacement_internvl(item_idx: int):
             )
         ]
 
+    def _cached_apply_hf_processor(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> tuple[list[int], MultiModalKwargs, bool]:
+        if mm_data_items.get_count("image") > 1:
+            # The processor output depends on the number of images passed,
+            # making it incompatible with processing cache which is supposed
+            # to be invariant of how many images are passed per prompt
+            return self._apply_hf_processor_main(
+                prompt=prompt,
+                mm_items=mm_data_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+                enable_hf_prompt_update=True,
+            )
+
+        return super()._cached_apply_hf_processor(
+            prompt=prompt,
+            mm_data_items=mm_data_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        )
+
 
 @MULTIMODAL_REGISTRY.register_processor(
     H2OVLMultiModalProcessor,

From 809b251ec4db384af8a2e339049ec4ebfa0bc56c Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 7 Mar 2025 08:23:40 +0000
Subject: [PATCH 4/4] Update

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/deepseek_vl2.py | 10 ++++++----
 vllm/model_executor/models/h2ovl.py        | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 3d0ac16002e9..fd5d5a564b5e 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -295,10 +295,12 @@ def _cached_apply_hf_processor(
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs, bool]:
-        if mm_data_items.get_count("image") > 2:
-            # The processor output depends on the number of images passed,
-            # making it incompatible with processing cache which is supposed
-            # to be invariant of how many images are passed per prompt
+        # The processor logic is different for len(images) <= 2 vs > 2
+        # Since the processing cache assumes that the processor output is
+        # invariant of how many images are passed per prompt, we only
+        # perform caching for the most common case
+        if mm_data_items.get_count("image", strict=False) > 2:
+            # This code path corresponds to the cache being disabled
             return self._apply_hf_processor_main(
                 prompt=prompt,
                 mm_items=mm_data_items,
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index a81922fbcc5f..e23765cc4fb5 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -522,10 +522,12 @@ def _cached_apply_hf_processor(
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs, bool]:
-        if mm_data_items.get_count("image") > 1:
-            # The processor output depends on the number of images passed,
-            # making it incompatible with processing cache which is supposed
-            # to be invariant of how many images are passed per prompt
+        # The processor logic is different for len(images) <= 1 vs > 1
+        # Since the processing cache assumes that the processor output is
+        # invariant of how many images are passed per prompt, we only
+        # perform caching for the most common case
+        if mm_data_items.get_count("image", strict=False) > 1:
+            # This code path corresponds to the cache being disabled
             return self._apply_hf_processor_main(
                 prompt=prompt,
                 mm_items=mm_data_items,