From d1fe607b2d5b354c51f6f5a5271ebde5914069a0 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 6 Mar 2025 07:00:29 +0000
Subject: [PATCH 1/2] [Core] Optimize multi-modal profiling

Signed-off-by: DarkLight1337
---
 vllm/inputs/registry.py      | 16 +++-------------
 vllm/multimodal/profiling.py |  5 -----
 vllm/multimodal/registry.py  | 16 ++++++++++++----
 3 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 691fcd7dc53f..87e3384e1097 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -331,7 +331,9 @@ def dummy_data_for_profiling(
 
         if mm_registry.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
-            processor = mm_registry.create_processor(model_config, tokenizer)
+            processor = mm_registry.create_processor(model_config,
+                                                     tokenizer,
+                                                     disable_cache=True)
             profiler = MultiModalProfiler(processor)
             dummy_data = profiler.get_dummy_data(
                 seq_len, is_encoder_data=is_encoder_data)
@@ -349,18 +351,6 @@ def dummy_data_for_profiling(
                                        _MultiModalCounts(mm_counts),
                                        **mm_processor_kwargs)
 
-        # Having more tokens is over-conservative but otherwise fine
-        num_tokens = dummy_data.seq_data.prompt_token_ids
-        if len(num_tokens) < seq_len:
-            if is_encoder_data:
-                logger.warning_once(
-                    f"Expected at least {seq_len} dummy encoder tokens for "
-                    f"profiling, but found {len(num_tokens)} tokens instead.")
-            else:
-                raise AssertionError(
-                    f"Expected at least {seq_len} dummy tokens for profiling, "
-                    f"but found {len(num_tokens)} tokens instead.")
-
         if (dummy_data.multi_modal_data is not None and
                 not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
             for k, v in dummy_data.multi_modal_data.items():
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 3178b0f8c3e6..c872aa5eddad 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -204,17 +204,12 @@ def get_dummy_data(
                 "and/or reduce `mm_counts`.", seq_len, total_len,
                 total_placeholders_by_modality)
 
-            num_tokens_to_pad = max(total_len, seq_len) - total_len
-            prompt_token_ids.extend([0] * num_tokens_to_pad)
-
             return DummyData(
                 seq_data=SequenceData.from_seqs(prompt_token_ids),
                 multi_modal_data=None,
                 multi_modal_placeholders=None,
             )
 
-        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
-
         return DummyData(
             seq_data=SequenceData.from_seqs(prompt_token_ids),
             multi_modal_data=mm_inputs["mm_kwargs"],
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 1882ffe9bf69..a9eb250cb877 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -257,7 +257,9 @@ def get_max_tokens_per_item_by_modality(
         """
         if self.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
-            processor = self.create_processor(model_config, tokenizer)
+            processor = self.create_processor(model_config,
+                                              tokenizer,
+                                              disable_cache=True)
             seq_len = model_config.max_model_len
             mm_limits = self.get_mm_limits_per_prompt(model_config)
             return processor.info.get_mm_max_tokens_per_item(
@@ -372,7 +374,9 @@ def get_mm_limits_per_prompt(
         """
         if self.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
-            processor = self.create_processor(model_config, tokenizer)
+            processor = self.create_processor(model_config,
+                                              tokenizer,
+                                              disable_cache=True)
             profiler = MultiModalProfiler(processor)
             return profiler.get_mm_limits()
 
@@ -433,6 +437,8 @@ def create_processor(
         self,
         model_config: "ModelConfig",
         tokenizer: AnyTokenizer,
+        *,
+        disable_cache: Optional[bool] = None,
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
@@ -440,11 +446,13 @@ def create_processor(
         See also:
             :ref:`mm-processing`
         """
+        if disable_cache is None:
+            disable_cache = model_config.disable_mm_preprocessor_cache
+
         model_cls = self._get_model_cls(model_config)
         factories = self._processor_factories[model_cls]
 
         ctx = InputProcessingContext(model_config, tokenizer)
-        cache = (None if model_config.disable_mm_preprocessor_cache else
-                 self._processing_cache)
+        cache = None if disable_cache else self._processing_cache
 
         return factories.build_processor(ctx, cache=cache)

From 6ba2d40a2ce9a33c8ab4ce69a506215cb7f7040d Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 6 Mar 2025 13:36:33 +0000
Subject: [PATCH 2/2] Revert padding

Signed-off-by: DarkLight1337
---
 vllm/inputs/registry.py      | 12 ++++++++++++
 vllm/multimodal/profiling.py |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 87e3384e1097..babfc4fb809c 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -351,6 +351,18 @@ def dummy_data_for_profiling(
                                        _MultiModalCounts(mm_counts),
                                        **mm_processor_kwargs)
 
+        # Having more tokens is over-conservative but otherwise fine
+        num_tokens = dummy_data.seq_data.prompt_token_ids
+        if len(num_tokens) < seq_len:
+            if is_encoder_data:
+                logger.warning_once(
+                    f"Expected at least {seq_len} dummy encoder tokens for "
+                    f"profiling, but found {len(num_tokens)} tokens instead.")
+            else:
+                raise AssertionError(
+                    f"Expected at least {seq_len} dummy tokens for profiling, "
+                    f"but found {len(num_tokens)} tokens instead.")
+
         if (dummy_data.multi_modal_data is not None and
                 not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
             for k, v in dummy_data.multi_modal_data.items():
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index c872aa5eddad..3178b0f8c3e6 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -204,12 +204,17 @@ def get_dummy_data(
                 "and/or reduce `mm_counts`.", seq_len, total_len,
                 total_placeholders_by_modality)
 
+            num_tokens_to_pad = max(total_len, seq_len) - total_len
+            prompt_token_ids.extend([0] * num_tokens_to_pad)
+
             return DummyData(
                 seq_data=SequenceData.from_seqs(prompt_token_ids),
                 multi_modal_data=None,
                 multi_modal_placeholders=None,
             )
 
+        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
+
         return DummyData(
             seq_data=SequenceData.from_seqs(prompt_token_ids),
             multi_modal_data=mm_inputs["mm_kwargs"],
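Note on the API change above: the core of PATCH 1/2 is the new optional `disable_cache` keyword on `create_processor`, which falls back to `model_config.disable_mm_preprocessor_cache` when left unset, while the profiling call sites pass `disable_cache=True` so one-off dummy inputs never touch the shared processing cache. The following is a minimal standalone sketch of that fallback pattern, not the actual vLLM implementation; `ModelConfig`, `ProcessingCache`, and `create_processor_cache` here are simplified stand-ins for illustration only.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    # Simplified stand-in for vllm.config.ModelConfig; only the flag
    # consulted by the fallback logic is modelled here.
    disable_mm_preprocessor_cache: bool = False


class ProcessingCache(dict):
    """Placeholder for the registry's shared processing cache."""


_PROCESSING_CACHE = ProcessingCache()


def create_processor_cache(
    model_config: ModelConfig,
    *,
    disable_cache: Optional[bool] = None,
) -> Optional[ProcessingCache]:
    # When the caller does not specify `disable_cache`, fall back to the
    # per-model config flag, mirroring the patched create_processor().
    if disable_cache is None:
        disable_cache = model_config.disable_mm_preprocessor_cache

    # Profiling call sites pass disable_cache=True so synthetic dummy
    # inputs neither populate nor read the shared cache.
    return None if disable_cache else _PROCESSING_CACHE


if __name__ == "__main__":
    cfg = ModelConfig()
    assert create_processor_cache(cfg) is _PROCESSING_CACHE
    assert create_processor_cache(cfg, disable_cache=True) is None
```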