From d1fe607b2d5b354c51f6f5a5271ebde5914069a0 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 6 Mar 2025 07:00:29 +0000
Subject: [PATCH 1/2] [Core] Optimize multi-modal profiling

Signed-off-by: DarkLight1337
---
 vllm/inputs/registry.py      | 16 +++-------------
 vllm/multimodal/profiling.py |  5 -----
 vllm/multimodal/registry.py  | 16 ++++++++++++----
 3 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 691fcd7dc53f..87e3384e1097 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -331,7 +331,9 @@ def dummy_data_for_profiling(
 
         if mm_registry.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
-            processor = mm_registry.create_processor(model_config, tokenizer)
+            processor = mm_registry.create_processor(model_config,
+                                                     tokenizer,
+                                                     disable_cache=True)
             profiler = MultiModalProfiler(processor)
             dummy_data = profiler.get_dummy_data(
                 seq_len, is_encoder_data=is_encoder_data)
@@ -349,18 +351,6 @@ def dummy_data_for_profiling(
                                        _MultiModalCounts(mm_counts),
                                        **mm_processor_kwargs)
 
-        # Having more tokens is over-conservative but otherwise fine
-        num_tokens = dummy_data.seq_data.prompt_token_ids
-        if len(num_tokens) < seq_len:
-            if is_encoder_data:
-                logger.warning_once(
-                    f"Expected at least {seq_len} dummy encoder tokens for "
-                    f"profiling, but found {len(num_tokens)} tokens instead.")
-            else:
-                raise AssertionError(
-                    f"Expected at least {seq_len} dummy tokens for profiling, "
-                    f"but found {len(num_tokens)} tokens instead.")
-
         if (dummy_data.multi_modal_data is not None and
                 not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
             for k, v in dummy_data.multi_modal_data.items():
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 3178b0f8c3e6..c872aa5eddad 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -204,17 +204,12 @@ def get_dummy_data(
                 "and/or reduce `mm_counts`.", seq_len, total_len,
                 total_placeholders_by_modality)
 
-            num_tokens_to_pad = max(total_len, seq_len) - total_len
-            prompt_token_ids.extend([0] * num_tokens_to_pad)
-
             return DummyData(
                 seq_data=SequenceData.from_seqs(prompt_token_ids),
                 multi_modal_data=None,
                 multi_modal_placeholders=None,
             )
 
-        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
-
         return DummyData(
             seq_data=SequenceData.from_seqs(prompt_token_ids),
             multi_modal_data=mm_inputs["mm_kwargs"],
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 1882ffe9bf69..a9eb250cb877 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -257,7 +257,9 @@ def get_max_tokens_per_item_by_modality(
         """
         if self.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
-            processor = self.create_processor(model_config, tokenizer)
+            processor = self.create_processor(model_config,
+                                              tokenizer,
+                                              disable_cache=True)
             seq_len = model_config.max_model_len
             mm_limits = self.get_mm_limits_per_prompt(model_config)
             return processor.info.get_mm_max_tokens_per_item(
@@ -372,7 +374,9 @@ def get_mm_limits_per_prompt(
         """
         if self.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
-            processor = self.create_processor(model_config, tokenizer)
+            processor = self.create_processor(model_config,
+                                              tokenizer,
+                                              disable_cache=True)
             profiler = MultiModalProfiler(processor)
             return profiler.get_mm_limits()
 
@@ -433,6 +437,8 @@ def create_processor(
         self,
         model_config: "ModelConfig",
         tokenizer: AnyTokenizer,
+        *,
+        disable_cache: Optional[bool] = None,
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
@@ -440,11 +446,13 @@ def create_processor(
         See also:
             :ref:`mm-processing`
         """
+        if disable_cache is None:
+            disable_cache = model_config.disable_mm_preprocessor_cache
+
         model_cls = self._get_model_cls(model_config)
         factories = self._processor_factories[model_cls]
 
         ctx = InputProcessingContext(model_config, tokenizer)
-        cache = (None if model_config.disable_mm_preprocessor_cache else
-                 self._processing_cache)
+        cache = None if disable_cache else self._processing_cache
 
         return factories.build_processor(ctx, cache=cache)

From 6ba2d40a2ce9a33c8ab4ce69a506215cb7f7040d Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 6 Mar 2025 13:36:33 +0000
Subject: [PATCH 2/2] Revert padding

Signed-off-by: DarkLight1337
---
 vllm/inputs/registry.py      | 12 ++++++++++++
 vllm/multimodal/profiling.py |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 87e3384e1097..babfc4fb809c 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -351,6 +351,18 @@ def dummy_data_for_profiling(
                                        _MultiModalCounts(mm_counts),
                                        **mm_processor_kwargs)
 
+        # Having more tokens is over-conservative but otherwise fine
+        num_tokens = dummy_data.seq_data.prompt_token_ids
+        if len(num_tokens) < seq_len:
+            if is_encoder_data:
+                logger.warning_once(
+                    f"Expected at least {seq_len} dummy encoder tokens for "
+                    f"profiling, but found {len(num_tokens)} tokens instead.")
+            else:
+                raise AssertionError(
+                    f"Expected at least {seq_len} dummy tokens for profiling, "
+                    f"but found {len(num_tokens)} tokens instead.")
+
         if (dummy_data.multi_modal_data is not None and
                 not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)):
             for k, v in dummy_data.multi_modal_data.items():
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index c872aa5eddad..3178b0f8c3e6 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -204,12 +204,17 @@ def get_dummy_data(
                 "and/or reduce `mm_counts`.", seq_len, total_len,
                 total_placeholders_by_modality)
 
+            num_tokens_to_pad = max(total_len, seq_len) - total_len
+            prompt_token_ids.extend([0] * num_tokens_to_pad)
+
             return DummyData(
                 seq_data=SequenceData.from_seqs(prompt_token_ids),
                 multi_modal_data=None,
                 multi_modal_placeholders=None,
             )
 
+        prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
+
         return DummyData(
             seq_data=SequenceData.from_seqs(prompt_token_ids),
             multi_modal_data=mm_inputs["mm_kwargs"],
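Note on the API change above: the core of PATCH 1/2 is the new optional `disable_cache` keyword on `create_processor`, which falls back to `model_config.disable_mm_preprocessor_cache` when left unset, while the profiling call sites pass `disable_cache=True` so one-off dummy inputs never touch the shared processing cache. The following is a minimal standalone sketch of that fallback pattern, not the actual vLLM implementation; `ModelConfig`, `ProcessingCache`, and `create_processor_cache` here are simplified stand-ins for illustration only.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelConfig:
    # Simplified stand-in for vllm.config.ModelConfig; only the flag
    # consulted by the fallback logic is modelled here.
    disable_mm_preprocessor_cache: bool = False


class ProcessingCache(dict):
    """Placeholder for the registry's shared processing cache."""


_PROCESSING_CACHE = ProcessingCache()


def create_processor_cache(
    model_config: ModelConfig,
    *,
    disable_cache: Optional[bool] = None,
) -> Optional[ProcessingCache]:
    # When the caller does not specify `disable_cache`, fall back to the
    # per-model config flag, mirroring the patched create_processor().
    if disable_cache is None:
        disable_cache = model_config.disable_mm_preprocessor_cache

    # Profiling call sites pass disable_cache=True so synthetic dummy
    # inputs neither populate nor read the shared cache.
    return None if disable_cache else _PROCESSING_CACHE


if __name__ == "__main__":
    cfg = ModelConfig()
    assert create_processor_cache(cfg) is _PROCESSING_CACHE
    assert create_processor_cache(cfg, disable_cache=True) is None
```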