From 425d3c4dc1b21ebce2d864ec3a94ae97d7f8e7c5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 29 Dec 2024 13:23:10 +0000 Subject: [PATCH 01/40] batch Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_next.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a39f2f4124d0..5e70c11363c8 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -528,10 +528,8 @@ def _process_image_pixels( stacked_image_features = self._image_pixels_to_features( self.vision_tower, stacked_pixel_values) - return [ - self.multi_modal_projector(image_features) for image_features in - torch.split(stacked_image_features, num_patches_per_batch) - ] + return torch.split(self.multi_modal_projector(stacked_image_features), + num_patches_per_batch) def _process_image_input( self, From 8edcc8331967044825007984ab4298d868de22f9 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 04:35:07 +0000 Subject: [PATCH 02/40] blip2 Signed-off-by: Roger Wang --- vllm/model_executor/models/blip2.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 76b8505ee1c2..4e16ae522c9b 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -468,6 +468,9 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size new_token_ids += inputs["prompt_token_ids"] + placeholder_ranges = [ + PlaceholderRange(offset=0, length=image_feature_size) + ] new_prompt = inputs.get("prompt") if new_prompt is not None: @@ -475,7 +478,8 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper() From 5f7629114a7b46c337013ff821f2489453d0c893 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 07:11:11 +0000 Subject: [PATCH 03/40] chameleon Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 4 ++-- vllm/model_executor/models/chameleon.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 518505abeb2a..3ec203e3bea4 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -577,14 +577,14 @@ See [this page](#generative-models) for more information on how to use generativ - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ - - + - ✅︎ * - `ChameleonForConditionalGeneration` - Chameleon - T + I - `facebook/chameleon-7b` etc. 
- - ✅︎ - - + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a5..afca81f5d4fd 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -153,7 +153,8 @@ def input_processor_for_chameleon(ctx: InputContext, # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) class ChameleonLayerNorm(nn.LayerNorm): From 814f3bd2be02064d984df760a3daf0a7845d4647 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 07:11:20 +0000 Subject: [PATCH 04/40] fix util Signed-off-by: Roger Wang --- vllm/multimodal/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 87b12a6fb33c..7b6ded6a2708 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -400,15 +400,19 @@ def repeat_and_pad_placeholder_tokens( placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: + curr_repeat_count = repeat_count[placeholder_token_idx] replacement_ids = repeat_and_pad_token( placeholder_token_id, - repeat_count=repeat_count[placeholder_token_idx], + repeat_count=curr_repeat_count, pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + offset = len(new_token_ids) + if pad_token_left is not None: + offset += 1 placeholder_ranges.append({ - "offset": len(new_token_ids), - "length": len(replacement_ids) + "offset": offset, + "length": curr_repeat_count, }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 From efeb99991b5815ea49b8308d80c23b346a9da418 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 10:38:29 +0000 Subject: [PATCH 05/40] fuyu Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/fuyu.py | 61 +++++++++++++++++++++----- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3ec203e3bea4..7bb4da545cf4 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -591,7 +591,7 @@ See [this page](#generative-models) for more information on how to use generativ - `adept/fuyu-8b` etc. 
- - ✅︎ - - + - ✅︎ * - `ChatGLMModel` - GLM-4V - T + I diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6e86900326c4..02bc0af05325 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,7 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -61,6 +61,7 @@ class FuyuImagePixelInputs(TypedDict): Shape: (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) """ + image_input_ids: torch.Tensor def _calculate_num_image_tokens( @@ -177,7 +178,14 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): image_patch[0] for image_patch in model_image_input["image_patches"] ]) - new_multi_modal_data["image"] = image_patches + # dim0 is batch_size, dim1 is subseq_size which will always be 1 + image_input_ids: List[List[ + torch.Tensor]] = model_image_input["image_input_ids"] + image_input_ids = image_input_ids[0][0].tolist() + new_multi_modal_data["image"] = { + "image_patches": image_patches, + "image_input_ids": image_input_ids + } elif is_list_of(image_list, torch.Tensor): raise NotImplementedError("Embeddings input is not supported yet") @@ -188,10 +196,6 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): prompt = inputs.get("prompt") prompt_token_ids = inputs["prompt_token_ids"] tokenizer = cached_get_tokenizer(model_config.model) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() bos_token = tokenizer.encode("", add_special_tokens=False)[1:] boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] @@ -199,14 +203,21 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ 1:] + boa_token + placeholder_ranges = [ + PlaceholderRange(offset=0, length=len(image_input_ids)) + ] + return token_inputs(prompt=new_prompt, prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data) + multi_modal_data=new_multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) def input_mapper_for_fuyu(ctx: InputContext, data: object): model_config = ctx.model_config data_list = data if isinstance(data, list) else [data] + + # For profiling with dummy image data if is_list_of(data_list, Image.Image): # Fuyu's image_processor can also finish token padding image_processor: FuyuImageProcessor = cached_get_image_processor( @@ -217,9 +228,18 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): image_patch[0] for image_patch in model_image_input["image_patches"] ]) + image_input_ids = model_image_input["image_input_ids"][0][0] + return MultiModalKwargs({ + "pixel_values": data, + "image_input_ids": image_input_ids, + }) - # image has been processed with prompt in input processor - return MultiModalKwargs({"pixel_values": data}) + # For actual inference when image has been processed with + # prompt in input processor + return MultiModalKwargs({ + 
"pixel_values": data[0]["image_patches"], + "image_input_ids": data[0]["image_input_ids"], + }) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) @@ -282,7 +302,7 @@ def _validate_shape(d: torch.Tensor): def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) - + image_input_ids = kwargs.pop("image_input_ids", None) if pixel_values is not None: if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " @@ -292,6 +312,7 @@ def _parse_and_validate_image_input( type="pixel_values", data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), + image_input_ids=image_input_ids, ) return None @@ -301,7 +322,23 @@ def _process_image_input( assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - return vision_embeddings + hidden_size = vision_embeddings.shape[-1] + vision_embeddings = vision_embeddings.reshape(-1, hidden_size) + + # NOTE: image_input_ids contains both image placeholder tokens and + # newline tokens. + image_input_ids = image_input["image_input_ids"] + image_sizes = [ + len(input_ids_per_image) for input_ids_per_image in image_input_ids + ] + image_input_ids = torch.flatten(image_input_ids) + + image_token_mask = image_input_ids == _IMAGE_TOKEN_ID + full_vision_embeddings = self.language_model.get_input_embeddings( + image_input_ids) + full_vision_embeddings[image_token_mask] = vision_embeddings + + return torch.split(full_vision_embeddings, image_sizes) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) @@ -319,7 +356,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - _IMAGE_TOKEN_ID) + [_IMAGE_TOKEN_ID, _NEWLINE_TOKEN_ID]) return inputs_embeds def forward( From 5e568e8f6044102c9dbfcf4cf86fab29ca685d29 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:10:40 +0000 Subject: [PATCH 06/40] aria Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/aria.py | 115 +++++++++++++++++++------ 2 files changed, 90 insertions(+), 27 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 7bb4da545cf4..b82bb649e6b5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -570,7 +570,7 @@ See [this page](#generative-models) for more information on how to use generativ - `rhymes-ai/Aria` - - ✅︎ - - + - ✅︎ * - `Blip2ForConditionalGeneration` - BLIP-2 - T + IE diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 9437ad968842..a15672cec9c7 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,15 +1,18 @@ import math -from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn +from PIL import Image from torch.nn.init import trunc_normal_ from transformers import LlamaConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import INPUT_REGISTRY, token_inputs +from 
vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,8 +20,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - SamplingMetadata) +from vllm.model_executor.layers.sampler import (SamplerOutput, + SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) @@ -35,10 +38,12 @@ from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) +from vllm.utils import is_list_of from .utils import flatten_bn @@ -445,15 +450,74 @@ def build_mm_projector(config): ) -def get_max_multimodal_tokens(ctx): - return max(ctx.model_config.hf_config.image_size2tokens.values()) - - -def input_mapper_for_aria(ctx, data): - return MultiModalKwargs(data) - - -def input_processor(ctx, llm_inputs): +def get_aria_max_multimodal_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config() + image_size2tokens = { + int(math.sqrt(k) * hf_config.vision_config.patch_size): v + for k, v in hf_config.projector_patch_to_query_dict.items() + } + return max(image_size2tokens.values()) + + +def dummy_seq_data_for_aria(ctx: InputContext, seq_len: int, num_images: int): + image_feature_size = get_aria_max_multimodal_tokens(ctx) + hf_config = ctx.get_hf_config() + return SequenceData.from_prompt_token_counts( + (hf_config.image_token_index, image_feature_size * num_images), + (0, seq_len - image_feature_size * num_images), + ), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } + + +def dummy_image_for_aria( + ctx: InputContext, + num_images: int, +): + hf_config = ctx.get_hf_config() + max_image_size = hf_config.vision_config.image_size + image = Image.new("RGB", (max_image_size, max_image_size), color=0) + images = [image] * num_images + + return {"image": images} + + +def dummy_data_for_aria(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]): + num_images = mm_counts["image"] + seq_data, ranges = dummy_seq_data_for_aria(ctx, seq_len, num_images) + mm_data = dummy_image_for_aria(ctx, num_images) + return DummyData(seq_data, mm_data, ranges) + + +def input_mapper_for_aria(ctx: InputContext, data: object): + data_list = data if isinstance(data, list) else [data] + + # For profiling with dummy image data + if is_list_of(data_list, Image.Image): + hf_config = ctx.get_hf_config() + max_image_size = hf_config.vision_config.image_size + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, + trust_remote_code=model_config.trust_remote_code) + image_inputs = image_processor.preprocess( + data_list, + max_image_size=max_image_size, + 
split_image=False, + return_tensors="pt").data + image_inputs['pixel_values'] = image_inputs['pixel_values'].to( + ctx.model_config.dtype) + return MultiModalKwargs(image_inputs) + + # For actual inference when image has been processed with + # prompt in input processor + return MultiModalKwargs(data_list[0]) + + +def input_processor_for_aria(ctx: InputContext, llm_inputs: DecoderOnlyInputs): multi_modal_data = llm_inputs.get("multi_modal_data") # if it is pure text input, use it as is if multi_modal_data is None or "image" not in multi_modal_data: @@ -494,9 +558,12 @@ def input_processor(ctx, llm_inputs): repeat_count=num_crops, ) - repeat_count = [hf_config.image_size2tokens[max_image_size] - ] * sum(num_crops).item() - new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( + image_size2tokens = { + int(math.sqrt(k) * hf_config.vision_config.patch_size): v + for k, v in hf_config.projector_patch_to_query_dict.items() + } + repeat_count = [image_size2tokens[max_image_size]] * sum(num_crops).item() + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, None, prompt_token_ids, @@ -508,12 +575,14 @@ def input_processor(ctx, llm_inputs): prompt_token_ids=new_token_ids, prompt=new_prompt, multi_modal_data={"image": image_inputs}, + multi_modal_placeholders={"image": ranges}, ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_aria_max_multimodal_tokens) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_aria) +@INPUT_REGISTRY.register_input_processor(input_processor_for_aria) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. 
@@ -540,12 +609,6 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - # prepare the image_size to tokens mapping for the image preprocess, see - # input_processor - config.image_size2tokens = { - int(math.sqrt(k) * config.vision_config.patch_size): v - for k, v in config.projector_patch_to_query_dict.items() - } self.config = config self.vision_tower = AriaVisionModel(config.vision_config) self.multi_modal_projector = build_mm_projector(config) @@ -566,7 +629,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() def _validate_image_sizes( self, images: List[torch.Tensor]) -> List[torch.Tensor]: From 135fd5c324f55b158054a5c090584c4abca3aa36 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:10:52 +0000 Subject: [PATCH 07/40] fix profiling Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 509771b7e2e5..fdf8a7ba440f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -651,6 +651,8 @@ def profile_run(self) -> None: self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item + max_num_mm_items = min(self.max_num_reqs, max_num_mm_items) + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 From 0a8dbe0f0fd727e05b52b97442a020b9622ab3a0 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:11:54 +0000 Subject: [PATCH 08/40] update Signed-off-by: Roger Wang --- examples/offline_inference_vision_language.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 77af914a6ef0..068f29a21ff9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -28,6 +28,7 @@ def run_aria(question: str, modality: str): tokenizer_mode="slow", trust_remote_code=True, dtype="bfloat16", + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -191,8 +192,10 @@ def run_llava_next(question: str, modality: str): prompt = f"[INST] \n{question} [/INST]" llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + max_num_batched_tokens=32768, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 4}, + enable_prefix_caching=False) stop_token_ids = None return llm, prompt, stop_token_ids @@ -591,7 +594,7 @@ def main(args): # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, + sampling_params = SamplingParams(temperature=0, max_tokens=64, stop_token_ids=stop_token_ids) From 03f741d844a74f0f6f6e351194d4b00b92d89ff7 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:21:18 +0000 Subject: [PATCH 09/40] add llava-next Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index b82bb649e6b5..13143fe45f53 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V From 8bce94989afe81db7962dd6f8bb1a8d288cd4b12 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:23:54 +0000 Subject: [PATCH 10/40] revert testing code Signed-off-by: Roger Wang --- examples/offline_inference_vision_language.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 068f29a21ff9..efa9b7ac3807 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -192,10 +192,8 @@ def run_llava_next(question: str, modality: str): prompt = f"[INST] \n{question} [/INST]" llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_num_batched_tokens=32768, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - limit_mm_per_prompt={"image": 4}, - enable_prefix_caching=False) + max_model_len=8192, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids From bbde4140feb299b83112496aff3399fbf0c30aba Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:24:44 +0000 Subject: [PATCH 11/40] revert testing code Signed-off-by: Roger Wang --- examples/offline_inference_vision_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index efa9b7ac3807..93deec663c36 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -592,7 +592,7 @@ def main(args): # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0, + sampling_params = SamplingParams(temperature=0.2, max_tokens=64, stop_token_ids=stop_token_ids) From ea928c6e55ca346ef1b20894e5cbc49c928df0ba Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:46:13 +0000 Subject: [PATCH 12/40] tweak and clarify Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fdf8a7ba440f..6ed62415d998 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -647,11 +647,18 @@ def profile_run(self) -> None: self.mm_registry.get_max_tokens_per_item_by_modality( self.model_config).values()) - max_num_mm_items = min( + max_num_mm_items_encoder_budget = min( self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item - max_num_mm_items = min(self.max_num_reqs, max_num_mm_items) + max_mm_items_per_req = max( + self.mm_registry.get_mm_limits_per_prompt( + self.model_config).values()) + max_num_mm_items_decoder_budget = self.max_num_reqs * \ + max_mm_items_per_req + + max_num_mm_items = min(max_num_mm_items_encoder_budget, + max_num_mm_items_decoder_budget) # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we From 55eada7145eec6864404bdf7c31b991ab1833c23 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:58:23 +0000 Subject: [PATCH 13/40] clarify Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6ed62415d998..c8ba5e932a19 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -654,6 +654,10 @@ def profile_run(self) -> None: max_mm_items_per_req = max( self.mm_registry.get_mm_limits_per_prompt( self.model_config).values()) + + # NOTE: We do not consider max_num_batched_tokens on + # purpose because the image embeddings can be generated in + # advanced and chunked prefilled. max_num_mm_items_decoder_budget = self.max_num_reqs * \ max_mm_items_per_req From bbd57528af1894609821eaf7445be6e2e6850f50 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:59:32 +0000 Subject: [PATCH 14/40] reword Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c8ba5e932a19..a08a86d4007d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -655,9 +655,9 @@ def profile_run(self) -> None: self.mm_registry.get_mm_limits_per_prompt( self.model_config).values()) - # NOTE: We do not consider max_num_batched_tokens on - # purpose because the image embeddings can be generated in - # advanced and chunked prefilled. + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. 
max_num_mm_items_decoder_budget = self.max_num_reqs * \ max_mm_items_per_req From 0452b99b143eab5fc7c4596a9ad167a74bc1f022 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 16:48:44 +0000 Subject: [PATCH 15/40] Use merged multi-modal processor for blip2 and chameleon Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 4 + vllm/model_executor/models/blip.py | 74 ----------- vllm/model_executor/models/blip2.py | 139 ++++++++------------- vllm/model_executor/models/chameleon.py | 157 +++++++++++------------- vllm/multimodal/processing.py | 5 +- 5 files changed, 129 insertions(+), 250 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 1b2847ed0f53..43fb6e4e25e7 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -624,6 +624,10 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ + ("rhymes-ai/Aria", {"image"}), + ("Salesforce/blip2-opt-2.7b", {"image"}), + ("facebook/chameleon-7b", {"image"}), + ("adept/fuyu-8b", {"image"}), ("llava-hf/llava-1.5-7b-hf", {"image"}), ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), ("mistral-community/pixtral-12b", {"image"}), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 42a239cadac4..129a0bcecc86 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -8,18 +8,13 @@ from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import SequenceData def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -33,36 +28,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def get_blip_image_feature_size( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_num_patches(image_size=hf_config.image_size, - patch_size=hf_config.patch_size) - - -def get_max_blip_image_tokens( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_image_feature_size(hf_config) - - -def dummy_seq_data_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ) - - def dummy_image_for_blip( hf_config: Union[BlipVisionConfig, Blip2VisionConfig], num_images: int, @@ -80,45 +45,6 @@ def dummy_image_for_blip( return {"image": image if num_images == 1 else [image] * num_images} -def input_processor_for_blip( - 
model_config: ModelConfig, - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 4e16ae522c9b..c65acb85aa98 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,24 +4,25 @@ import torch import torch.nn as nn -from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, - apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2Processor, + Blip2QFormerConfig, apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import consecutive_placeholder_ranges -from vllm.sequence import IntermediateTensors, SequenceData - -from .blip import (BlipVisionModel, dummy_image_for_blip, - get_max_blip_image_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors + +from .blip import BlipVisionModel, dummy_image_for_blip from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -396,96 +397,60 @@ def forward( return sequence_output -def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: - return hf_config.num_query_tokens - - def get_max_blip2_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - - if isinstance(vision_config, Blip2VisionConfig): - return get_max_blip_image_tokens(vision_config) - - 
msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def dummy_seq_data_for_blip2( - hf_config: Blip2Config, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip2_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_data_for_blip2(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - seq_data, ranges = dummy_seq_data_for_blip2( - hf_config, - seq_len, - num_images, - image_token_id=BLIP2_IMAGE_TOKEN_ID, - ) - - if isinstance(vision_config, Blip2VisionConfig): - mm_data = dummy_image_for_blip(vision_config, num_images) + return hf_config.num_query_tokens - return DummyData(seq_data, mm_data, ranges) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) +class Blip2MultiModalProcessor(BaseMultiModalProcessor): + def _get_hf_processor(self) -> Blip2Processor: + return self.ctx.get_hf_processor(Blip2Processor) -def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - hf_config = ctx.get_hf_config(Blip2Config) - image_feature_size = get_blip2_image_feature_size(hf_config) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + max_image_tokens = get_max_blip2_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target="", # An empty target is never matched against + replacement="" * max_image_tokens, + ) + ] - # The original model places image tokens at the front - # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 - new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size - new_token_ids += inputs["prompt_token_ids"] - placeholder_ranges = [ - PlaceholderRange(offset=0, length=image_feature_size) - ] + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + num_images = mm_counts.get("image", 0) - new_prompt = inputs.get("prompt") - if new_prompt is not None: - new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + data = dummy_image_for_blip(vision_config, num_images) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) + return ProcessorInputs( + prompt_text="", + mm_data=data, + ) 
-@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) -@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index afca81f5d4fd..e027579cdb8e 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -6,13 +6,13 @@ import torch.nn.functional as F from PIL import Image from torch import nn -from transformers import ChameleonConfig, ChameleonVQVAEConfig +from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, + ChameleonVQVAEConfig) from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -29,11 +29,13 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP @@ -45,10 +47,6 @@ # and processor files, so we hardcode them in the model file for now. 
CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 CHAMELEON_IMAGE_SEQ_LENGTH = 1024 -CHAMELEON_IMAGE_TOKEN_ID = 8711 -CHAMELEON_IMAGE_START_TOKEN_ID = 8197 -CHAMELEON_IMAGE_END_TOKEN_ID = 8196 -CHAMELEON_SEP_TOKEN_ID = 8710 class ChameleonImagePixelInputs(TypedDict): @@ -61,28 +59,6 @@ def get_max_chameleon_image_tokens(ctx: InputContext): return CHAMELEON_IMAGE_SEQ_LENGTH -def dummy_seq_data_for_chameleon( - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - def dummy_image_for_chameleon( num_images: int, *, @@ -100,61 +76,70 @@ def dummy_image_for_chameleon( return {"image": image if num_images == 1 else [image] * num_images} -def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] +class ChameleonMultiModalProcessor(BaseMultiModalProcessor): + + def _get_hf_processor(self) -> ChameleonProcessor: + return self.ctx.get_hf_processor(ChameleonProcessor) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self._get_hf_processor() + + return [ + PromptReplacement( + modality="image", + target="", + replacement="".join([ + processor.image_start_token, + processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH, + processor.image_end_token, + ]), + ) + ] - seq_data, ranges = dummy_seq_data_for_chameleon( - seq_len, - num_images, - image_token_id=CHAMELEON_IMAGE_TOKEN_ID, - ) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) - mm_data = dummy_image_for_chameleon(num_images) - return DummyData(seq_data, mm_data, ranges) + data = dummy_image_for_chameleon(num_images) + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=data, + ) -def input_processor_for_chameleon(ctx: InputContext, - inputs: DecoderOnlyInputs): + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the image_start_token and image_end_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } - """ - Processing input prompt to insert required tokens for image placeholder. 
- - See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58 - """ # noqa - - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID, - repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH, - pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID, - pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID, - ) - - # Appending sep token for chat mode to follow default processor - # behavior - if new_prompt is not None: - new_prompt += tokenizer.sep_token - new_token_ids += [CHAMELEON_SEP_TOKEN_ID] - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) + return result class ChameleonLayerNorm(nn.LayerNorm): @@ -926,10 +911,8 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon) -@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon) +@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3ece0762e322..f7fb5d3bba51 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,7 @@ import pickle import re from abc import ABC, abstractmethod +from collections import defaultdict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache @@ -352,13 +353,13 @@ def _replace_matches( ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_item_counts} + next_idx_by_modality = defaultdict[str, int](lambda: 0) for match in _resolve_matches(prompt, matches): modality = match.modality item_idx = next_idx_by_modality[modality] - if item_idx >= mm_item_counts[modality]: + if item_idx >= mm_item_counts.get(modality, 0): continue start_idx = match.start_idx From 938c0bf8d48934100e1c7078d3b8c7d36b16ed19 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 17:20:25 +0000 Subject: [PATCH 16/40] Limit max num seqs Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 1 + tests/models/decoder_only/vision_language/test_models.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 93deec663c36..6480bda1ebdb 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -58,6 +58,7 @@ def run_chameleon(question: str, modality: str): prompt = f"{question}" llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, + max_num_seqs=2, 
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 1a9c1b4ef1be..f0bc1a14773d 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -179,6 +179,7 @@ test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, + max_num_seqs=2, auto_cls=AutoModelForVision2Seq, postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" From 6cc54a7a2d17477d0b989ae9d5dce2ddcb3d562e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 17:44:05 +0000 Subject: [PATCH 17/40] Update comments Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 6480bda1ebdb..69fe56b44124 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,11 +24,13 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" + # NOTE: Need L40 to run this llm = LLM(model=model_name, tokenizer_mode="slow", - trust_remote_code=True, dtype="bfloat16", + max_model_len=4096, max_num_seqs=2, + trust_remote_code=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -259,7 +261,7 @@ def run_minicpmv(question: str, modality: str): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 + # 2.6 model_name = "openbmb/MiniCPM-V-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -432,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" + # NOTE: Need L40 to run this llm = LLM( model=model_name, max_model_len=8192, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) From ba713ba2e33f71548fdbacbb08f08e27c12fb7e5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 17:48:35 +0000 Subject: [PATCH 18/40] Be more clear Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 69fe56b44124..b51bfae45526 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,7 +24,7 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" - # NOTE: Need L40 to run this + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, tokenizer_mode="slow", dtype="bfloat16", @@ -434,7 +434,7 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" - # NOTE: Need L40 to run this + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM( model=model_name, max_model_len=8192, From b0efc4fcfdfa60726f47d3dfe90caec5a55fdb18 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 03:54:35 +0000 Subject: [PATCH 19/40] Merged multi-modal processor for Aria Signed-off-by: DarkLight1337 --- .../vision_language/test_models.py | 5 +- 
vllm/model_executor/models/aria.py | 206 ++++++------------ .../models/idefics2_vision_model.py | 6 +- 3 files changed, 77 insertions(+), 140 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index f0bc1a14773d..30473e79f89f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -140,10 +140,7 @@ "aria": VLMTestInfo( models=["rhymes-ai/Aria"], tokenizer_mode="slow", - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - ), + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index a15672cec9c7..21a7e38d24a2 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -6,13 +6,12 @@ import torch.nn as nn from PIL import Image from torch.nn.init import trunc_normal_ -from transformers import LlamaConfig +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -25,27 +24,22 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.idefics2_vision_model import ( - Idefics2VisionTransformer) -from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, - LlamaModel) -from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, - maybe_prefix, - merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) -from vllm.utils import is_list_of -from .utils import flatten_bn +from .idefics2_vision_model import Idefics2VisionTransformer +from .interfaces import SupportsMultiModal +from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + is_pp_missing_parameter, maybe_prefix, + merge_multimodal_embeddings) class AriaImagePixelInputs(TypedDict): @@ -256,7 +250,7 @@ def forward(self, x, 
attn_mask=None): class AriaFusedMoE(FusedMoE): def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - shard_id: str) -> Set[str]: + shard_id: str) -> None: # Override the weight_loader to handle the expert weights in the Aria # model, which are already packed with experts, and merge the gate and # up weights for each expert. @@ -351,7 +345,7 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: LlamaConfig, + config: AriaMoELMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -439,7 +433,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config): +def build_mm_projector(config: PretrainedConfig): return AriaProjector( patch_to_query_dict=config.projector_patch_to_query_dict, embed_dim=config.vision_config.hidden_size, @@ -450,7 +444,7 @@ def build_mm_projector(config): ) -def get_aria_max_multimodal_tokens(ctx: InputContext): +def get_max_aria_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config() image_size2tokens = { int(math.sqrt(k) * hf_config.vision_config.patch_size): v @@ -459,130 +453,69 @@ def get_aria_max_multimodal_tokens(ctx: InputContext): return max(image_size2tokens.values()) -def dummy_seq_data_for_aria(ctx: InputContext, seq_len: int, num_images: int): - image_feature_size = get_aria_max_multimodal_tokens(ctx) - hf_config = ctx.get_hf_config() - return SequenceData.from_prompt_token_counts( - (hf_config.image_token_index, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - def dummy_image_for_aria( - ctx: InputContext, + vision_config: AriaVisionConfig, num_images: int, ): - hf_config = ctx.get_hf_config() - max_image_size = hf_config.vision_config.image_size + max_image_size = vision_config.image_size image = Image.new("RGB", (max_image_size, max_image_size), color=0) images = [image] * num_images return {"image": images} -def dummy_data_for_aria(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_aria(ctx, seq_len, num_images) - mm_data = dummy_image_for_aria(ctx, num_images) - return DummyData(seq_data, mm_data, ranges) - - -def input_mapper_for_aria(ctx: InputContext, data: object): - data_list = data if isinstance(data, list) else [data] - - # For profiling with dummy image data - if is_list_of(data_list, Image.Image): - hf_config = ctx.get_hf_config() - max_image_size = hf_config.vision_config.image_size - model_config = ctx.model_config - image_processor = cached_get_image_processor( - model_config.model, - trust_remote_code=model_config.trust_remote_code) - image_inputs = image_processor.preprocess( - data_list, - max_image_size=max_image_size, - split_image=False, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - return MultiModalKwargs(image_inputs) - - # For actual inference when image has been processed with - # prompt in input processor - return MultiModalKwargs(data_list[0]) - - -def input_processor_for_aria(ctx: InputContext, llm_inputs: DecoderOnlyInputs): - multi_modal_data = llm_inputs.get("multi_modal_data") - # if it is pure text input, use it as is - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs - - model_config = ctx.model_config - - 
tokenizer = cached_get_tokenizer(model_config.tokenizer) - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - hf_config = model_config.hf_config - - # prepare image tokens, the max_image_size is used to determine the number - # of patch_size for every image - max_image_size = multi_modal_data.pop("max_image_size", 980) - _split_image = multi_modal_data.pop("split_image", False) - - assert isinstance(max_image_size, - (int, float)), "max_image_size should be float or int" - images = (multi_modal_data["image"] if isinstance( - multi_modal_data["image"], list) else [multi_modal_data["image"]]) - - image_inputs = image_processor.preprocess(images, - max_image_size=max_image_size, - split_image=_split_image, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - num_crops = image_inputs.pop("num_crops") - - prompt_token_ids = llm_inputs["prompt_token_ids"] - if num_crops.sum().item() > 0: - _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=num_crops, +class AriaMultiModalProcessor(BaseMultiModalProcessor): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_mask=MultiModalFieldConfig.batched("image"), ) - image_size2tokens = { - int(math.sqrt(k) * hf_config.vision_config.patch_size): v - for k, v in hf_config.projector_patch_to_query_dict.items() - } - repeat_count = [image_size2tokens[max_image_size]] * sum(num_crops).item() - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=repeat_count, - ) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config() + image_token_id = hf_config.image_token_index + + max_image_tokens = get_max_aria_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * max_image_tokens, + ) + ] - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data={"image": image_inputs}, - multi_modal_placeholders={"image": ranges}, - ) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config() + vision_config = hf_config.vision_config + num_images = mm_counts.get("image", 0) + + data = dummy_image_for_aria(vision_config, num_images) + hf_processor = self._get_hf_processor() + image_token = hf_processor.image_token # type: ignore -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_aria_max_multimodal_tokens) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor_for_aria) + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=data, + ) + + +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, 
SupportsMultiModal): """ Aria model for conditional generation tasks. @@ -651,7 +584,12 @@ def _parse_and_validate_image_input( pixel_values = self._validate_image_sizes(pixel_values) pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + if not isinstance(pixel_mask, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel mask. " + f"Got type: {type(pixel_mask)}") + pixel_mask = flatten_bn(pixel_mask, concat=True) return AriaImagePixelInputs( diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index e430a158d869..4e42a4b6f9e6 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -69,7 +69,8 @@ def forward(self, patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape - patch_embeds = self.patch_embedding(pixel_values) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(target_dtype)) embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = ( max_im_h // self.patch_size, @@ -309,7 +310,8 @@ def forward( hidden_states = self.embeddings( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, - tgt_sizes=tgt_sizes) + tgt_sizes=tgt_sizes, + ) encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state From cdbd96986a739b401d8b65f69a4dd57685f8b139 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 16:00:17 +0800 Subject: [PATCH 20/40] initialize fuyu merged processor Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 253 +++++++++++------------------ 1 file changed, 92 insertions(+), 161 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 02bc0af05325..242410d9d554 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -15,7 +15,6 @@ # limitations under the License. 
""" PyTorch Fuyu model.""" import math -from array import array from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict) @@ -23,24 +22,22 @@ import torch.nn as nn import torch.utils.checkpoint from PIL import Image -from transformers import FuyuImageProcessor +from transformers import BatchFeature, FuyuProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) -from vllm.utils import is_list_of +from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, + MultiModalFieldConfig, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -96,156 +93,90 @@ def get_max_fuyu_image_tokens(ctx: InputContext): return (ncol + 1) * nrow -def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): - ncol, nrow = get_max_fuyu_image_feature_size() - image_feature_size = get_max_fuyu_image_tokens(ctx) - - image_token_ids = ( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_fuyu( - num_images: int, - *, - image_width: int, - image_height: int, -): - image = Image.new("RGB", (image_width, image_height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) - mm_data = dummy_image_for_fuyu(num_images, - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return DummyData(seq_data, mm_data, ranges) - - -def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: List[Image.Image]): - image_encoding = image_processor.preprocess(data, return_tensors="pt") - batch_images = torch.stack([img[0] for img in image_encoding["images"] - ]).unsqueeze(1) - image_unpadded_heights = torch.tensor( - image_encoding["image_unpadded_heights"]) - image_unpadded_widths = torch.tensor( - image_encoding["image_unpadded_widths"]) - - batch_size = len(image_encoding["images"]) - image_present = torch.ones(batch_size, 1, 1) - model_image_input = 
image_processor.preprocess_with_tokenizer_info( - image_input=batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=_IMAGE_TOKEN_ID, - image_newline_id=_NEWLINE_TOKEN_ID, - variable_sized=True, - ) - return model_image_input - - -def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - image_data = multi_modal_data["image"] - new_multi_modal_data = {} - image_list = image_data if isinstance(image_data, list) else [image_data] - - # process image data - if is_list_of(image_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, image_data) - image_patches = torch.cat([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() - new_multi_modal_data["image"] = { - "image_patches": image_patches, - "image_input_ids": image_input_ids - } - - elif is_list_of(image_list, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - # process prompts - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - tokenizer = cached_get_tokenizer(model_config.model) - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] - - new_prompt = prompt + "\x04" - new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ - 1:] + boa_token - - placeholder_ranges = [ - PlaceholderRange(offset=0, length=len(image_input_ids)) - ] - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) - - -def input_mapper_for_fuyu(ctx: InputContext, data: object): - model_config = ctx.model_config - data_list = data if isinstance(data, list) else [data] - - # For profiling with dummy image data - if is_list_of(data_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, data_list) - data = torch.stack([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - image_input_ids = model_image_input["image_input_ids"][0][0] - return MultiModalKwargs({ - "pixel_values": data, - "image_input_ids": image_input_ids, - }) - - # For actual inference when image has been processed with - # prompt in input processor - return MultiModalKwargs({ - "pixel_values": data[0]["image_patches"], - "image_input_ids": data[0]["image_input_ids"], - }) - - -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) +class FuyuMultiModalProcessor(BaseMultiModalProcessor): + + def _get_hf_processor(self) -> FuyuProcessor: + return self.ctx.get_hf_processor(FuyuProcessor) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, 
object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + tokenizer = self._get_tokenizer() + processed_outputs = super()._call_hf_processor(prompt, mm_data, + mm_kwargs) + if "image_patches" in processed_outputs: + # separate image_input_ids from input_ids if has image inputs + new_prompt = tokenizer.decode(processed_outputs["input_ids"][0], + skip_special_tokens=True) + image_prompt = new_prompt.split("")[0] + # we can't set add_special_tokens=False here, because placeholder + # and newline are all special tokens + image_input_ids = tokenizer.encode(image_prompt, + return_tensors="pt") + # Drop begin token since it doesn't belong to image_input_ids + processed_outputs["image_input_ids"] = image_input_ids[:, 2:] + processed_outputs["pixel_values"] = processed_outputs.pop( + "image_patches") + else: + # FuyuProcessor won't add bos and boa if no images inputs, we add + # them back manually + bos_token = tokenizer.encode("", add_special_tokens=False)[1:] + boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] + prompt_ids = tokenizer.encode( + prompt, + add_special_tokens=False, # type: ignore + ) + prompt_ids = bos_token + prompt_ids + boa_token + processed_outputs["input_ids"] = torch.tensor([prompt_ids]) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_input_ids=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + image_input_ids = out_mm_kwargs.get("image_input_ids", []) + if isinstance(image_input_ids, torch.Tensor): + image_input_ids = image_input_ids.squeeze(0).tolist() + return [ + PromptReplacement( + modality="image", + target="", + replacement=image_input_ids, + ) + ] + + def _get_dummy_mm_inputs(self, mm_counts): + num_images = mm_counts.get("image", 0) + image = Image.new( + "RGB", + (MAX_IMAGE_FEATURE_SIZE_WIDTH, MAX_IMAGE_FEATURE_SIZE_HEIGHT), + color=0, + ) + mm_data = dict(image=image if num_images == 1 else [image] * + num_images) + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu) -@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -312,7 +243,7 @@ def _parse_and_validate_image_input( type="pixel_values", data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), - image_input_ids=image_input_ids, + image_input_ids=flatten_bn(image_input_ids), ) return None From 48c694623fb20a65322c3ad1565a1588e4cbec79 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:15:25 +0000 Subject: [PATCH 21/40] Clean up Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aria.py | 4 +-- vllm/model_executor/models/blip2.py | 34 ++++++++++++++---- vllm/model_executor/models/chameleon.py | 4 +-- vllm/model_executor/models/fuyu.py | 48 +++++++++++++------------ 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/models/aria.py 
b/vllm/model_executor/models/aria.py index 21a7e38d24a2..69587aa5ddb7 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -500,13 +500,13 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config() - vision_config = hf_config.vision_config + vision_config: AriaVisionConfig = hf_config.vision_config num_images = mm_counts.get("image", 0) data = dummy_image_for_aria(vision_config, num_images) hf_processor = self._get_hf_processor() - image_token = hf_processor.image_token # type: ignore + image_token: str = hf_processor.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c65acb85aa98..c6456310b7df 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,8 +15,9 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -29,8 +30,7 @@ # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo -BLIP2_IMAGE_TOKEN = "" -BLIP2_IMAGE_TOKEN_ID = 50265 +_IMAGE_TOKEN_ID = 50265 class Blip2ImagePixelInputs(TypedDict): @@ -428,11 +428,31 @@ def _get_prompt_replacements( return [ PromptReplacement( modality="image", - target="", # An empty target is never matched against - replacement="" * max_image_tokens, + target="", + replacement="" * max_image_tokens + "", ) ] + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the trailing bos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], @@ -596,7 +616,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - BLIP2_IMAGE_TOKEN_ID) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e027579cdb8e..9f8d84673361 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -86,7 +86,7 @@ def _get_mm_fields_config( hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) + return dict(pixel_values=MultiModalFieldConfig.batched("image")) def _get_prompt_replacements( self, @@ -722,7 +722,7 @@ def forward(self, pixel_values: torch.Tensor): for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): hidden_state 
= self.down[i_level].block[i_block]( - hidden_states[-1], ) + hidden_states[-1]) if len(self.down[i_level].attn) > 0: hidden_state = self.down[i_level].attn[i_block]( hidden_state) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 242410d9d554..0fe72396bf6d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -56,40 +56,41 @@ class FuyuImagePixelInputs(TypedDict): data: torch.Tensor """ Shape: - (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) + `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` """ image_input_ids: torch.Tensor -def _calculate_num_image_tokens( - height: int, - width: int, +def _get_fuyu_num_image_tokens( + image_height: int, + image_width: int, ) -> Tuple[int, int]: """ - calculate number of image tokens needed for a given image size - The expected Fuyu image prompts is in format: + Calculate the number of image tokens needed for a given image size. + + The expected Fuyu image prompts can be expressed as: + + .. code-block:: (image_token * ncols + newline_token) * nrows - args: - image_size: Tuple[int, int] - (width, height) of the image - returns: - ncols: int - number of image tokens in x direction - nrows: int - number of image tokens in y direction + + Args: + image_size: Tuple[int, int] - `(width, height)` of the image + + Returns: + ncols: int - number of image tokens in `x` direction + nrows: int - number of image tokens in `y` direction """ - ncol = math.ceil(width / 30) - nrow = math.ceil(height / 30) + ncol = math.ceil(image_width / 30) + nrow = math.ceil(image_height / 30) return ncol, nrow -def get_max_fuyu_image_feature_size(): - - return _calculate_num_image_tokens( - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, +def get_max_fuyu_image_tokens(ctx: InputContext): + ncol, nrow = _get_fuyu_num_image_tokens( + image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, ) - -def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = get_max_fuyu_image_feature_size() return (ncol + 1) * nrow @@ -162,13 +163,14 @@ def _get_prompt_replacements( def _get_dummy_mm_inputs(self, mm_counts): num_images = mm_counts.get("image", 0) + image = Image.new( "RGB", (MAX_IMAGE_FEATURE_SIZE_WIDTH, MAX_IMAGE_FEATURE_SIZE_HEIGHT), color=0, ) - mm_data = dict(image=image if num_images == 1 else [image] * - num_images) + mm_data = {"image": image if num_images == 1 else [image] * num_images} + return ProcessorInputs( prompt_text="", mm_data=mm_data, From ea767599e92319a0b1096a29b569d5c112c1e3b0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:15:47 +0000 Subject: [PATCH 22/40] Clean up Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0fe72396bf6d..b2432e766e1d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -80,18 +80,18 @@ def _get_fuyu_num_image_tokens( ncols: int - number of image tokens in `x` direction nrows: int - number of image tokens in `y` direction """ - ncol = math.ceil(image_width / 30) - nrow = math.ceil(image_height / 30) - return ncol, nrow + ncols = math.ceil(image_width / 30) + nrows = math.ceil(image_height / 30) + return ncols, nrows def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = _get_fuyu_num_image_tokens( + ncols, nrows = 
_get_fuyu_num_image_tokens( image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, ) - return (ncol + 1) * nrow + return (ncols + 1) * nrows class FuyuMultiModalProcessor(BaseMultiModalProcessor): From bc976a7a1f125939f497c3876f3195802059a83c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:30:03 +0000 Subject: [PATCH 23/40] Try remove mark Signed-off-by: DarkLight1337 --- tests/models/decoder_only/vision_language/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 30473e79f89f..7db08166826e 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -199,7 +199,6 @@ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - marks=[large_gpu_mark(min_gb=48)], ), "glm4": VLMTestInfo( models=["THUDM/glm-4v-9b"], From f79f79a70dc492f9f075843f1c4610d15344915c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:47:12 +0000 Subject: [PATCH 24/40] Consolidate dummy data code Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aria.py | 23 ++++++--------- vllm/model_executor/models/blip.py | 18 ------------ vllm/model_executor/models/blip2.py | 13 +++++++-- vllm/model_executor/models/chameleon.py | 29 ++++++------------- vllm/model_executor/models/fuyu.py | 19 ++++++------ vllm/model_executor/models/qwen2_audio.py | 14 +++++---- vllm/model_executor/models/qwen2_vl.py | 17 +++++------ vllm/model_executor/models/ultravox.py | 13 +++++---- vllm/multimodal/processing.py | 35 +++++++++++++++++++++-- 9 files changed, 95 insertions(+), 86 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 69587aa5ddb7..0648d98ac405 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -4,7 +4,6 @@ import torch import torch.nn as nn -from PIL import Image from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig @@ -453,17 +452,6 @@ def get_max_aria_image_tokens(ctx: InputContext): return max(image_size2tokens.values()) -def dummy_image_for_aria( - vision_config: AriaVisionConfig, - num_images: int, -): - max_image_size = vision_config.image_size - image = Image.new("RGB", (max_image_size, max_image_size), color=0) - images = [image] * num_images - - return {"image": images} - - class AriaMultiModalProcessor(BaseMultiModalProcessor): def _get_mm_fields_config( @@ -501,16 +489,23 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config() vision_config: AriaVisionConfig = hf_config.vision_config + + max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) - data = dummy_image_for_aria(vision_config, num_images) + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } hf_processor = self._get_hf_processor() image_token: str = hf_processor.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 129a0bcecc86..987dfaf44f22 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,7 +4,6 @@ import torch import 
torch.nn as nn -from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention @@ -28,23 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def dummy_image_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c6456310b7df..bf70f5d904f5 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -23,7 +23,7 @@ PromptReplacement) from vllm.sequence import IntermediateTensors -from .blip import BlipVisionModel, dummy_image_for_blip +from .blip import BlipVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -459,13 +459,20 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(Blip2Config) vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) - data = dummy_image_for_blip(vision_config, num_images) + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } return ProcessorInputs( prompt_text="", - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 9f8d84673361..85fca23b0574 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -3,9 +3,8 @@ Tuple, TypedDict, Union) import torch +import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torch import nn from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, ChameleonVQVAEConfig) @@ -59,23 +58,6 @@ def get_max_chameleon_image_tokens(ctx: InputContext): return CHAMELEON_IMAGE_SEQ_LENGTH -def dummy_image_for_chameleon( - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = CHAMELEON_CROP_SIZE_WIDTH - height = CHAMELEON_CROP_SIZE_HEIGHT - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - class ChameleonMultiModalProcessor(BaseMultiModalProcessor): def _get_hf_processor(self) -> ChameleonProcessor: @@ -114,11 +96,16 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - data = dummy_image_for_chameleon(num_images) + mm_data = { + "image": + self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH, + height=CHAMELEON_CROP_SIZE_HEIGHT, + num_images=num_images) + } return ProcessorInputs( 
prompt_text="" * num_images, - mm_data=data, + mm_data=mm_data, ) def apply( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b2432e766e1d..d9c91234ac2b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -20,8 +20,6 @@ import torch import torch.nn as nn -import torch.utils.checkpoint -from PIL import Image from transformers import BatchFeature, FuyuProcessor from vllm.attention import AttentionMetadata @@ -161,15 +159,18 @@ def _get_prompt_replacements( ) ] - def _get_dummy_mm_inputs(self, mm_counts): + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - image = Image.new( - "RGB", - (MAX_IMAGE_FEATURE_SIZE_WIDTH, MAX_IMAGE_FEATURE_SIZE_HEIGHT), - color=0, - ) - mm_data = {"image": image if num_images == 1 else [image] * num_images} + mm_data = { + "image": + self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + num_images=num_images) + } return ProcessorInputs( prompt_text="", diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 25a351bd9c65..deb6987e7f16 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,6 @@ from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn from transformers import BatchFeature @@ -181,16 +180,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|AUDIO|>" * audio_count, - mm_data=data, + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 574845ef5a52..a84adcbca962 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -30,7 +30,6 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from PIL import Image from transformers import BatchFeature from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) @@ -891,12 +890,10 @@ def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token image_processor = _get_image_processor(hf_processor) - data = {} + image_token: str = hf_processor.image_token resized_height, resized_width = smart_resize( height=9999999, width=9999999, @@ -904,14 +901,18 @@ def _get_dummy_mm_inputs( min_pixels=image_processor.min_pixels, max_pixels=image_processor.max_pixels, ) + num_images = mm_counts.get("image", 0) - dummy_image = Image.new("RGB", (resized_width, resized_height), - color=0) - data["image"] = [dummy_image] * num_images + mm_data = { + "image": + self._get_dummy_images(width=resized_width, + height=resized_height, + num_images=num_images) + } return ProcessorInputs( prompt_text=image_token * 
num_images, - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7b4aeeec5f40..8234ce62fb49 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -192,16 +192,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|audio|>" * audio_count, - mm_data=data, + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index f7fb5d3bba51..96812f42b864 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -8,9 +8,10 @@ from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np +import numpy.typing as npt import torch from blake3 import blake3 -from PIL.Image import Image +from PIL import Image from transformers import BatchFeature, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext @@ -513,7 +514,7 @@ def _serialize_item(self, obj: object) -> bytes: return obj.encode("utf-8") if isinstance(obj, bytes): return obj - if isinstance(obj, Image): + if isinstance(obj, Image.Image): return obj.tobytes() # Convertible to NumPy arrays @@ -1007,6 +1008,36 @@ def apply( mm_placeholders=mm_placeholders, ) + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + @abstractmethod def _get_dummy_mm_inputs( self, From 45ec10cb61d403c1a0a5b89ba5e27e1212bcc8ec Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 16:58:49 +0800 Subject: [PATCH 25/40] fix fuyu variant images test Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d9c91234ac2b..e1c63a511b0e 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -36,6 +36,7 @@ MultiModalFieldConfig, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors +from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -117,8 +118,10 @@ def _call_hf_processor( return_tensors="pt") # Drop begin token since it doesn't belong to image_input_ids processed_outputs["image_input_ids"] = image_input_ids[:, 2:] - processed_outputs["pixel_values"] = processed_outputs.pop( - "image_patches") + processed_outputs["pixel_values"] = [ + image_patch[0] + for 
image_patch in processed_outputs.pop("image_patches") + ] else: # FuyuProcessor won't add bos and boa if no images inputs, we add # them back manually @@ -265,6 +268,8 @@ def _process_image_input( image_sizes = [ len(input_ids_per_image) for input_ids_per_image in image_input_ids ] + if is_list_of(image_input_ids, torch.Tensor): + image_input_ids = torch.cat(image_input_ids) image_input_ids = torch.flatten(image_input_ids) image_token_mask = image_input_ids == _IMAGE_TOKEN_ID From 0fe561d45bff05a2cd1beef31632e0ef0149bbf4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 09:01:19 +0000 Subject: [PATCH 26/40] Fix some type errors in Pixtral-HF Signed-off-by: DarkLight1337 --- vllm/model_executor/models/pixtral.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 22d29f5bbc50..2bce13792a88 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,8 +1,8 @@ +import math from dataclasses import dataclass, fields from functools import cached_property from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union -import numpy import torch import torch.nn as nn import torch.nn.functional as F @@ -306,7 +306,7 @@ def _parse_and_validate_image_input( images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> Optional[List[torch.Tensor]]: + ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -604,11 +604,11 @@ def max_patches_per_side(self) -> int: return self.args.image_size // self.args.patch_size @property - def device(self) -> torch.device: + def device(self) -> torch.types.Device: return next(self.parameters()).device @property - def dtype(self) -> torch.device: + def dtype(self) -> torch.dtype: return next(self.parameters()).dtype @property @@ -741,8 +741,8 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, ratio = max(image_width / max_width, image_height / max_height) if ratio > 1: - image_width = int(numpy.ceil(image_width / ratio)) - image_height = int(numpy.ceil(image_height / ratio)) + image_width = int(math.ceil(image_width / ratio)) + image_height = int(math.ceil(image_height / ratio)) num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( (image_height, image_width), From 3512ed6034d19094882980ee659a63145a6cceb9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 17:08:28 +0800 Subject: [PATCH 27/40] fix missing flatten_bn in fuyu Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index e1c63a511b0e..88ba9cede5f9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -248,7 +248,7 @@ def _parse_and_validate_image_input( return FuyuImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + flatten_bn(flatten_bn(pixel_values), concat=True)), image_input_ids=flatten_bn(image_input_ids), ) From 5e0f66c3712f7c22bd365debfb2a6d2873fb7dbf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 09:23:11 +0000 Subject: [PATCH 28/40] Update docs Signed-off-by: DarkLight1337 --- vllm/multimodal/processing.py | 28 
+++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index a549180e2cb5..44238262704e 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -675,10 +675,14 @@ def _get_prompt_replacements( Given the original multi-modal items for this modality and HF-processed data, output the replacements to perform. - Note: - Even when the HF processor already performs replacement for us, - we still use this replacement information to determine - the placeholder token positions for each multi-modal item. + Notes: + - You should not assume that HF processor always performs prompt + replacement: in :meth:`_apply_hf_processor_missing`, this method + is called on text-only and multimodal-only inputs separately, + instead of passing them in the same call. + - The replacement information returned by this method is also used + to determine the placeholder token positions for each multi-modal + item. """ raise NotImplementedError @@ -712,6 +716,10 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + """ + Call the HF processor on the prompt text and + associated multi-modal data. + """ return self.ctx.call_hf_processor( self._get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), @@ -725,7 +733,8 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Apply the HF processor on the full prompt text and multi-modal data. + Wrapper of :meth:`_call_hf_processor` that applies + additional pre-processing and post-processing. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -756,10 +765,11 @@ def _apply_hf_processor_missing( Apply the HF processor on the full prompt text, but only on the multi-modal data that are missing from the cache. - Note: We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - for the full text. + Note: + We pass prompt text and multi-modal data into the HF processor + in separate calls to avoid HF prompt replacement being done for + cached items; instead, we rely on our own prompt replacement logic + for the full text. """ mm_missing_counts = mm_missing_data_items.get_all_counts() From 1c243abe2ad7319c6bd4f4f714394395a0c93bd4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 09:24:51 +0000 Subject: [PATCH 29/40] Update docs Signed-off-by: DarkLight1337 --- vllm/multimodal/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 44238262704e..7712c3bcebe2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -769,7 +769,7 @@ def _apply_hf_processor_missing( We pass prompt text and multi-modal data into the HF processor in separate calls to avoid HF prompt replacement being done for cached items; instead, we rely on our own prompt replacement logic - for the full text. + (:meth:`_get_prompt_replacements`) for the full text. 
""" mm_missing_counts = mm_missing_data_items.get_all_counts() From 09d64f46e9015c71a8d1e0229a2fe5a524294942 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:16:39 +0000 Subject: [PATCH 30/40] Get fuyu processor tests to pass Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 33 +++--- vllm/model_executor/models/fuyu.py | 158 +++++++++++++++------------- vllm/model_executor/models/llava.py | 4 +- 3 files changed, 100 insertions(+), 95 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 43fb6e4e25e7..f51f20451382 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -528,7 +528,7 @@ def _rand_audio( def _test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -583,9 +583,8 @@ def _test_processing_cache_correctness( partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), } input_max_count = { - "image": 3, - "video": 3, - "audio": 3, + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() } for batch_idx in range(num_batches): @@ -624,16 +623,16 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image"}), - ("Salesforce/blip2-opt-2.7b", {"image"}), - ("facebook/chameleon-7b", {"image"}), - ("adept/fuyu-8b", {"image"}), - ("llava-hf/llava-1.5-7b-hf", {"image"}), - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), - ("mistral-community/pixtral-12b", {"image"}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}), - ("fixie-ai/ultravox-v0_3", {"audio"}), + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", [("image", False)]), + ("facebook/chameleon-7b", {"image": True}), + ("adept/fuyu-8b", [("image", False)]), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -641,7 +640,7 @@ def _test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -657,7 +656,7 @@ def test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image"}), + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -665,7 +664,7 @@ def test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness_phi3v( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 88ba9cede5f9..0c8d4a982167 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -20,7 +20,7 @@ import torch import torch.nn as nn -from transformers import BatchFeature, FuyuProcessor 
+from transformers import BatchFeature, FuyuConfig, FuyuProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -29,14 +29,15 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, - MultiModalFieldConfig, ProcessorInputs, + MultiModalDataItems, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -50,14 +51,13 @@ MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 -class FuyuImagePixelInputs(TypedDict): - type: Literal["pixel_values"] +class FuyuImagePatchInputs(TypedDict): + type: Literal["image_patches"] data: torch.Tensor """ Shape: `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` """ - image_input_ids: torch.Tensor def _get_fuyu_num_image_tokens( @@ -104,35 +104,26 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - tokenizer = self._get_tokenizer() - processed_outputs = super()._call_hf_processor(prompt, mm_data, - mm_kwargs) - if "image_patches" in processed_outputs: - # separate image_input_ids from input_ids if has image inputs - new_prompt = tokenizer.decode(processed_outputs["input_ids"][0], - skip_special_tokens=True) - image_prompt = new_prompt.split("")[0] - # we can't set add_special_tokens=False here, because placeholder - # and newline are all special tokens - image_input_ids = tokenizer.encode(image_prompt, - return_tensors="pt") - # Drop begin token since it doesn't belong to image_input_ids - processed_outputs["image_input_ids"] = image_input_ids[:, 2:] - processed_outputs["pixel_values"] = [ - image_patch[0] - for image_patch in processed_outputs.pop("image_patches") - ] - else: - # FuyuProcessor won't add bos and boa if no images inputs, we add - # them back manually - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] - prompt_ids = tokenizer.encode( - prompt, - add_special_tokens=False, # type: ignore - ) - prompt_ids = bos_token + prompt_ids + boa_token - processed_outputs["input_ids"] = torch.tensor([prompt_ids]) + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + return processed_outputs def _get_mm_fields_config( @@ -140,10 +131,7 @@ def _get_mm_fields_config( 
hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_input_ids=MultiModalFieldConfig.batched("image"), - ) + return dict(image_patches=MultiModalFieldConfig.batched("image")) def _get_prompt_replacements( self, @@ -151,17 +139,54 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - image_input_ids = out_mm_kwargs.get("image_input_ids", []) - if isinstance(image_input_ids, torch.Tensor): - image_input_ids = image_input_ids.squeeze(0).tolist() + hf_config = self.ctx.get_hf_config(FuyuConfig) + bos_token_id = hf_config.bos_token_id + + tokenizer = self._get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = _get_fuyu_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + + [bos_token_id, boa_token_id]) + return [ PromptReplacement( modality="image", - target="", - replacement=image_input_ids, + target=[eot_token_id], + replacement=get_replacement_fuyu, ) ] + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only |SPEAKER| (image) tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id and boa_token_id + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 2) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], @@ -237,47 +262,28 @@ def _validate_shape(d: torch.Tensor): return data.to(self.vision_embed_tokens.weight.dtype) def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: - pixel_values = kwargs.pop("pixel_values", None) - image_input_ids = kwargs.pop("image_input_ids", None) - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + self, **kwargs: object) -> Optional[FuyuImagePatchInputs]: + image_patches = kwargs.pop("image_patches", None) + if image_patches is not None: + if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. 
" - f"Got type: {type(pixel_values)}") + f"Got type: {type(image_patches)}") - return FuyuImagePixelInputs( - type="pixel_values", + return FuyuImagePatchInputs( + type="image_patches", data=self._validate_pixel_values( - flatten_bn(flatten_bn(pixel_values), concat=True)), - image_input_ids=flatten_bn(image_input_ids), + flatten_bn(image_patches, concat=True)), ) return None def _process_image_input( - self, image_input: FuyuImagePixelInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> torch.Tensor: assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) hidden_size = vision_embeddings.shape[-1] - vision_embeddings = vision_embeddings.reshape(-1, hidden_size) - - # NOTE: image_input_ids contains both image placeholder tokens and - # newline tokens. - image_input_ids = image_input["image_input_ids"] - image_sizes = [ - len(input_ids_per_image) for input_ids_per_image in image_input_ids - ] - if is_list_of(image_input_ids, torch.Tensor): - image_input_ids = torch.cat(image_input_ids) - image_input_ids = torch.flatten(image_input_ids) - - image_token_mask = image_input_ids == _IMAGE_TOKEN_ID - full_vision_embeddings = self.language_model.get_input_embeddings( - image_input_ids) - full_vision_embeddings[image_token_mask] = vision_embeddings - - return torch.split(full_vision_embeddings, image_sizes) + return vision_embeddings.reshape(-1, hidden_size) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 1d6ee2a0be72..34dc7fa31ce6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -144,8 +144,8 @@ def _call_hf_processor( # Original output: (1, num_images, C, H, W) # New output: (num_images, C, H, W) assert (isinstance(pixel_values, list) - and len(pixel_values) == 1 - and isinstance(pixel_values[0], list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) and len(pixel_values[0]) == len(images)) processed_outputs["pixel_values"] = pixel_values[0] From 6d6d71c474bbfeb52150cc95362335f24feb7a17 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:29:42 +0000 Subject: [PATCH 31/40] Oops Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index f51f20451382..81278cde264f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -624,9 +624,9 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", [("image", False)]), + ("Salesforce/blip2-opt-2.7b", {"image": False}), ("facebook/chameleon-7b", {"image": True}), - ("adept/fuyu-8b", [("image", False)]), + ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), ("mistral-community/pixtral-12b", {"image": True}), From ea93a2c42a3754e732c9ab19f085ef2fe7166db3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:47:26 +0000 Subject: [PATCH 32/40] Fix unable to run model Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0c8d4a982167..9cb684821a30 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -301,7 +301,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - [_IMAGE_TOKEN_ID, _NEWLINE_TOKEN_ID]) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( From 9aeb7b2bc2af5068e201a0c4c455827757b1d25e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:51:30 +0000 Subject: [PATCH 33/40] Avoid warning from HF Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 9cb684821a30..d5a458ad8565 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -104,6 +104,12 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + if not mm_data: + # Avoid warning from HF logger for text-only input + tokenizer = self._get_tokenizer() + processed_outputs = tokenizer(prompt).data # type: ignore + return BatchFeature(processed_outputs) + processed_outputs = super()._call_hf_processor( prompt=prompt, mm_data=mm_data, From 768c1d9bdc7096f9481b27c45fa54735aa7cd1a8 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 21:00:43 +0800 Subject: [PATCH 34/40] fix too large image for fuyu Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d5a458ad8565..4926ef93a150 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -20,7 +20,8 @@ import torch import torch.nn as nn -from transformers import BatchFeature, FuyuConfig, FuyuProcessor +from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, + FuyuProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -107,7 +108,8 @@ def _call_hf_processor( if not mm_data: # Avoid warning from HF logger for text-only input tokenizer = self._get_tokenizer() - processed_outputs = tokenizer(prompt).data # type: ignore + processed_outputs = tokenizer( + prompt, return_tensors="pt").data # type: ignore return BatchFeature(processed_outputs) processed_outputs = super()._call_hf_processor( @@ -153,13 +155,28 @@ def _get_prompt_replacements( assert isinstance(eot_token_id, int) boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + hf_processor = self._get_hf_processor() + image_processor: FuyuImageProcessor = hf_processor.image_processor + target_size = image_processor.size + target_height, target_width = (target_size["height"], + target_size["width"]) + def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) + width, height = image_size.width, image_size.height + if not (width <= target_width and height <= target_height): + height_scale_factor = target_height / height + width_scale_factor = target_width / width + optimal_scale_factor = min(height_scale_factor, + width_scale_factor) + + height = int(height * optimal_scale_factor) + width = int(width * optimal_scale_factor) ncols, nrows = _get_fuyu_num_image_tokens( - 
image_width=image_size.width, - image_height=image_size.height, + image_width=width, + image_height=height, ) return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + From 0c82c512fcf80102e3c0d63ab431529a8f7725f9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 21:37:54 +0800 Subject: [PATCH 35/40] fix prompt token ids Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 4926ef93a150..d9a1c00432d2 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -105,12 +105,18 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + if not mm_data: # Avoid warning from HF logger for text-only input + # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id + # Tokenizer won't add boa_token_id by default, we add it manually. tokenizer = self._get_tokenizer() - processed_outputs = tokenizer( - prompt, return_tensors="pt").data # type: ignore - return BatchFeature(processed_outputs) + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + processed_outputs = tokenizer(prompt).data # type: ignore + processed_outputs["input_ids"] = [ + processed_outputs["input_ids"] + [boa_token_id] + ] + return BatchFeature(processed_outputs, tensor_type="pt") processed_outputs = super()._call_hf_processor( prompt=prompt, @@ -153,7 +159,6 @@ def _get_prompt_replacements( tokenizer = self._get_tokenizer() eot_token_id = tokenizer.bos_token_id assert isinstance(eot_token_id, int) - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore hf_processor = self._get_hf_processor() image_processor: FuyuImageProcessor = hf_processor.image_processor @@ -180,7 +185,7 @@ def get_replacement_fuyu(item_idx: int): ) return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + - [bos_token_id, boa_token_id]) + [bos_token_id]) return [ PromptReplacement( @@ -199,10 +204,10 @@ def apply( result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) # Only |SPEAKER| (image) tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id and boa_token_id + # so we ignore the trailing bos_token_id result["mm_placeholders"] = { modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 2) + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) for p in ps ] for modality, ps in result["mm_placeholders"].items() @@ -295,7 +300,7 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - flatten_bn(image_patches, concat=True)), + flatten_bn(flatten_bn(image_patches), concat=True)), ) return None From d0d1fdc4d3f291721ecb2a9bcd3e8b59cfeba915 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 14:41:33 +0000 Subject: [PATCH 36/40] Fix missing batch dimension in vision embeddings Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d9a1c00432d2..e9d50a2de90c 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -300,18 +300,21 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - 
flatten_bn(flatten_bn(image_patches), concat=True)), + flatten_bn(image_patches, concat=True)), ) return None def _process_image_input( self, image_input: FuyuImagePatchInputs) -> torch.Tensor: + image_patches = image_input["data"] assert self.vision_embed_tokens is not None - vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - hidden_size = vision_embeddings.shape[-1] - return vision_embeddings.reshape(-1, hidden_size) + vision_embeddings, _ = self.vision_embed_tokens(image_patches) + + batch_size, num_patches, _ = image_patches.shape + _, _, hidden_size = vision_embeddings.shape + return vision_embeddings.reshape(batch_size, num_patches, hidden_size) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) From afcf7b18a0f0e1f7515a7f7c928f8b3c60c3d17d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 23:10:28 +0800 Subject: [PATCH 37/40] fix variant patches batching Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index e9d50a2de90c..a8ef3cb2872a 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -59,6 +59,10 @@ class FuyuImagePatchInputs(TypedDict): Shape: `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` """ + patches_per_image: List[int] + """ + List of number of total patches for each image in the batch. + """ def _get_fuyu_num_image_tokens( @@ -300,21 +304,22 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - flatten_bn(image_patches, concat=True)), + flatten_bn(flatten_bn(image_patches), concat=True)), + patches_per_image=[ + x.size(0) for x in flatten_bn(image_patches) + ], ) return None def _process_image_input( - self, image_input: FuyuImagePatchInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> NestedTensors: image_patches = image_input["data"] + patches_per_image = image_input["patches_per_image"] assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_patches) - - batch_size, num_patches, _ = image_patches.shape - _, _, hidden_size = vision_embeddings.shape - return vision_embeddings.reshape(batch_size, num_patches, hidden_size) + return vision_embeddings.split(patches_per_image, dim=0) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) From cb9522d2836d8283c2e4d2337736ae182a05ba65 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 15:21:46 +0000 Subject: [PATCH 38/40] Simplify the code Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index a8ef3cb2872a..ada6990d342a 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -57,11 +57,13 @@ class FuyuImagePatchInputs(TypedDict): data: torch.Tensor """ Shape: - `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` + `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` """ + patches_per_image: List[int] """ List of number of total patches for each image in the batch. 
+ This is used to restore the first two dimensions of `data`. """ @@ -116,11 +118,8 @@ def _call_hf_processor( # Tokenizer won't add boa_token_id by default, we add it manually. tokenizer = self._get_tokenizer() boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore - processed_outputs = tokenizer(prompt).data # type: ignore - processed_outputs["input_ids"] = [ - processed_outputs["input_ids"] + [boa_token_id] - ] - return BatchFeature(processed_outputs, tensor_type="pt") + prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") processed_outputs = super()._call_hf_processor( prompt=prompt, @@ -300,13 +299,15 @@ def _parse_and_validate_image_input( if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " f"Got type: {type(image_patches)}") + + image_patches_flat = flatten_bn(image_patches) return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - flatten_bn(flatten_bn(image_patches), concat=True)), + flatten_bn(image_patches_flat, concat=True)), patches_per_image=[ - x.size(0) for x in flatten_bn(image_patches) + x.size(0) for x in image_patches_flat ], ) From df832dfc9537ed0cbe48011ef31bb8b9a9d0c1ab Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 15:39:17 +0000 Subject: [PATCH 39/40] format Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index ada6990d342a..8c14866f20b9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -299,16 +299,14 @@ def _parse_and_validate_image_input( if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " f"Got type: {type(image_patches)}") - + image_patches_flat = flatten_bn(image_patches) return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( flatten_bn(image_patches_flat, concat=True)), - patches_per_image=[ - x.size(0) for x in image_patches_flat - ], + patches_per_image=[x.size(0) for x in image_patches_flat], ) return None From cc9c5f1fc8aa4ac8dea8aa712a79875f73f80c97 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 31 Dec 2024 19:28:32 +0000 Subject: [PATCH 40/40] simplify Signed-off-by: Roger Wang --- vllm/model_executor/models/aria.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 0648d98ac405..4ad6e859f4d9 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,4 +1,3 @@ -import math from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -445,11 +444,7 @@ def build_mm_projector(config: PretrainedConfig): def get_max_aria_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config() - image_size2tokens = { - int(math.sqrt(k) * hf_config.vision_config.patch_size): v - for k, v in hf_config.projector_patch_to_query_dict.items() - } - return max(image_size2tokens.values()) + return max(hf_config.projector_patch_to_query_dict.values()) class AriaMultiModalProcessor(BaseMultiModalProcessor):
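
Notes on the series (illustrative sketches, not part of any patch above):

1. The final aria.py hunk drops the sqrt/patch-size dict comprehension because the
   maximum was always taken over the values of projector_patch_to_query_dict; the
   rewritten keys never influenced it. The two expressions agree whenever the
   computed image sizes are distinct, so no dict entry is dropped. A minimal
   standalone check, with a made-up mapping and patch size:

    import math

    patch_to_query = {4900: 256, 9216: 128}  # hypothetical values
    patch_size = 14                           # hypothetical value

    old = max({int(math.sqrt(k) * patch_size): v
               for k, v in patch_to_query.items()}.values())
    new = max(patch_to_query.values())
    assert old == new == 256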
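
2. After the follow-up fixes, the Fuyu prompt replacement per image is a row-major
   grid of |SPEAKER| tokens with a |NEWLINE| token closing each row, plus a single
   trailing BOS token; apply() then shortens each reported placeholder range by one
   so that only the image tokens count as placeholders. A small illustration (token
   ids are stand-ins, not the real vocab values):

    IMAGE_TOKEN_ID, NEWLINE_TOKEN_ID, BOS_TOKEN_ID = 100, 101, 1  # hypothetical ids

    ncols, nrows = 3, 2  # a small 3x2 patch grid
    replacement = ([IMAGE_TOKEN_ID] * ncols + [NEWLINE_TOKEN_ID]) * nrows \
        + [BOS_TOKEN_ID]
    # -> [100, 100, 100, 101, 100, 100, 100, 101, 1]

    # Placeholder length reported for this image: the grid only, not the BOS.
    placeholder_length = len(replacement) - 1
    assert placeholder_length == ncols * nrows + nrows  # 8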
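
3. "fix too large image for fuyu" reproduces the image processor's scale-to-fit step
   before counting patches, so the computed placeholder count matches the number of
   patches actually produced for oversized inputs. A worked example of the
   arithmetic, assuming the usual 1080x1920 (height x width) target size:

    target_height, target_width = 1080, 1920  # assumed FuyuImageProcessor.size
    width, height = 3840, 2160                # hypothetical oversized input

    if not (width <= target_width and height <= target_height):
        height_scale_factor = target_height / height   # 0.5
        width_scale_factor = target_width / width      # 0.5
        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
        height = int(height * optimal_scale_factor)    # 1080
        width = int(width * optimal_scale_factor)      # 1920

    # (width, height) is then fed to _get_fuyu_num_image_tokens.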
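
4. The last Fuyu patches settle on flattening every image's patches into one tensor
   for the vision embedding and using patches_per_image to restore the per-image
   grouping afterwards, which also handles ragged batches. A minimal sketch of that
   round-trip with a stand-in linear layer (shapes are illustrative only):

    import torch
    import torch.nn as nn

    # Two images with different patch counts; 2700 = 30 * 30 * 3 per flattened patch.
    patches = [torch.randn(308, 2700), torch.randn(180, 2700)]
    patches_per_image = [x.size(0) for x in patches]   # [308, 180]

    flat = torch.cat(patches, dim=0)                   # what flatten_bn(..., concat=True) yields
    vision_embed_tokens = nn.Linear(2700, 4096)        # stand-in for the real projection
    vision_embeddings = vision_embed_tokens(flat)      # (488, 4096)

    # Split back so each image contributes its own (num_patches, hidden_size)
    # block for merging at its placeholder positions.
    per_image = vision_embeddings.split(patches_per_image, dim=0)
    assert [tuple(t.shape) for t in per_image] == [(308, 4096), (180, 4096)]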