From 425d3c4dc1b21ebce2d864ec3a94ae97d7f8e7c5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 29 Dec 2024 13:23:10 +0000 Subject: [PATCH 01/40] batch Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_next.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a39f2f4124d0..5e70c11363c8 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -528,10 +528,8 @@ def _process_image_pixels( stacked_image_features = self._image_pixels_to_features( self.vision_tower, stacked_pixel_values) - return [ - self.multi_modal_projector(image_features) for image_features in - torch.split(stacked_image_features, num_patches_per_batch) - ] + return torch.split(self.multi_modal_projector(stacked_image_features), + num_patches_per_batch) def _process_image_input( self, From 8edcc8331967044825007984ab4298d868de22f9 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 04:35:07 +0000 Subject: [PATCH 02/40] blip2 Signed-off-by: Roger Wang --- vllm/model_executor/models/blip2.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 76b8505ee1c2..4e16ae522c9b 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -468,6 +468,9 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size new_token_ids += inputs["prompt_token_ids"] + placeholder_ranges = [ + PlaceholderRange(offset=0, length=image_feature_size) + ] new_prompt = inputs.get("prompt") if new_prompt is not None: @@ -475,7 +478,8 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper() From 5f7629114a7b46c337013ff821f2489453d0c893 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 07:11:11 +0000 Subject: [PATCH 03/40] chameleon Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 4 ++-- vllm/model_executor/models/chameleon.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 518505abeb2a..3ec203e3bea4 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -577,14 +577,14 @@ See [this page](#generative-models) for more information on how to use generativ - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ - - + - ✅︎ * - `ChameleonForConditionalGeneration` - Chameleon - T + I - `facebook/chameleon-7b` etc. 
- - ✅︎ - - + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a5..afca81f5d4fd 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -153,7 +153,8 @@ def input_processor_for_chameleon(ctx: InputContext, # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) class ChameleonLayerNorm(nn.LayerNorm): From 814f3bd2be02064d984df760a3daf0a7845d4647 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 07:11:20 +0000 Subject: [PATCH 04/40] fix util Signed-off-by: Roger Wang --- vllm/multimodal/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 87b12a6fb33c..7b6ded6a2708 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -400,15 +400,19 @@ def repeat_and_pad_placeholder_tokens( placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: + curr_repeat_count = repeat_count[placeholder_token_idx] replacement_ids = repeat_and_pad_token( placeholder_token_id, - repeat_count=repeat_count[placeholder_token_idx], + repeat_count=curr_repeat_count, pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + offset = len(new_token_ids) + if pad_token_left is not None: + offset += 1 placeholder_ranges.append({ - "offset": len(new_token_ids), - "length": len(replacement_ids) + "offset": offset, + "length": curr_repeat_count, }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 From efeb99991b5815ea49b8308d80c23b346a9da418 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 10:38:29 +0000 Subject: [PATCH 05/40] fuyu Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/fuyu.py | 61 +++++++++++++++++++++----- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3ec203e3bea4..7bb4da545cf4 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -591,7 +591,7 @@ See [this page](#generative-models) for more information on how to use generativ - `adept/fuyu-8b` etc. 
- - ✅︎ - - + - ✅︎ * - `ChatGLMModel` - GLM-4V - T + I diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6e86900326c4..02bc0af05325 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,7 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -61,6 +61,7 @@ class FuyuImagePixelInputs(TypedDict): Shape: (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) """ + image_input_ids: torch.Tensor def _calculate_num_image_tokens( @@ -177,7 +178,14 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): image_patch[0] for image_patch in model_image_input["image_patches"] ]) - new_multi_modal_data["image"] = image_patches + # dim0 is batch_size, dim1 is subseq_size which will always be 1 + image_input_ids: List[List[ + torch.Tensor]] = model_image_input["image_input_ids"] + image_input_ids = image_input_ids[0][0].tolist() + new_multi_modal_data["image"] = { + "image_patches": image_patches, + "image_input_ids": image_input_ids + } elif is_list_of(image_list, torch.Tensor): raise NotImplementedError("Embeddings input is not supported yet") @@ -188,10 +196,6 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): prompt = inputs.get("prompt") prompt_token_ids = inputs["prompt_token_ids"] tokenizer = cached_get_tokenizer(model_config.model) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() bos_token = tokenizer.encode("", add_special_tokens=False)[1:] boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] @@ -199,14 +203,21 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ 1:] + boa_token + placeholder_ranges = [ + PlaceholderRange(offset=0, length=len(image_input_ids)) + ] + return token_inputs(prompt=new_prompt, prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data) + multi_modal_data=new_multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) def input_mapper_for_fuyu(ctx: InputContext, data: object): model_config = ctx.model_config data_list = data if isinstance(data, list) else [data] + + # For profiling with dummy image data if is_list_of(data_list, Image.Image): # Fuyu's image_processor can also finish token padding image_processor: FuyuImageProcessor = cached_get_image_processor( @@ -217,9 +228,18 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): image_patch[0] for image_patch in model_image_input["image_patches"] ]) + image_input_ids = model_image_input["image_input_ids"][0][0] + return MultiModalKwargs({ + "pixel_values": data, + "image_input_ids": image_input_ids, + }) - # image has been processed with prompt in input processor - return MultiModalKwargs({"pixel_values": data}) + # For actual inference when image has been processed with + # prompt in input processor + return MultiModalKwargs({ + 
"pixel_values": data[0]["image_patches"], + "image_input_ids": data[0]["image_input_ids"], + }) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) @@ -282,7 +302,7 @@ def _validate_shape(d: torch.Tensor): def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) - + image_input_ids = kwargs.pop("image_input_ids", None) if pixel_values is not None: if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " @@ -292,6 +312,7 @@ def _parse_and_validate_image_input( type="pixel_values", data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), + image_input_ids=image_input_ids, ) return None @@ -301,7 +322,23 @@ def _process_image_input( assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - return vision_embeddings + hidden_size = vision_embeddings.shape[-1] + vision_embeddings = vision_embeddings.reshape(-1, hidden_size) + + # NOTE: image_input_ids contains both image placeholder tokens and + # newline tokens. + image_input_ids = image_input["image_input_ids"] + image_sizes = [ + len(input_ids_per_image) for input_ids_per_image in image_input_ids + ] + image_input_ids = torch.flatten(image_input_ids) + + image_token_mask = image_input_ids == _IMAGE_TOKEN_ID + full_vision_embeddings = self.language_model.get_input_embeddings( + image_input_ids) + full_vision_embeddings[image_token_mask] = vision_embeddings + + return torch.split(full_vision_embeddings, image_sizes) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) @@ -319,7 +356,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - _IMAGE_TOKEN_ID) + [_IMAGE_TOKEN_ID, _NEWLINE_TOKEN_ID]) return inputs_embeds def forward( From 5e568e8f6044102c9dbfcf4cf86fab29ca685d29 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:10:40 +0000 Subject: [PATCH 06/40] aria Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/aria.py | 115 +++++++++++++++++++------ 2 files changed, 90 insertions(+), 27 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 7bb4da545cf4..b82bb649e6b5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -570,7 +570,7 @@ See [this page](#generative-models) for more information on how to use generativ - `rhymes-ai/Aria` - - ✅︎ - - + - ✅︎ * - `Blip2ForConditionalGeneration` - BLIP-2 - T + IE diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 9437ad968842..a15672cec9c7 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,15 +1,18 @@ import math -from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn +from PIL import Image from torch.nn.init import trunc_normal_ from transformers import LlamaConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import INPUT_REGISTRY, token_inputs +from 
vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,8 +20,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - SamplingMetadata) +from vllm.model_executor.layers.sampler import (SamplerOutput, + SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) @@ -35,10 +38,12 @@ from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) +from vllm.utils import is_list_of from .utils import flatten_bn @@ -445,15 +450,74 @@ def build_mm_projector(config): ) -def get_max_multimodal_tokens(ctx): - return max(ctx.model_config.hf_config.image_size2tokens.values()) - - -def input_mapper_for_aria(ctx, data): - return MultiModalKwargs(data) - - -def input_processor(ctx, llm_inputs): +def get_aria_max_multimodal_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config() + image_size2tokens = { + int(math.sqrt(k) * hf_config.vision_config.patch_size): v + for k, v in hf_config.projector_patch_to_query_dict.items() + } + return max(image_size2tokens.values()) + + +def dummy_seq_data_for_aria(ctx: InputContext, seq_len: int, num_images: int): + image_feature_size = get_aria_max_multimodal_tokens(ctx) + hf_config = ctx.get_hf_config() + return SequenceData.from_prompt_token_counts( + (hf_config.image_token_index, image_feature_size * num_images), + (0, seq_len - image_feature_size * num_images), + ), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } + + +def dummy_image_for_aria( + ctx: InputContext, + num_images: int, +): + hf_config = ctx.get_hf_config() + max_image_size = hf_config.vision_config.image_size + image = Image.new("RGB", (max_image_size, max_image_size), color=0) + images = [image] * num_images + + return {"image": images} + + +def dummy_data_for_aria(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]): + num_images = mm_counts["image"] + seq_data, ranges = dummy_seq_data_for_aria(ctx, seq_len, num_images) + mm_data = dummy_image_for_aria(ctx, num_images) + return DummyData(seq_data, mm_data, ranges) + + +def input_mapper_for_aria(ctx: InputContext, data: object): + data_list = data if isinstance(data, list) else [data] + + # For profiling with dummy image data + if is_list_of(data_list, Image.Image): + hf_config = ctx.get_hf_config() + max_image_size = hf_config.vision_config.image_size + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, + trust_remote_code=model_config.trust_remote_code) + image_inputs = image_processor.preprocess( + data_list, + max_image_size=max_image_size, + 
split_image=False, + return_tensors="pt").data + image_inputs['pixel_values'] = image_inputs['pixel_values'].to( + ctx.model_config.dtype) + return MultiModalKwargs(image_inputs) + + # For actual inference when image has been processed with + # prompt in input processor + return MultiModalKwargs(data_list[0]) + + +def input_processor_for_aria(ctx: InputContext, llm_inputs: DecoderOnlyInputs): multi_modal_data = llm_inputs.get("multi_modal_data") # if it is pure text input, use it as is if multi_modal_data is None or "image" not in multi_modal_data: @@ -494,9 +558,12 @@ def input_processor(ctx, llm_inputs): repeat_count=num_crops, ) - repeat_count = [hf_config.image_size2tokens[max_image_size] - ] * sum(num_crops).item() - new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( + image_size2tokens = { + int(math.sqrt(k) * hf_config.vision_config.patch_size): v + for k, v in hf_config.projector_patch_to_query_dict.items() + } + repeat_count = [image_size2tokens[max_image_size]] * sum(num_crops).item() + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, None, prompt_token_ids, @@ -508,12 +575,14 @@ def input_processor(ctx, llm_inputs): prompt_token_ids=new_token_ids, prompt=new_prompt, multi_modal_data={"image": image_inputs}, + multi_modal_placeholders={"image": ranges}, ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_aria_max_multimodal_tokens) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_aria) +@INPUT_REGISTRY.register_input_processor(input_processor_for_aria) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. 
@@ -540,12 +609,6 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - # prepare the image_size to tokens mapping for the image preprocess, see - # input_processor - config.image_size2tokens = { - int(math.sqrt(k) * config.vision_config.patch_size): v - for k, v in config.projector_patch_to_query_dict.items() - } self.config = config self.vision_tower = AriaVisionModel(config.vision_config) self.multi_modal_projector = build_mm_projector(config) @@ -566,7 +629,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() def _validate_image_sizes( self, images: List[torch.Tensor]) -> List[torch.Tensor]: From 135fd5c324f55b158054a5c090584c4abca3aa36 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:10:52 +0000 Subject: [PATCH 07/40] fix profiling Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 509771b7e2e5..fdf8a7ba440f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -651,6 +651,8 @@ def profile_run(self) -> None: self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item + max_num_mm_items = min(self.max_num_reqs, max_num_mm_items) + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 From 0a8dbe0f0fd727e05b52b97442a020b9622ab3a0 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:11:54 +0000 Subject: [PATCH 08/40] update Signed-off-by: Roger Wang --- examples/offline_inference_vision_language.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 77af914a6ef0..068f29a21ff9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -28,6 +28,7 @@ def run_aria(question: str, modality: str): tokenizer_mode="slow", trust_remote_code=True, dtype="bfloat16", + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -191,8 +192,10 @@ def run_llava_next(question: str, modality: str): prompt = f"[INST] \n{question} [/INST]" llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + max_num_batched_tokens=32768, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 4}, + enable_prefix_caching=False) stop_token_ids = None return llm, prompt, stop_token_ids @@ -591,7 +594,7 @@ def main(args): # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, + sampling_params = SamplingParams(temperature=0, max_tokens=64, stop_token_ids=stop_token_ids) From 03f741d844a74f0f6f6e351194d4b00b92d89ff7 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:21:18 +0000 Subject: [PATCH 09/40] add llava-next Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index b82bb649e6b5..13143fe45f53 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V From 8bce94989afe81db7962dd6f8bb1a8d288cd4b12 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:23:54 +0000 Subject: [PATCH 10/40] revert testing code Signed-off-by: Roger Wang --- examples/offline_inference_vision_language.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 068f29a21ff9..efa9b7ac3807 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -192,10 +192,8 @@ def run_llava_next(question: str, modality: str): prompt = f"[INST] \n{question} [/INST]" llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_num_batched_tokens=32768, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - limit_mm_per_prompt={"image": 4}, - enable_prefix_caching=False) + max_model_len=8192, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids From bbde4140feb299b83112496aff3399fbf0c30aba Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:24:44 +0000 Subject: [PATCH 11/40] revert testing code Signed-off-by: Roger Wang --- examples/offline_inference_vision_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index efa9b7ac3807..93deec663c36 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -592,7 +592,7 @@ def main(args): # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0, + sampling_params = SamplingParams(temperature=0.2, max_tokens=64, stop_token_ids=stop_token_ids) From ea928c6e55ca346ef1b20894e5cbc49c928df0ba Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:46:13 +0000 Subject: [PATCH 12/40] tweak and clarify Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fdf8a7ba440f..6ed62415d998 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -647,11 +647,18 @@ def profile_run(self) -> None: self.mm_registry.get_max_tokens_per_item_by_modality( self.model_config).values()) - max_num_mm_items = min( + max_num_mm_items_encoder_budget = min( self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item - max_num_mm_items = min(self.max_num_reqs, max_num_mm_items) + max_mm_items_per_req = max( + self.mm_registry.get_mm_limits_per_prompt( + self.model_config).values()) + max_num_mm_items_decoder_budget = self.max_num_reqs * \ + max_mm_items_per_req + + max_num_mm_items = min(max_num_mm_items_encoder_budget, + max_num_mm_items_decoder_budget) # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we From 55eada7145eec6864404bdf7c31b991ab1833c23 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:58:23 +0000 Subject: [PATCH 13/40] clarify Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6ed62415d998..c8ba5e932a19 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -654,6 +654,10 @@ def profile_run(self) -> None: max_mm_items_per_req = max( self.mm_registry.get_mm_limits_per_prompt( self.model_config).values()) + + # NOTE: We do not consider max_num_batched_tokens on + # purpose because the image embeddings can be generated in + # advanced and chunked prefilled. max_num_mm_items_decoder_budget = self.max_num_reqs * \ max_mm_items_per_req From bbd57528af1894609821eaf7445be6e2e6850f50 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 30 Dec 2024 14:59:32 +0000 Subject: [PATCH 14/40] reword Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c8ba5e932a19..a08a86d4007d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -655,9 +655,9 @@ def profile_run(self) -> None: self.mm_registry.get_mm_limits_per_prompt( self.model_config).values()) - # NOTE: We do not consider max_num_batched_tokens on - # purpose because the image embeddings can be generated in - # advanced and chunked prefilled. + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. 
max_num_mm_items_decoder_budget = self.max_num_reqs * \ max_mm_items_per_req From 0452b99b143eab5fc7c4596a9ad167a74bc1f022 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 16:48:44 +0000 Subject: [PATCH 15/40] Use merged multi-modal processor for blip2 and chameleon Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 4 + vllm/model_executor/models/blip.py | 74 ----------- vllm/model_executor/models/blip2.py | 139 ++++++++------------- vllm/model_executor/models/chameleon.py | 157 +++++++++++------------- vllm/multimodal/processing.py | 5 +- 5 files changed, 129 insertions(+), 250 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 1b2847ed0f53..43fb6e4e25e7 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -624,6 +624,10 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ + ("rhymes-ai/Aria", {"image"}), + ("Salesforce/blip2-opt-2.7b", {"image"}), + ("facebook/chameleon-7b", {"image"}), + ("adept/fuyu-8b", {"image"}), ("llava-hf/llava-1.5-7b-hf", {"image"}), ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), ("mistral-community/pixtral-12b", {"image"}), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 42a239cadac4..129a0bcecc86 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -8,18 +8,13 @@ from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import SequenceData def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -33,36 +28,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def get_blip_image_feature_size( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_num_patches(image_size=hf_config.image_size, - patch_size=hf_config.patch_size) - - -def get_max_blip_image_tokens( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_image_feature_size(hf_config) - - -def dummy_seq_data_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ) - - def dummy_image_for_blip( hf_config: Union[BlipVisionConfig, Blip2VisionConfig], num_images: int, @@ -80,45 +45,6 @@ def dummy_image_for_blip( return {"image": image if num_images == 1 else [image] * num_images} -def input_processor_for_blip( - 
model_config: ModelConfig, - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 4e16ae522c9b..c65acb85aa98 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,24 +4,25 @@ import torch import torch.nn as nn -from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, - apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2Processor, + Blip2QFormerConfig, apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import consecutive_placeholder_ranges -from vllm.sequence import IntermediateTensors, SequenceData - -from .blip import (BlipVisionModel, dummy_image_for_blip, - get_max_blip_image_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors + +from .blip import BlipVisionModel, dummy_image_for_blip from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -396,96 +397,60 @@ def forward( return sequence_output -def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: - return hf_config.num_query_tokens - - def get_max_blip2_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - - if isinstance(vision_config, Blip2VisionConfig): - return get_max_blip_image_tokens(vision_config) - - 
msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def dummy_seq_data_for_blip2( - hf_config: Blip2Config, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip2_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_data_for_blip2(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - seq_data, ranges = dummy_seq_data_for_blip2( - hf_config, - seq_len, - num_images, - image_token_id=BLIP2_IMAGE_TOKEN_ID, - ) - - if isinstance(vision_config, Blip2VisionConfig): - mm_data = dummy_image_for_blip(vision_config, num_images) + return hf_config.num_query_tokens - return DummyData(seq_data, mm_data, ranges) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) +class Blip2MultiModalProcessor(BaseMultiModalProcessor): + def _get_hf_processor(self) -> Blip2Processor: + return self.ctx.get_hf_processor(Blip2Processor) -def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - hf_config = ctx.get_hf_config(Blip2Config) - image_feature_size = get_blip2_image_feature_size(hf_config) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + max_image_tokens = get_max_blip2_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target="", # An empty target is never matched against + replacement="" * max_image_tokens, + ) + ] - # The original model places image tokens at the front - # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 - new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size - new_token_ids += inputs["prompt_token_ids"] - placeholder_ranges = [ - PlaceholderRange(offset=0, length=image_feature_size) - ] + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + num_images = mm_counts.get("image", 0) - new_prompt = inputs.get("prompt") - if new_prompt is not None: - new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + data = dummy_image_for_blip(vision_config, num_images) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) + return ProcessorInputs( + prompt_text="", + mm_data=data, + ) 
-@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) -@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index afca81f5d4fd..e027579cdb8e 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -6,13 +6,13 @@ import torch.nn.functional as F from PIL import Image from torch import nn -from transformers import ChameleonConfig, ChameleonVQVAEConfig +from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, + ChameleonVQVAEConfig) from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -29,11 +29,13 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP @@ -45,10 +47,6 @@ # and processor files, so we hardcode them in the model file for now. 
CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 CHAMELEON_IMAGE_SEQ_LENGTH = 1024 -CHAMELEON_IMAGE_TOKEN_ID = 8711 -CHAMELEON_IMAGE_START_TOKEN_ID = 8197 -CHAMELEON_IMAGE_END_TOKEN_ID = 8196 -CHAMELEON_SEP_TOKEN_ID = 8710 class ChameleonImagePixelInputs(TypedDict): @@ -61,28 +59,6 @@ def get_max_chameleon_image_tokens(ctx: InputContext): return CHAMELEON_IMAGE_SEQ_LENGTH -def dummy_seq_data_for_chameleon( - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - def dummy_image_for_chameleon( num_images: int, *, @@ -100,61 +76,70 @@ def dummy_image_for_chameleon( return {"image": image if num_images == 1 else [image] * num_images} -def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] +class ChameleonMultiModalProcessor(BaseMultiModalProcessor): + + def _get_hf_processor(self) -> ChameleonProcessor: + return self.ctx.get_hf_processor(ChameleonProcessor) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self._get_hf_processor() + + return [ + PromptReplacement( + modality="image", + target="", + replacement="".join([ + processor.image_start_token, + processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH, + processor.image_end_token, + ]), + ) + ] - seq_data, ranges = dummy_seq_data_for_chameleon( - seq_len, - num_images, - image_token_id=CHAMELEON_IMAGE_TOKEN_ID, - ) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) - mm_data = dummy_image_for_chameleon(num_images) - return DummyData(seq_data, mm_data, ranges) + data = dummy_image_for_chameleon(num_images) + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=data, + ) -def input_processor_for_chameleon(ctx: InputContext, - inputs: DecoderOnlyInputs): + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the image_start_token and image_end_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } - """ - Processing input prompt to insert required tokens for image placeholder. 
- - See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58 - """ # noqa - - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID, - repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH, - pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID, - pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID, - ) - - # Appending sep token for chat mode to follow default processor - # behavior - if new_prompt is not None: - new_prompt += tokenizer.sep_token - new_token_ids += [CHAMELEON_SEP_TOKEN_ID] - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) + return result class ChameleonLayerNorm(nn.LayerNorm): @@ -926,10 +911,8 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon) -@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon) +@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3ece0762e322..f7fb5d3bba51 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,7 @@ import pickle import re from abc import ABC, abstractmethod +from collections import defaultdict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache @@ -352,13 +353,13 @@ def _replace_matches( ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_item_counts} + next_idx_by_modality = defaultdict[str, int](lambda: 0) for match in _resolve_matches(prompt, matches): modality = match.modality item_idx = next_idx_by_modality[modality] - if item_idx >= mm_item_counts[modality]: + if item_idx >= mm_item_counts.get(modality, 0): continue start_idx = match.start_idx From 938c0bf8d48934100e1c7078d3b8c7d36b16ed19 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 17:20:25 +0000 Subject: [PATCH 16/40] Limit max num seqs Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 1 + tests/models/decoder_only/vision_language/test_models.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 93deec663c36..6480bda1ebdb 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -58,6 +58,7 @@ def run_chameleon(question: str, modality: str): prompt = f"{question}" llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, + max_num_seqs=2, 
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 1a9c1b4ef1be..f0bc1a14773d 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -179,6 +179,7 @@ test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, + max_num_seqs=2, auto_cls=AutoModelForVision2Seq, postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" From 6cc54a7a2d17477d0b989ae9d5dce2ddcb3d562e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 17:44:05 +0000 Subject: [PATCH 17/40] Update comments Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 6480bda1ebdb..69fe56b44124 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,11 +24,13 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" + # NOTE: Need L40 to run this llm = LLM(model=model_name, tokenizer_mode="slow", - trust_remote_code=True, dtype="bfloat16", + max_model_len=4096, max_num_seqs=2, + trust_remote_code=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -259,7 +261,7 @@ def run_minicpmv(question: str, modality: str): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 + # 2.6 model_name = "openbmb/MiniCPM-V-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -432,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" + # NOTE: Need L40 to run this llm = LLM( model=model_name, max_model_len=8192, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) From ba713ba2e33f71548fdbacbb08f08e27c12fb7e5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Dec 2024 17:48:35 +0000 Subject: [PATCH 18/40] Be more clear Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 69fe56b44124..b51bfae45526 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,7 +24,7 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" - # NOTE: Need L40 to run this + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, tokenizer_mode="slow", dtype="bfloat16", @@ -434,7 +434,7 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" - # NOTE: Need L40 to run this + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM( model=model_name, max_model_len=8192, From b0efc4fcfdfa60726f47d3dfe90caec5a55fdb18 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 03:54:35 +0000 Subject: [PATCH 19/40] Merged multi-modal processor for Aria Signed-off-by: DarkLight1337 --- .../vision_language/test_models.py | 5 +- 
vllm/model_executor/models/aria.py | 206 ++++++------------ .../models/idefics2_vision_model.py | 6 +- 3 files changed, 77 insertions(+), 140 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index f0bc1a14773d..30473e79f89f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -140,10 +140,7 @@ "aria": VLMTestInfo( models=["rhymes-ai/Aria"], tokenizer_mode="slow", - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - ), + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index a15672cec9c7..21a7e38d24a2 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -6,13 +6,12 @@ import torch.nn as nn from PIL import Image from torch.nn.init import trunc_normal_ -from transformers import LlamaConfig +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -25,27 +24,22 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.idefics2_vision_model import ( - Idefics2VisionTransformer) -from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, - LlamaModel) -from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, - maybe_prefix, - merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) -from vllm.utils import is_list_of -from .utils import flatten_bn +from .idefics2_vision_model import Idefics2VisionTransformer +from .interfaces import SupportsMultiModal +from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + is_pp_missing_parameter, maybe_prefix, + merge_multimodal_embeddings) class AriaImagePixelInputs(TypedDict): @@ -256,7 +250,7 @@ def forward(self, x, 
attn_mask=None): class AriaFusedMoE(FusedMoE): def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - shard_id: str) -> Set[str]: + shard_id: str) -> None: # Override the weight_loader to handle the expert weights in the Aria # model, which are already packed with experts, and merge the gate and # up weights for each expert. @@ -351,7 +345,7 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: LlamaConfig, + config: AriaMoELMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -439,7 +433,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config): +def build_mm_projector(config: PretrainedConfig): return AriaProjector( patch_to_query_dict=config.projector_patch_to_query_dict, embed_dim=config.vision_config.hidden_size, @@ -450,7 +444,7 @@ def build_mm_projector(config): ) -def get_aria_max_multimodal_tokens(ctx: InputContext): +def get_max_aria_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config() image_size2tokens = { int(math.sqrt(k) * hf_config.vision_config.patch_size): v @@ -459,130 +453,69 @@ def get_aria_max_multimodal_tokens(ctx: InputContext): return max(image_size2tokens.values()) -def dummy_seq_data_for_aria(ctx: InputContext, seq_len: int, num_images: int): - image_feature_size = get_aria_max_multimodal_tokens(ctx) - hf_config = ctx.get_hf_config() - return SequenceData.from_prompt_token_counts( - (hf_config.image_token_index, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - def dummy_image_for_aria( - ctx: InputContext, + vision_config: AriaVisionConfig, num_images: int, ): - hf_config = ctx.get_hf_config() - max_image_size = hf_config.vision_config.image_size + max_image_size = vision_config.image_size image = Image.new("RGB", (max_image_size, max_image_size), color=0) images = [image] * num_images return {"image": images} -def dummy_data_for_aria(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_aria(ctx, seq_len, num_images) - mm_data = dummy_image_for_aria(ctx, num_images) - return DummyData(seq_data, mm_data, ranges) - - -def input_mapper_for_aria(ctx: InputContext, data: object): - data_list = data if isinstance(data, list) else [data] - - # For profiling with dummy image data - if is_list_of(data_list, Image.Image): - hf_config = ctx.get_hf_config() - max_image_size = hf_config.vision_config.image_size - model_config = ctx.model_config - image_processor = cached_get_image_processor( - model_config.model, - trust_remote_code=model_config.trust_remote_code) - image_inputs = image_processor.preprocess( - data_list, - max_image_size=max_image_size, - split_image=False, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - return MultiModalKwargs(image_inputs) - - # For actual inference when image has been processed with - # prompt in input processor - return MultiModalKwargs(data_list[0]) - - -def input_processor_for_aria(ctx: InputContext, llm_inputs: DecoderOnlyInputs): - multi_modal_data = llm_inputs.get("multi_modal_data") - # if it is pure text input, use it as is - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs - - model_config = ctx.model_config - - 
tokenizer = cached_get_tokenizer(model_config.tokenizer) - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - hf_config = model_config.hf_config - - # prepare image tokens, the max_image_size is used to determine the number - # of patch_size for every image - max_image_size = multi_modal_data.pop("max_image_size", 980) - _split_image = multi_modal_data.pop("split_image", False) - - assert isinstance(max_image_size, - (int, float)), "max_image_size should be float or int" - images = (multi_modal_data["image"] if isinstance( - multi_modal_data["image"], list) else [multi_modal_data["image"]]) - - image_inputs = image_processor.preprocess(images, - max_image_size=max_image_size, - split_image=_split_image, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - num_crops = image_inputs.pop("num_crops") - - prompt_token_ids = llm_inputs["prompt_token_ids"] - if num_crops.sum().item() > 0: - _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=num_crops, +class AriaMultiModalProcessor(BaseMultiModalProcessor): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_mask=MultiModalFieldConfig.batched("image"), ) - image_size2tokens = { - int(math.sqrt(k) * hf_config.vision_config.patch_size): v - for k, v in hf_config.projector_patch_to_query_dict.items() - } - repeat_count = [image_size2tokens[max_image_size]] * sum(num_crops).item() - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=repeat_count, - ) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config() + image_token_id = hf_config.image_token_index + + max_image_tokens = get_max_aria_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * max_image_tokens, + ) + ] - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data={"image": image_inputs}, - multi_modal_placeholders={"image": ranges}, - ) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config() + vision_config = hf_config.vision_config + num_images = mm_counts.get("image", 0) + + data = dummy_image_for_aria(vision_config, num_images) + hf_processor = self._get_hf_processor() + image_token = hf_processor.image_token # type: ignore -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_aria_max_multimodal_tokens) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor_for_aria) + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=data, + ) + + +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, 
SupportsMultiModal): """ Aria model for conditional generation tasks. @@ -651,7 +584,12 @@ def _parse_and_validate_image_input( pixel_values = self._validate_image_sizes(pixel_values) pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + if not isinstance(pixel_mask, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel mask. " + f"Got type: {type(pixel_mask)}") + pixel_mask = flatten_bn(pixel_mask, concat=True) return AriaImagePixelInputs( diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index e430a158d869..4e42a4b6f9e6 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -69,7 +69,8 @@ def forward(self, patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape - patch_embeds = self.patch_embedding(pixel_values) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(target_dtype)) embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = ( max_im_h // self.patch_size, @@ -309,7 +310,8 @@ def forward( hidden_states = self.embeddings( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, - tgt_sizes=tgt_sizes) + tgt_sizes=tgt_sizes, + ) encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state From cdbd96986a739b401d8b65f69a4dd57685f8b139 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 16:00:17 +0800 Subject: [PATCH 20/40] initialize fuyu merged processor Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 253 +++++++++++------------------ 1 file changed, 92 insertions(+), 161 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 02bc0af05325..242410d9d554 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -15,7 +15,6 @@ # limitations under the License. 
""" PyTorch Fuyu model.""" import math -from array import array from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict) @@ -23,24 +22,22 @@ import torch.nn as nn import torch.utils.checkpoint from PIL import Image -from transformers import FuyuImageProcessor +from transformers import BatchFeature, FuyuProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) -from vllm.utils import is_list_of +from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, + MultiModalFieldConfig, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -96,156 +93,90 @@ def get_max_fuyu_image_tokens(ctx: InputContext): return (ncol + 1) * nrow -def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): - ncol, nrow = get_max_fuyu_image_feature_size() - image_feature_size = get_max_fuyu_image_tokens(ctx) - - image_token_ids = ( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_fuyu( - num_images: int, - *, - image_width: int, - image_height: int, -): - image = Image.new("RGB", (image_width, image_height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) - mm_data = dummy_image_for_fuyu(num_images, - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return DummyData(seq_data, mm_data, ranges) - - -def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: List[Image.Image]): - image_encoding = image_processor.preprocess(data, return_tensors="pt") - batch_images = torch.stack([img[0] for img in image_encoding["images"] - ]).unsqueeze(1) - image_unpadded_heights = torch.tensor( - image_encoding["image_unpadded_heights"]) - image_unpadded_widths = torch.tensor( - image_encoding["image_unpadded_widths"]) - - batch_size = len(image_encoding["images"]) - image_present = torch.ones(batch_size, 1, 1) - model_image_input = 
image_processor.preprocess_with_tokenizer_info( - image_input=batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=_IMAGE_TOKEN_ID, - image_newline_id=_NEWLINE_TOKEN_ID, - variable_sized=True, - ) - return model_image_input - - -def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - image_data = multi_modal_data["image"] - new_multi_modal_data = {} - image_list = image_data if isinstance(image_data, list) else [image_data] - - # process image data - if is_list_of(image_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, image_data) - image_patches = torch.cat([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() - new_multi_modal_data["image"] = { - "image_patches": image_patches, - "image_input_ids": image_input_ids - } - - elif is_list_of(image_list, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - # process prompts - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - tokenizer = cached_get_tokenizer(model_config.model) - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] - - new_prompt = prompt + "\x04" - new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ - 1:] + boa_token - - placeholder_ranges = [ - PlaceholderRange(offset=0, length=len(image_input_ids)) - ] - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) - - -def input_mapper_for_fuyu(ctx: InputContext, data: object): - model_config = ctx.model_config - data_list = data if isinstance(data, list) else [data] - - # For profiling with dummy image data - if is_list_of(data_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, data_list) - data = torch.stack([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - image_input_ids = model_image_input["image_input_ids"][0][0] - return MultiModalKwargs({ - "pixel_values": data, - "image_input_ids": image_input_ids, - }) - - # For actual inference when image has been processed with - # prompt in input processor - return MultiModalKwargs({ - "pixel_values": data[0]["image_patches"], - "image_input_ids": data[0]["image_input_ids"], - }) - - -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) +class FuyuMultiModalProcessor(BaseMultiModalProcessor): + + def _get_hf_processor(self) -> FuyuProcessor: + return self.ctx.get_hf_processor(FuyuProcessor) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, 
object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + tokenizer = self._get_tokenizer() + processed_outputs = super()._call_hf_processor(prompt, mm_data, + mm_kwargs) + if "image_patches" in processed_outputs: + # separate image_input_ids from input_ids if has image inputs + new_prompt = tokenizer.decode(processed_outputs["input_ids"][0], + skip_special_tokens=True) + image_prompt = new_prompt.split("")[0] + # we can't set add_special_tokens=False here, because placeholder + # and newline are all special tokens + image_input_ids = tokenizer.encode(image_prompt, + return_tensors="pt") + # Drop begin token since it doesn't belong to image_input_ids + processed_outputs["image_input_ids"] = image_input_ids[:, 2:] + processed_outputs["pixel_values"] = processed_outputs.pop( + "image_patches") + else: + # FuyuProcessor won't add bos and boa if no images inputs, we add + # them back manually + bos_token = tokenizer.encode("", add_special_tokens=False)[1:] + boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] + prompt_ids = tokenizer.encode( + prompt, + add_special_tokens=False, # type: ignore + ) + prompt_ids = bos_token + prompt_ids + boa_token + processed_outputs["input_ids"] = torch.tensor([prompt_ids]) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_input_ids=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + image_input_ids = out_mm_kwargs.get("image_input_ids", []) + if isinstance(image_input_ids, torch.Tensor): + image_input_ids = image_input_ids.squeeze(0).tolist() + return [ + PromptReplacement( + modality="image", + target="", + replacement=image_input_ids, + ) + ] + + def _get_dummy_mm_inputs(self, mm_counts): + num_images = mm_counts.get("image", 0) + image = Image.new( + "RGB", + (MAX_IMAGE_FEATURE_SIZE_WIDTH, MAX_IMAGE_FEATURE_SIZE_HEIGHT), + color=0, + ) + mm_data = dict(image=image if num_images == 1 else [image] * + num_images) + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu) -@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -312,7 +243,7 @@ def _parse_and_validate_image_input( type="pixel_values", data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), - image_input_ids=image_input_ids, + image_input_ids=flatten_bn(image_input_ids), ) return None From 48c694623fb20a65322c3ad1565a1588e4cbec79 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:15:25 +0000 Subject: [PATCH 21/40] Clean up Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aria.py | 4 +-- vllm/model_executor/models/blip2.py | 34 ++++++++++++++---- vllm/model_executor/models/chameleon.py | 4 +-- vllm/model_executor/models/fuyu.py | 48 +++++++++++++------------ 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/models/aria.py 
b/vllm/model_executor/models/aria.py index 21a7e38d24a2..69587aa5ddb7 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -500,13 +500,13 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config() - vision_config = hf_config.vision_config + vision_config: AriaVisionConfig = hf_config.vision_config num_images = mm_counts.get("image", 0) data = dummy_image_for_aria(vision_config, num_images) hf_processor = self._get_hf_processor() - image_token = hf_processor.image_token # type: ignore + image_token: str = hf_processor.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c65acb85aa98..c6456310b7df 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,8 +15,9 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -29,8 +30,7 @@ # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo -BLIP2_IMAGE_TOKEN = "" -BLIP2_IMAGE_TOKEN_ID = 50265 +_IMAGE_TOKEN_ID = 50265 class Blip2ImagePixelInputs(TypedDict): @@ -428,11 +428,31 @@ def _get_prompt_replacements( return [ PromptReplacement( modality="image", - target="", # An empty target is never matched against - replacement="" * max_image_tokens, + target="", + replacement="" * max_image_tokens + "", ) ] + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the trailing bos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], @@ -596,7 +616,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - BLIP2_IMAGE_TOKEN_ID) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e027579cdb8e..9f8d84673361 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -86,7 +86,7 @@ def _get_mm_fields_config( hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) + return dict(pixel_values=MultiModalFieldConfig.batched("image")) def _get_prompt_replacements( self, @@ -722,7 +722,7 @@ def forward(self, pixel_values: torch.Tensor): for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): hidden_state 
= self.down[i_level].block[i_block]( - hidden_states[-1], ) + hidden_states[-1]) if len(self.down[i_level].attn) > 0: hidden_state = self.down[i_level].attn[i_block]( hidden_state) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 242410d9d554..0fe72396bf6d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -56,40 +56,41 @@ class FuyuImagePixelInputs(TypedDict): data: torch.Tensor """ Shape: - (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) + `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` """ image_input_ids: torch.Tensor -def _calculate_num_image_tokens( - height: int, - width: int, +def _get_fuyu_num_image_tokens( + image_height: int, + image_width: int, ) -> Tuple[int, int]: """ - calculate number of image tokens needed for a given image size - The expected Fuyu image prompts is in format: + Calculate the number of image tokens needed for a given image size. + + The expected Fuyu image prompts can be expressed as: + + .. code-block:: (image_token * ncols + newline_token) * nrows - args: - image_size: Tuple[int, int] - (width, height) of the image - returns: - ncols: int - number of image tokens in x direction - nrows: int - number of image tokens in y direction + + Args: + image_size: Tuple[int, int] - `(width, height)` of the image + + Returns: + ncols: int - number of image tokens in `x` direction + nrows: int - number of image tokens in `y` direction """ - ncol = math.ceil(width / 30) - nrow = math.ceil(height / 30) + ncol = math.ceil(image_width / 30) + nrow = math.ceil(image_height / 30) return ncol, nrow -def get_max_fuyu_image_feature_size(): - - return _calculate_num_image_tokens( - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, +def get_max_fuyu_image_tokens(ctx: InputContext): + ncol, nrow = _get_fuyu_num_image_tokens( + image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, ) - -def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = get_max_fuyu_image_feature_size() return (ncol + 1) * nrow @@ -162,13 +163,14 @@ def _get_prompt_replacements( def _get_dummy_mm_inputs(self, mm_counts): num_images = mm_counts.get("image", 0) + image = Image.new( "RGB", (MAX_IMAGE_FEATURE_SIZE_WIDTH, MAX_IMAGE_FEATURE_SIZE_HEIGHT), color=0, ) - mm_data = dict(image=image if num_images == 1 else [image] * - num_images) + mm_data = {"image": image if num_images == 1 else [image] * num_images} + return ProcessorInputs( prompt_text="", mm_data=mm_data, From ea767599e92319a0b1096a29b569d5c112c1e3b0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:15:47 +0000 Subject: [PATCH 22/40] Clean up Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0fe72396bf6d..b2432e766e1d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -80,18 +80,18 @@ def _get_fuyu_num_image_tokens( ncols: int - number of image tokens in `x` direction nrows: int - number of image tokens in `y` direction """ - ncol = math.ceil(image_width / 30) - nrow = math.ceil(image_height / 30) - return ncol, nrow + ncols = math.ceil(image_width / 30) + nrows = math.ceil(image_height / 30) + return ncols, nrows def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = _get_fuyu_num_image_tokens( + ncols, nrows = 
_get_fuyu_num_image_tokens( image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, ) - return (ncol + 1) * nrow + return (ncols + 1) * nrows class FuyuMultiModalProcessor(BaseMultiModalProcessor): From bc976a7a1f125939f497c3876f3195802059a83c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:30:03 +0000 Subject: [PATCH 23/40] Try remove mark Signed-off-by: DarkLight1337 --- tests/models/decoder_only/vision_language/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 30473e79f89f..7db08166826e 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -199,7 +199,6 @@ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - marks=[large_gpu_mark(min_gb=48)], ), "glm4": VLMTestInfo( models=["THUDM/glm-4v-9b"], From f79f79a70dc492f9f075843f1c4610d15344915c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 08:47:12 +0000 Subject: [PATCH 24/40] Consolidate dummy data code Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aria.py | 23 ++++++--------- vllm/model_executor/models/blip.py | 18 ------------ vllm/model_executor/models/blip2.py | 13 +++++++-- vllm/model_executor/models/chameleon.py | 29 ++++++------------- vllm/model_executor/models/fuyu.py | 19 ++++++------ vllm/model_executor/models/qwen2_audio.py | 14 +++++---- vllm/model_executor/models/qwen2_vl.py | 17 +++++------ vllm/model_executor/models/ultravox.py | 13 +++++---- vllm/multimodal/processing.py | 35 +++++++++++++++++++++-- 9 files changed, 95 insertions(+), 86 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 69587aa5ddb7..0648d98ac405 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -4,7 +4,6 @@ import torch import torch.nn as nn -from PIL import Image from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig @@ -453,17 +452,6 @@ def get_max_aria_image_tokens(ctx: InputContext): return max(image_size2tokens.values()) -def dummy_image_for_aria( - vision_config: AriaVisionConfig, - num_images: int, -): - max_image_size = vision_config.image_size - image = Image.new("RGB", (max_image_size, max_image_size), color=0) - images = [image] * num_images - - return {"image": images} - - class AriaMultiModalProcessor(BaseMultiModalProcessor): def _get_mm_fields_config( @@ -501,16 +489,23 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config() vision_config: AriaVisionConfig = hf_config.vision_config + + max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) - data = dummy_image_for_aria(vision_config, num_images) + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } hf_processor = self._get_hf_processor() image_token: str = hf_processor.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 129a0bcecc86..987dfaf44f22 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,7 +4,6 @@ import torch import 
torch.nn as nn -from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention @@ -28,23 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def dummy_image_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c6456310b7df..bf70f5d904f5 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -23,7 +23,7 @@ PromptReplacement) from vllm.sequence import IntermediateTensors -from .blip import BlipVisionModel, dummy_image_for_blip +from .blip import BlipVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -459,13 +459,20 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(Blip2Config) vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) - data = dummy_image_for_blip(vision_config, num_images) + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } return ProcessorInputs( prompt_text="", - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 9f8d84673361..85fca23b0574 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -3,9 +3,8 @@ Tuple, TypedDict, Union) import torch +import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torch import nn from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, ChameleonVQVAEConfig) @@ -59,23 +58,6 @@ def get_max_chameleon_image_tokens(ctx: InputContext): return CHAMELEON_IMAGE_SEQ_LENGTH -def dummy_image_for_chameleon( - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = CHAMELEON_CROP_SIZE_WIDTH - height = CHAMELEON_CROP_SIZE_HEIGHT - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - class ChameleonMultiModalProcessor(BaseMultiModalProcessor): def _get_hf_processor(self) -> ChameleonProcessor: @@ -114,11 +96,16 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - data = dummy_image_for_chameleon(num_images) + mm_data = { + "image": + self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH, + height=CHAMELEON_CROP_SIZE_HEIGHT, + num_images=num_images) + } return ProcessorInputs( 
prompt_text="" * num_images, - mm_data=data, + mm_data=mm_data, ) def apply( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b2432e766e1d..d9c91234ac2b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -20,8 +20,6 @@ import torch import torch.nn as nn -import torch.utils.checkpoint -from PIL import Image from transformers import BatchFeature, FuyuProcessor from vllm.attention import AttentionMetadata @@ -161,15 +159,18 @@ def _get_prompt_replacements( ) ] - def _get_dummy_mm_inputs(self, mm_counts): + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - image = Image.new( - "RGB", - (MAX_IMAGE_FEATURE_SIZE_WIDTH, MAX_IMAGE_FEATURE_SIZE_HEIGHT), - color=0, - ) - mm_data = {"image": image if num_images == 1 else [image] * num_images} + mm_data = { + "image": + self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + num_images=num_images) + } return ProcessorInputs( prompt_text="", diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 25a351bd9c65..deb6987e7f16 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,6 @@ from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn from transformers import BatchFeature @@ -181,16 +180,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|AUDIO|>" * audio_count, - mm_data=data, + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 574845ef5a52..a84adcbca962 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -30,7 +30,6 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from PIL import Image from transformers import BatchFeature from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) @@ -891,12 +890,10 @@ def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token image_processor = _get_image_processor(hf_processor) - data = {} + image_token: str = hf_processor.image_token resized_height, resized_width = smart_resize( height=9999999, width=9999999, @@ -904,14 +901,18 @@ def _get_dummy_mm_inputs( min_pixels=image_processor.min_pixels, max_pixels=image_processor.max_pixels, ) + num_images = mm_counts.get("image", 0) - dummy_image = Image.new("RGB", (resized_width, resized_height), - color=0) - data["image"] = [dummy_image] * num_images + mm_data = { + "image": + self._get_dummy_images(width=resized_width, + height=resized_height, + num_images=num_images) + } return ProcessorInputs( prompt_text=image_token * 
num_images, - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7b4aeeec5f40..8234ce62fb49 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -192,16 +192,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|audio|>" * audio_count, - mm_data=data, + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index f7fb5d3bba51..96812f42b864 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -8,9 +8,10 @@ from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np +import numpy.typing as npt import torch from blake3 import blake3 -from PIL.Image import Image +from PIL import Image from transformers import BatchFeature, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext @@ -513,7 +514,7 @@ def _serialize_item(self, obj: object) -> bytes: return obj.encode("utf-8") if isinstance(obj, bytes): return obj - if isinstance(obj, Image): + if isinstance(obj, Image.Image): return obj.tobytes() # Convertible to NumPy arrays @@ -1007,6 +1008,36 @@ def apply( mm_placeholders=mm_placeholders, ) + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + @abstractmethod def _get_dummy_mm_inputs( self, From 45ec10cb61d403c1a0a5b89ba5e27e1212bcc8ec Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 16:58:49 +0800 Subject: [PATCH 25/40] fix fuyu variant images test Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d9c91234ac2b..e1c63a511b0e 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -36,6 +36,7 @@ MultiModalFieldConfig, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors +from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -117,8 +118,10 @@ def _call_hf_processor( return_tensors="pt") # Drop begin token since it doesn't belong to image_input_ids processed_outputs["image_input_ids"] = image_input_ids[:, 2:] - processed_outputs["pixel_values"] = processed_outputs.pop( - "image_patches") + processed_outputs["pixel_values"] = [ + image_patch[0] + for 
image_patch in processed_outputs.pop("image_patches") + ] else: # FuyuProcessor won't add bos and boa if no images inputs, we add # them back manually @@ -265,6 +268,8 @@ def _process_image_input( image_sizes = [ len(input_ids_per_image) for input_ids_per_image in image_input_ids ] + if is_list_of(image_input_ids, torch.Tensor): + image_input_ids = torch.cat(image_input_ids) image_input_ids = torch.flatten(image_input_ids) image_token_mask = image_input_ids == _IMAGE_TOKEN_ID From 0fe561d45bff05a2cd1beef31632e0ef0149bbf4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 09:01:19 +0000 Subject: [PATCH 26/40] Fix some type errors in Pixtral-HF Signed-off-by: DarkLight1337 --- vllm/model_executor/models/pixtral.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 22d29f5bbc50..2bce13792a88 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,8 +1,8 @@ +import math from dataclasses import dataclass, fields from functools import cached_property from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union -import numpy import torch import torch.nn as nn import torch.nn.functional as F @@ -306,7 +306,7 @@ def _parse_and_validate_image_input( images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> Optional[List[torch.Tensor]]: + ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -604,11 +604,11 @@ def max_patches_per_side(self) -> int: return self.args.image_size // self.args.patch_size @property - def device(self) -> torch.device: + def device(self) -> torch.types.Device: return next(self.parameters()).device @property - def dtype(self) -> torch.device: + def dtype(self) -> torch.dtype: return next(self.parameters()).dtype @property @@ -741,8 +741,8 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, ratio = max(image_width / max_width, image_height / max_height) if ratio > 1: - image_width = int(numpy.ceil(image_width / ratio)) - image_height = int(numpy.ceil(image_height / ratio)) + image_width = int(math.ceil(image_width / ratio)) + image_height = int(math.ceil(image_height / ratio)) num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( (image_height, image_width), From 3512ed6034d19094882980ee659a63145a6cceb9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 17:08:28 +0800 Subject: [PATCH 27/40] fix missing flatten_bn in fuyu Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index e1c63a511b0e..88ba9cede5f9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -248,7 +248,7 @@ def _parse_and_validate_image_input( return FuyuImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + flatten_bn(flatten_bn(pixel_values), concat=True)), image_input_ids=flatten_bn(image_input_ids), ) From 5e0f66c3712f7c22bd365debfb2a6d2873fb7dbf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 09:23:11 +0000 Subject: [PATCH 28/40] Update docs Signed-off-by: DarkLight1337 --- vllm/multimodal/processing.py | 28 
+++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index a549180e2cb5..44238262704e 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -675,10 +675,14 @@ def _get_prompt_replacements( Given the original multi-modal items for this modality and HF-processed data, output the replacements to perform. - Note: - Even when the HF processor already performs replacement for us, - we still use this replacement information to determine - the placeholder token positions for each multi-modal item. + Notes: + - You should not assume that HF processor always performs prompt + replacement: in :meth:`_apply_hf_processor_missing`, this method + is called on text-only and multimodal-only inputs separately, + instead of passing them in the same call. + - The replacement information returned by this method is also used + to determine the placeholder token positions for each multi-modal + item. """ raise NotImplementedError @@ -712,6 +716,10 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + """ + Call the HF processor on the prompt text and + associated multi-modal data. + """ return self.ctx.call_hf_processor( self._get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), @@ -725,7 +733,8 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Apply the HF processor on the full prompt text and multi-modal data. + Wrapper of :meth:`_call_hf_processor` that applies + additional pre-processing and post-processing. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -756,10 +765,11 @@ def _apply_hf_processor_missing( Apply the HF processor on the full prompt text, but only on the multi-modal data that are missing from the cache. - Note: We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - for the full text. + Note: + We pass prompt text and multi-modal data into the HF processor + in separate calls to avoid HF prompt replacement being done for + cached items; instead, we rely on our own prompt replacement logic + for the full text. """ mm_missing_counts = mm_missing_data_items.get_all_counts() From 1c243abe2ad7319c6bd4f4f714394395a0c93bd4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 09:24:51 +0000 Subject: [PATCH 29/40] Update docs Signed-off-by: DarkLight1337 --- vllm/multimodal/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 44238262704e..7712c3bcebe2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -769,7 +769,7 @@ def _apply_hf_processor_missing( We pass prompt text and multi-modal data into the HF processor in separate calls to avoid HF prompt replacement being done for cached items; instead, we rely on our own prompt replacement logic - for the full text. + (:meth:`_get_prompt_replacements`) for the full text. 
""" mm_missing_counts = mm_missing_data_items.get_all_counts() From 09d64f46e9015c71a8d1e0229a2fe5a524294942 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:16:39 +0000 Subject: [PATCH 30/40] Get fuyu processor tests to pass Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 33 +++--- vllm/model_executor/models/fuyu.py | 158 +++++++++++++++------------- vllm/model_executor/models/llava.py | 4 +- 3 files changed, 100 insertions(+), 95 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 43fb6e4e25e7..f51f20451382 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -528,7 +528,7 @@ def _rand_audio( def _test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -583,9 +583,8 @@ def _test_processing_cache_correctness( partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), } input_max_count = { - "image": 3, - "video": 3, - "audio": 3, + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() } for batch_idx in range(num_batches): @@ -624,16 +623,16 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image"}), - ("Salesforce/blip2-opt-2.7b", {"image"}), - ("facebook/chameleon-7b", {"image"}), - ("adept/fuyu-8b", {"image"}), - ("llava-hf/llava-1.5-7b-hf", {"image"}), - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), - ("mistral-community/pixtral-12b", {"image"}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}), - ("fixie-ai/ultravox-v0_3", {"audio"}), + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", [("image", False)]), + ("facebook/chameleon-7b", {"image": True}), + ("adept/fuyu-8b", [("image", False)]), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -641,7 +640,7 @@ def _test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -657,7 +656,7 @@ def test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image"}), + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -665,7 +664,7 @@ def test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness_phi3v( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 88ba9cede5f9..0c8d4a982167 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -20,7 +20,7 @@ import torch import torch.nn as nn -from transformers import BatchFeature, FuyuProcessor 
+from transformers import BatchFeature, FuyuConfig, FuyuProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -29,14 +29,15 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, - MultiModalFieldConfig, ProcessorInputs, + MultiModalDataItems, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -50,14 +51,13 @@ MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 -class FuyuImagePixelInputs(TypedDict): - type: Literal["pixel_values"] +class FuyuImagePatchInputs(TypedDict): + type: Literal["image_patches"] data: torch.Tensor """ Shape: `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` """ - image_input_ids: torch.Tensor def _get_fuyu_num_image_tokens( @@ -104,35 +104,26 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - tokenizer = self._get_tokenizer() - processed_outputs = super()._call_hf_processor(prompt, mm_data, - mm_kwargs) - if "image_patches" in processed_outputs: - # separate image_input_ids from input_ids if has image inputs - new_prompt = tokenizer.decode(processed_outputs["input_ids"][0], - skip_special_tokens=True) - image_prompt = new_prompt.split("")[0] - # we can't set add_special_tokens=False here, because placeholder - # and newline are all special tokens - image_input_ids = tokenizer.encode(image_prompt, - return_tensors="pt") - # Drop begin token since it doesn't belong to image_input_ids - processed_outputs["image_input_ids"] = image_input_ids[:, 2:] - processed_outputs["pixel_values"] = [ - image_patch[0] - for image_patch in processed_outputs.pop("image_patches") - ] - else: - # FuyuProcessor won't add bos and boa if no images inputs, we add - # them back manually - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] - prompt_ids = tokenizer.encode( - prompt, - add_special_tokens=False, # type: ignore - ) - prompt_ids = bos_token + prompt_ids + boa_token - processed_outputs["input_ids"] = torch.tensor([prompt_ids]) + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + return processed_outputs def _get_mm_fields_config( @@ -140,10 +131,7 @@ def _get_mm_fields_config( 
hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_input_ids=MultiModalFieldConfig.batched("image"), - ) + return dict(image_patches=MultiModalFieldConfig.batched("image")) def _get_prompt_replacements( self, @@ -151,17 +139,54 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - image_input_ids = out_mm_kwargs.get("image_input_ids", []) - if isinstance(image_input_ids, torch.Tensor): - image_input_ids = image_input_ids.squeeze(0).tolist() + hf_config = self.ctx.get_hf_config(FuyuConfig) + bos_token_id = hf_config.bos_token_id + + tokenizer = self._get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = _get_fuyu_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + + [bos_token_id, boa_token_id]) + return [ PromptReplacement( modality="image", - target="", - replacement=image_input_ids, + target=[eot_token_id], + replacement=get_replacement_fuyu, ) ] + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only |SPEAKER| (image) tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id and boa_token_id + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 2) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], @@ -237,47 +262,28 @@ def _validate_shape(d: torch.Tensor): return data.to(self.vision_embed_tokens.weight.dtype) def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: - pixel_values = kwargs.pop("pixel_values", None) - image_input_ids = kwargs.pop("image_input_ids", None) - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + self, **kwargs: object) -> Optional[FuyuImagePatchInputs]: + image_patches = kwargs.pop("image_patches", None) + if image_patches is not None: + if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. 
" - f"Got type: {type(pixel_values)}") + f"Got type: {type(image_patches)}") - return FuyuImagePixelInputs( - type="pixel_values", + return FuyuImagePatchInputs( + type="image_patches", data=self._validate_pixel_values( - flatten_bn(flatten_bn(pixel_values), concat=True)), - image_input_ids=flatten_bn(image_input_ids), + flatten_bn(image_patches, concat=True)), ) return None def _process_image_input( - self, image_input: FuyuImagePixelInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> torch.Tensor: assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) hidden_size = vision_embeddings.shape[-1] - vision_embeddings = vision_embeddings.reshape(-1, hidden_size) - - # NOTE: image_input_ids contains both image placeholder tokens and - # newline tokens. - image_input_ids = image_input["image_input_ids"] - image_sizes = [ - len(input_ids_per_image) for input_ids_per_image in image_input_ids - ] - if is_list_of(image_input_ids, torch.Tensor): - image_input_ids = torch.cat(image_input_ids) - image_input_ids = torch.flatten(image_input_ids) - - image_token_mask = image_input_ids == _IMAGE_TOKEN_ID - full_vision_embeddings = self.language_model.get_input_embeddings( - image_input_ids) - full_vision_embeddings[image_token_mask] = vision_embeddings - - return torch.split(full_vision_embeddings, image_sizes) + return vision_embeddings.reshape(-1, hidden_size) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 1d6ee2a0be72..34dc7fa31ce6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -144,8 +144,8 @@ def _call_hf_processor( # Original output: (1, num_images, C, H, W) # New output: (num_images, C, H, W) assert (isinstance(pixel_values, list) - and len(pixel_values) == 1 - and isinstance(pixel_values[0], list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) and len(pixel_values[0]) == len(images)) processed_outputs["pixel_values"] = pixel_values[0] From 6d6d71c474bbfeb52150cc95362335f24feb7a17 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:29:42 +0000 Subject: [PATCH 31/40] Oops Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index f51f20451382..81278cde264f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -624,9 +624,9 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", [("image", False)]), + ("Salesforce/blip2-opt-2.7b", {"image": False}), ("facebook/chameleon-7b", {"image": True}), - ("adept/fuyu-8b", [("image", False)]), + ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), ("mistral-community/pixtral-12b", {"image": True}), From ea93a2c42a3754e732c9ab19f085ef2fe7166db3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:47:26 +0000 Subject: [PATCH 32/40] Fix unable to run model Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0c8d4a982167..9cb684821a30 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -301,7 +301,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - [_IMAGE_TOKEN_ID, _NEWLINE_TOKEN_ID]) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( From 9aeb7b2bc2af5068e201a0c4c455827757b1d25e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 11:51:30 +0000 Subject: [PATCH 33/40] Avoid warning from HF Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 9cb684821a30..d5a458ad8565 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -104,6 +104,12 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + if not mm_data: + # Avoid warning from HF logger for text-only input + tokenizer = self._get_tokenizer() + processed_outputs = tokenizer(prompt).data # type: ignore + return BatchFeature(processed_outputs) + processed_outputs = super()._call_hf_processor( prompt=prompt, mm_data=mm_data, From 768c1d9bdc7096f9481b27c45fa54735aa7cd1a8 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 21:00:43 +0800 Subject: [PATCH 34/40] fix too large image for fuyu Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d5a458ad8565..4926ef93a150 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -20,7 +20,8 @@ import torch import torch.nn as nn -from transformers import BatchFeature, FuyuConfig, FuyuProcessor +from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, + FuyuProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -107,7 +108,8 @@ def _call_hf_processor( if not mm_data: # Avoid warning from HF logger for text-only input tokenizer = self._get_tokenizer() - processed_outputs = tokenizer(prompt).data # type: ignore + processed_outputs = tokenizer( + prompt, return_tensors="pt").data # type: ignore return BatchFeature(processed_outputs) processed_outputs = super()._call_hf_processor( @@ -153,13 +155,28 @@ def _get_prompt_replacements( assert isinstance(eot_token_id, int) boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + hf_processor = self._get_hf_processor() + image_processor: FuyuImageProcessor = hf_processor.image_processor + target_size = image_processor.size + target_height, target_width = (target_size["height"], + target_size["width"]) + def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) + width, height = image_size.width, image_size.height + if not (width <= target_width and height <= target_height): + height_scale_factor = target_height / height + width_scale_factor = target_width / width + optimal_scale_factor = min(height_scale_factor, + width_scale_factor) + + height = int(height * optimal_scale_factor) + width = int(width * optimal_scale_factor) ncols, nrows = _get_fuyu_num_image_tokens( - 
image_width=image_size.width, - image_height=image_size.height, + image_width=width, + image_height=height, ) return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + From 0c82c512fcf80102e3c0d63ab431529a8f7725f9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 21:37:54 +0800 Subject: [PATCH 35/40] fix prompt token ids Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 4926ef93a150..d9a1c00432d2 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -105,12 +105,18 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + if not mm_data: # Avoid warning from HF logger for text-only input + # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id + # Tokenizer won't add boa_token_id by default, we add it manually. tokenizer = self._get_tokenizer() - processed_outputs = tokenizer( - prompt, return_tensors="pt").data # type: ignore - return BatchFeature(processed_outputs) + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + processed_outputs = tokenizer(prompt).data # type: ignore + processed_outputs["input_ids"] = [ + processed_outputs["input_ids"] + [boa_token_id] + ] + return BatchFeature(processed_outputs, tensor_type="pt") processed_outputs = super()._call_hf_processor( prompt=prompt, @@ -153,7 +159,6 @@ def _get_prompt_replacements( tokenizer = self._get_tokenizer() eot_token_id = tokenizer.bos_token_id assert isinstance(eot_token_id, int) - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore hf_processor = self._get_hf_processor() image_processor: FuyuImageProcessor = hf_processor.image_processor @@ -180,7 +185,7 @@ def get_replacement_fuyu(item_idx: int): ) return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + - [bos_token_id, boa_token_id]) + [bos_token_id]) return [ PromptReplacement( @@ -199,10 +204,10 @@ def apply( result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) # Only |SPEAKER| (image) tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id and boa_token_id + # so we ignore the trailing bos_token_id result["mm_placeholders"] = { modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 2) + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) for p in ps ] for modality, ps in result["mm_placeholders"].items() @@ -295,7 +300,7 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - flatten_bn(image_patches, concat=True)), + flatten_bn(flatten_bn(image_patches), concat=True)), ) return None From d0d1fdc4d3f291721ecb2a9bcd3e8b59cfeba915 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 14:41:33 +0000 Subject: [PATCH 36/40] Fix missing batch dimension in vision embeddings Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d9a1c00432d2..e9d50a2de90c 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -300,18 +300,21 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - 
flatten_bn(flatten_bn(image_patches), concat=True)), + flatten_bn(image_patches, concat=True)), ) return None def _process_image_input( self, image_input: FuyuImagePatchInputs) -> torch.Tensor: + image_patches = image_input["data"] assert self.vision_embed_tokens is not None - vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - hidden_size = vision_embeddings.shape[-1] - return vision_embeddings.reshape(-1, hidden_size) + vision_embeddings, _ = self.vision_embed_tokens(image_patches) + + batch_size, num_patches, _ = image_patches.shape + _, _, hidden_size = vision_embeddings.shape + return vision_embeddings.reshape(batch_size, num_patches, hidden_size) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) From afcf7b18a0f0e1f7515a7f7c928f8b3c60c3d17d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 31 Dec 2024 23:10:28 +0800 Subject: [PATCH 37/40] fix variant patches batching Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index e9d50a2de90c..a8ef3cb2872a 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -59,6 +59,10 @@ class FuyuImagePatchInputs(TypedDict): Shape: `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` """ + patches_per_image: List[int] + """ + List of number of total patches for each image in the batch. + """ def _get_fuyu_num_image_tokens( @@ -300,21 +304,22 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - flatten_bn(image_patches, concat=True)), + flatten_bn(flatten_bn(image_patches), concat=True)), + patches_per_image=[ + x.size(0) for x in flatten_bn(image_patches) + ], ) return None def _process_image_input( - self, image_input: FuyuImagePatchInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> NestedTensors: image_patches = image_input["data"] + patches_per_image = image_input["patches_per_image"] assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_patches) - - batch_size, num_patches, _ = image_patches.shape - _, _, hidden_size = vision_embeddings.shape - return vision_embeddings.reshape(batch_size, num_patches, hidden_size) + return vision_embeddings.split(patches_per_image, dim=0) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) From cb9522d2836d8283c2e4d2337736ae182a05ba65 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 15:21:46 +0000 Subject: [PATCH 38/40] Simplify the code Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index a8ef3cb2872a..ada6990d342a 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -57,11 +57,13 @@ class FuyuImagePatchInputs(TypedDict): data: torch.Tensor """ Shape: - `(batch_size, num_patches, patch_size_x * patch_size_y * num_channels)` + `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` """ + patches_per_image: List[int] """ List of number of total patches for each image in the batch. 
+ This is used to restore the first two dimensions of `data`. """ @@ -116,11 +118,8 @@ def _call_hf_processor( # Tokenizer won't add boa_token_id by default, we add it manually. tokenizer = self._get_tokenizer() boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore - processed_outputs = tokenizer(prompt).data # type: ignore - processed_outputs["input_ids"] = [ - processed_outputs["input_ids"] + [boa_token_id] - ] - return BatchFeature(processed_outputs, tensor_type="pt") + prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") processed_outputs = super()._call_hf_processor( prompt=prompt, @@ -300,13 +299,15 @@ def _parse_and_validate_image_input( if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " f"Got type: {type(image_patches)}") + + image_patches_flat = flatten_bn(image_patches) return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( - flatten_bn(flatten_bn(image_patches), concat=True)), + flatten_bn(image_patches_flat, concat=True)), patches_per_image=[ - x.size(0) for x in flatten_bn(image_patches) + x.size(0) for x in image_patches_flat ], ) From df832dfc9537ed0cbe48011ef31bb8b9a9d0c1ab Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 31 Dec 2024 15:39:17 +0000 Subject: [PATCH 39/40] format Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index ada6990d342a..8c14866f20b9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -299,16 +299,14 @@ def _parse_and_validate_image_input( if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " f"Got type: {type(image_patches)}") - + image_patches_flat = flatten_bn(image_patches) return FuyuImagePatchInputs( type="image_patches", data=self._validate_pixel_values( flatten_bn(image_patches_flat, concat=True)), - patches_per_image=[ - x.size(0) for x in image_patches_flat - ], + patches_per_image=[x.size(0) for x in image_patches_flat], ) return None From cc9c5f1fc8aa4ac8dea8aa712a79875f73f80c97 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 31 Dec 2024 19:28:32 +0000 Subject: [PATCH 40/40] simplify Signed-off-by: Roger Wang --- vllm/model_executor/models/aria.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 0648d98ac405..4ad6e859f4d9 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,4 +1,3 @@ -import math from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -445,11 +444,7 @@ def build_mm_projector(config: PretrainedConfig): def get_max_aria_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config() - image_size2tokens = { - int(math.sqrt(k) * hf_config.vision_config.patch_size): v - for k, v in hf_config.projector_patch_to_query_dict.items() - } - return max(image_size2tokens.values()) + return max(hf_config.projector_patch_to_query_dict.values()) class AriaMultiModalProcessor(BaseMultiModalProcessor):
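
Notes on the series (illustrative sketches, not part of any patch above):

1. The final aria.py hunk drops the sqrt/patch-size dict comprehension because the
   maximum was always taken over the values of projector_patch_to_query_dict; the
   rewritten keys never influenced it. The two expressions agree whenever the
   computed image sizes are distinct, so no dict entry is dropped. A minimal
   standalone check, with a made-up mapping and patch size:

    import math

    patch_to_query = {4900: 256, 9216: 128}  # hypothetical values
    patch_size = 14                           # hypothetical value

    old = max({int(math.sqrt(k) * patch_size): v
               for k, v in patch_to_query.items()}.values())
    new = max(patch_to_query.values())
    assert old == new == 256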
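
2. After the follow-up fixes, the Fuyu prompt replacement per image is a row-major
   grid of |SPEAKER| tokens with a |NEWLINE| token closing each row, plus a single
   trailing BOS token; apply() then shortens each reported placeholder range by one
   so that only the image tokens count as placeholders. A small illustration (token
   ids are stand-ins, not the real vocab values):

    IMAGE_TOKEN_ID, NEWLINE_TOKEN_ID, BOS_TOKEN_ID = 100, 101, 1  # hypothetical ids

    ncols, nrows = 3, 2  # a small 3x2 patch grid
    replacement = ([IMAGE_TOKEN_ID] * ncols + [NEWLINE_TOKEN_ID]) * nrows \
        + [BOS_TOKEN_ID]
    # -> [100, 100, 100, 101, 100, 100, 100, 101, 1]

    # Placeholder length reported for this image: the grid only, not the BOS.
    placeholder_length = len(replacement) - 1
    assert placeholder_length == ncols * nrows + nrows  # 8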
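
3. "fix too large image for fuyu" reproduces the image processor's scale-to-fit step
   before counting patches, so the computed placeholder count matches the number of
   patches actually produced for oversized inputs. A worked example of the
   arithmetic, assuming the usual 1080x1920 (height x width) target size:

    target_height, target_width = 1080, 1920  # assumed FuyuImageProcessor.size
    width, height = 3840, 2160                # hypothetical oversized input

    if not (width <= target_width and height <= target_height):
        height_scale_factor = target_height / height   # 0.5
        width_scale_factor = target_width / width      # 0.5
        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
        height = int(height * optimal_scale_factor)    # 1080
        width = int(width * optimal_scale_factor)      # 1920

    # (width, height) is then fed to _get_fuyu_num_image_tokens.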
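
4. The last Fuyu patches settle on flattening every image's patches into one tensor
   for the vision embedding and using patches_per_image to restore the per-image
   grouping afterwards, which also handles ragged batches. A minimal sketch of that
   round-trip with a stand-in linear layer (shapes are illustrative only):

    import torch
    import torch.nn as nn

    # Two images with different patch counts; 2700 = 30 * 30 * 3 per flattened patch.
    patches = [torch.randn(308, 2700), torch.randn(180, 2700)]
    patches_per_image = [x.size(0) for x in patches]   # [308, 180]

    flat = torch.cat(patches, dim=0)                   # what flatten_bn(..., concat=True) yields
    vision_embed_tokens = nn.Linear(2700, 4096)        # stand-in for the real projection
    vision_embeddings = vision_embed_tokens(flat)      # (488, 4096)

    # Split back so each image contributes its own (num_patches, hidden_size)
    # block for merging at its placeholder positions.
    per_image = vision_embeddings.split(patches_per_image, dim=0)
    assert [tuple(t.shape) for t in per_image] == [(308, 4096), (180, 4096)]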