From de50b1266efae569277397aa6bedf596326e1468 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Mar 2025 23:17:44 +0800
Subject: [PATCH 1/2] draft mllama

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/source/contributing/model/multimodal.md | 47 +++++++++++++++++++-
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 9cbfc32991f0..2a05a5b4515b 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -598,8 +598,8 @@ def get_dummy_processor_inputs(
 
 ## 4. Specify processing details
 
-Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
-to fill in the missing details about HF processing.
+Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (decoder-only models) /
+{class}`~vllm.multimodal.processing.EncDecMultiModalProcessor` (encoder-decoder models) to fill in the missing details about HF processing.
 
 :::{seealso}
 [Multi-Modal Data Processing](#mm-processing)
 :::
@@ -932,6 +932,49 @@ def _get_prompt_updates(
 
 ::::
 
+### (Optional) Encoder-Decoder prompt construction
+If your model has an encoder-decoder architecture, you also need to implement `create_encoder_prompt`
+to indicate how to create the encoder prompt from an implicit text/tokens prompt.
+
+::::{tab-set}
+:::{tab-item} Cross-modality example: Mllama
+:sync: mllama
+
+For models like Mllama and Whisper, the encoder only accepts processed modality data. However, to support cross-attention
+profiling, we still need to provide a "fake" encoder prompt to profile the sequence length occupied by the encoder hidden states.
+
+In this case, we can treat the encoder prompt as feature tokens created by prompt updates.
+
+Therefore, we just need to provide one image token per image as the encoder prompt, and let prompt updates construct
+the final encoder prompt for us automatically.
+
+```python
+def create_encoder_prompt(
+    self,
+    prompt: Union[str, list[int]],
+    mm_data: MultiModalDataDict,
+) -> Union[str, list[int]]:
+    data = mm_data.get("image", [])
+    num_images = 1 if isinstance(data, Image) else len(data)
+    image_token_id = self.info.get_hf_config().image_token_index
+    return [image_token_id] * num_images
+```
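+
+As a quick illustration of the hook above, here is a self-contained sketch of its behavior
+(not vLLM API; `IMAGE_TOKEN_ID` is a hypothetical placeholder value, whereas the real id comes
+from the HF config's `image_token_index` as shown above, and `Image` is `PIL.Image.Image`):
+
+```python
+from PIL import Image
+
+IMAGE_TOKEN_ID = 128256  # hypothetical placeholder id
+
+def fake_encoder_prompt(mm_data: dict) -> list[int]:
+    # Mirrors create_encoder_prompt above: one placeholder token per image,
+    # which prompt updates later expand into the full feature tokens.
+    data = mm_data.get("image", [])
+    num_images = 1 if isinstance(data, Image.Image) else len(data)
+    return [IMAGE_TOKEN_ID] * num_images
+
+img = Image.new("RGB", (8, 8))
+assert fake_encoder_prompt({"image": img}) == [IMAGE_TOKEN_ID]
+assert fake_encoder_prompt({"image": [img, img]}) == [IMAGE_TOKEN_ID] * 2
+```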
+
+:::
+
+:::{tab-item} Text-only cross-attention example: Florence-2
+:sync: florence2
+
+For Florence-2, cross-attention only occurs in its text backbone (Bart), which uses both
+`encoder_input_ids` and (decoder) `input_ids` in its forward pass.
+
+In this case, we need to provide appropriate encoder and decoder prompts to make sure the
+correct `encoder_input_ids` and `input_ids` are fed to the Bart backbone.
+
+Let's take a look at Florence-2's processed prompts:
+
+:::
+::::
 
 ## 5. Register processor-related classes
 
 After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2),

From e2dc76d2b49ac42a054b780030ae1ecd1b94c70c Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Mar 2025 23:53:56 +0800
Subject: [PATCH 2/2] florence-2

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 docs/source/contributing/model/multimodal.md | 54 ++++++++++++++++++--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
index 2a05a5b4515b..66ac30bedfcd 100644
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -933,8 +933,8 @@ def _get_prompt_updates(
 ::::
 
 ### (Optional) Encoder-Decoder prompt construction
-If your model has an encoder-decoder architecture, you also need to implement `create_encoder_prompt`
-to indicate how to create the encoder prompt from an implicit text/tokens prompt.
+If your model has an encoder-decoder architecture, you also need to implement `create_encoder_prompt` and
+`create_decoder_prompt` to indicate how to create the encoder/decoder prompts from an implicit text/tokens prompt.
 
 ::::{tab-set}
 :::{tab-item} Cross-modality example: Mllama
 :sync: mllama
@@ -943,7 +943,8 @@
 For models like Mllama and Whisper, the encoder only accepts processed modality data. However, to support cross-attention
 profiling, we still need to provide a "fake" encoder prompt to profile the sequence length occupied by the encoder hidden states.
 
-In this case, we can treat the encoder prompt as feature tokens created by prompt updates.
+In this case, we can treat the encoder prompt as feature tokens created by prompt updates, and the implicit prompt as the decoder
+`input_ids` (the default behavior of `create_decoder_prompt`).
 
 Therefore, we just need to provide one image token per image as the encoder prompt, and let prompt updates construct
 the final encoder prompt for us automatically.
@@ -971,7 +972,52 @@
 For Florence-2, cross-attention only occurs in its text backbone (Bart), which uses both
 `encoder_input_ids` and (decoder) `input_ids` in its forward pass.
 
 In this case, we need to provide appropriate encoder and decoder prompts to make sure the
 correct `encoder_input_ids` and `input_ids` are fed to the Bart backbone.
 
-Let's take a look at Florence-2's processed prompts:
+For the encoder part, we can treat the implicit prompt as the encoder prompt, because it will be processed
+by the HF processor and fed to the encoder directly as `encoder_input_ids`. We can let
+`create_encoder_prompt` return the original prompt as-is:
+
+```python
+def create_encoder_prompt(
+    self,
+    prompt: Union[str, list[int]],
+    mm_data: MultiModalDataDict,
+) -> Union[str, list[int]]:
+    return prompt
+```
+
+Then, let's move on to the decoder part and take a look at how HF's Bart prepares the decoder token IDs:
+
+```python
+import torch
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+    return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+```
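+
+To make the effect concrete, here is a small worked example building on the block above
+(a sketch with made-up token ids; `pad_token_id=1` and `decoder_start_token_id=2` are
+Bart's usual values, and `2` is also Bart's `eos_token_id`):
+
+```python
+labels = torch.tensor([[4783, 1322, 2]])  # hypothetical target ids ending in EOS
+shifted = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
+print(shifted)  # tensor([[   2, 4783, 1322]]) -- decoding starts from token id 2
+
+# With no labels at all, the decoder input degenerates to just the start token,
+# i.e. [decoder_start_token_id] == [eos_token_id] for Bart.
+```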
+ """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + +def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) +``` + +Given the code above, if we don't provide an explicit decoder prompt, the decoder prompt is just one EOS token, +so we can implement `create_decoder_prompt` as below: + +```python +def create_decoder_prompt( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, +) -> Union[str, list[int]]: + return [self.info.get_hf_config().eos_token_id] +``` ::::