diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index db650b37a38..f9235618661 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -558,6 +558,7 @@ Specified using `--task generate`.
| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index b9e8bef26eb..b136b14cd8e 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -429,6 +429,37 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
+# Keye-VL
+def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
+ model_name = "Kwai-Keye/Keye-VL-8B-Preview"
+
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=8192,
+ trust_remote_code=True,
+ limit_mm_per_prompt={modality: 1},
+ )
+
+ if modality == "image":
+ placeholder = "<|image_pad|>"
+ elif modality == "video":
+ placeholder = "<|video_pad|>"
+
+ prompts = [
+ (
+ f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+ f"{question}<|im_end|>\n"
+ "<|im_start|>assistant\n"
+ )
+ for question in questions
+ ]
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
+
+
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1154,6 +1185,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3,
"internvl_chat": run_internvl,
+ "keye_vl": run_keye_vl,
"kimi_vl": run_kimi_vl,
"llava": run_llava,
"llava-next": run_llava_next,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index edddd429364..13af8e90419 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -423,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+ model_name = "Kwai-Keye/Keye-VL-8B-Preview"
+
+ engine_args = EngineArgs(
+ model=model_name,
+ trust_remote_code=True,
+ max_model_len=8192,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"image": len(image_urls)},
+ )
+
+ placeholders = [{"type": "image", "image": url} for url in image_urls]
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ *placeholders,
+ {"type": "text", "text": question},
+ ],
+ },
+ ]
+
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+ prompt = processor.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+
+ image_data = [fetch_image(url) for url in image_urls]
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ image_data=image_data,
+ )
+
+
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
@@ -862,6 +899,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
+ "keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl,
"llava": load_llava,
"llava-next": load_llava_next,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index affe2e88b2b..d58eb513f52 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -349,6 +349,8 @@ def check_available_online(
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
+ "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
+ trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 45f1894d022..a090ab5c90a 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -537,7 +537,7 @@ def _placeholder_str(self, modality: ModalityStr,
return ""
if model_type in ("mllama", "llama4"):
return "<|image|>"
- if model_type in ("qwen2_vl", "qwen2_5_vl"):
+ if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "qwen2_5_omni":
return "<|vision_start|><|IMAGE|><|vision_end|>"
@@ -567,7 +567,7 @@ def _placeholder_str(self, modality: ModalityStr,
return "