11 changes: 11 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1181,6 +1181,17 @@ def _forward_step_inter_pp(self, scheduled_batch) -> SampleState:

def _validate_request(self, request: LlmRequest):
if isinstance(self.model_engine.model, DecoderModelForCausalLM):
+            # Only skip token-range checks for Llama4 when the request has multimodal data
+            from ..models.modeling_llama import Llama4ForConditionalGeneration
+            if isinstance(self.model_engine.model,
+                          Llama4ForConditionalGeneration):
+                has_mm = bool(request.py_multimodal_data)
+                if has_mm:
+                    logger.debug(
+                        f"Skipping token-range validation for {type(self.model_engine.model).__name__} "
+                        "(multimodal request)")
+                    return
+
# FIXME: This check is necessary because of how Qwen2ForProcessRewardModel
# subclasses DecoderModelForCausalLM. Perhaps the functionality
# of DecoderModelForCausalLM reused by Qwen2ForProcessRewardModel
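The hunk shows only the new early-return; the token-range validation it bypasses sits further down in _validate_request and is not part of this diff. As a rough, self-contained illustration of the resulting rule, using hypothetical stand-ins (FakeRequest, validate_tokens, and an assumed vocabulary-size bound) rather than the real PyExecutor and LlmRequest classes:

    # Hypothetical sketch, not the PR's code: multimodal Llama4 requests skip the
    # range check, text-only requests are still validated against a vocab bound.
    from dataclasses import dataclass, field

    @dataclass
    class FakeRequest:
        tokens: list[int]
        multimodal_data: dict = field(default_factory=dict)  # stands in for py_multimodal_data

    def validate_tokens(req: FakeRequest, vocab_size: int, is_llama4: bool) -> None:
        if is_llama4 and req.multimodal_data:
            # Multimodal prompt: placeholder token IDs may fall outside the
            # text vocabulary, so skip the range check entirely.
            return
        out_of_range = [t for t in req.tokens if not 0 <= t < vocab_size]
        if out_of_range:
            raise ValueError(f"token ids out of range [0, {vocab_size}): {out_of_range}")

    # Text-only request must stay in range ...
    validate_tokens(FakeRequest(tokens=[1, 5, 7]), vocab_size=32000, is_llama4=True)
    # ... while a multimodal request with an out-of-range placeholder ID passes through.
    validate_tokens(FakeRequest(tokens=[1, 200090], multimodal_data={"image": ["<tensor>"]}),
                    vocab_size=32000, is_llama4=True)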
19 changes: 9 additions & 10 deletions tests/unittest/_torch/multi_gpu_modeling/test_llama4.py
@@ -1,6 +1,7 @@
from difflib import SequenceMatcher

import pytest
+import torch
from utils.llm_data import llm_models_root

from tensorrt_llm import LLM, SamplingParams
@@ -43,19 +44,17 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph,
"This is a very long prompt to exercise long context. Count up to 10000 from 1, 2, 3,"
+ ", ".join(str(i) for i in range(4, 9000))
},
-        # TODO: Fix multimodal test.
-        # {
-        #     "prompt": "<|image|>This image is of color",
-        #     "multi_modal_data": {
-        #         "image": [torch.ones(3, 1024, 1024)]
-        #     }
-        # },
+        {
+            "prompt": "<|image|>This image is of color",
+            "multi_modal_data": {
+                "image": [torch.ones(3, 1024, 1024)]
+            }
+        },
]

expected_outputs = [
" the head of state and head of government of the",
", 9000, 9001, ",
# " white. What is the color of the background of" # TODO: Fix multimodal test.
" the head of state and head of government of the", ", 9000, 9001, ",
" white. What is the color of the background of"
]

pytorch_config = dict(attn_backend=backend)
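Taken on its own, the re-enabled multimodal case amounts to the usage pattern sketched below. This is a hedged illustration, not the test's exact body: the checkpoint path, tensor_parallel_size, and max_tokens values are placeholders (the test resolves the model path from llm_models_root() and parametrizes tp_size), and only the prompt dict and the expected output prefix are taken from the diff above.

    # Hypothetical sketch of the multimodal path the test re-enables.
    import torch

    from tensorrt_llm import LLM, SamplingParams

    llm = LLM(
        model="<path-to-llama4-checkpoint>",  # placeholder
        tensor_parallel_size=8,               # placeholder; the test parametrizes tp_size
    )
    prompts = [{
        "prompt": "<|image|>This image is of color",
        "multi_modal_data": {
            # A solid all-ones tensor stands in for a real RGB image (C, H, W).
            "image": [torch.ones(3, 1024, 1024)]
        },
    }]
    outputs = llm.generate(prompts, SamplingParams(max_tokens=10))
    # The test checks the generated text against the expected prefix
    # " white. What is the color of the background of" (SequenceMatcher is
    # imported at the top of the test for this comparison).
    print(outputs[0].outputs[0].text)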