
Commit d11975d

fix eagle case
Signed-off-by: Jaedeok Kim <[email protected]>
1 parent a90b238 · commit d11975d

File tree: 4 files changed (+25, -4 lines)

examples/llm-api/quickstart_advanced.py

Lines changed: 5 additions & 1 deletion
@@ -191,7 +191,11 @@ def setup_llm(args):
     greedy_decoding = ((args.temperature == 0.0)
                        or (args.top_k == 1 and
                            (args.top_p == 0.0 or args.top_p is None)))
-    mixed_sampler = not greedy_decoding and not args.enable_trtllm_sampler
+    mixed_sampler = (
+        not greedy_decoding and not args.enable_trtllm_sampler
+        # Eagle3 does not support mixed sampler.
+        # Refer TorchSampler._process_requests.
+        and spec_decode_algo != 'EAGLE3')
 
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=args.cuda_graph_batch_sizes,
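For context, a hedged sketch of the sampler-selection rule this change establishes. The helper `choose_mixed_sampler` and the `Namespace` setup below are illustrative stand-ins, not code from the repository:

from argparse import Namespace


def choose_mixed_sampler(args: Namespace, spec_decode_algo: str) -> bool:
    """Enable mixed (per-request) sampling only when it is supported."""
    greedy_decoding = ((args.temperature == 0.0)
                       or (args.top_k == 1 and
                           (args.top_p == 0.0 or args.top_p is None)))
    # Skip mixed sampling for greedy decoding, for the TRT-LLM sampler,
    # and for EAGLE3 speculative decoding, which does not support it.
    return (not greedy_decoding and not args.enable_trtllm_sampler
            and spec_decode_algo != 'EAGLE3')


args = Namespace(temperature=0.8, top_k=50, top_p=0.9, enable_trtllm_sampler=False)
assert choose_mixed_sampler(args, 'EAGLE3') is False  # EAGLE3 forces it off
assert choose_mixed_sampler(args, 'NONE') is True     # sampling otherwise enabled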

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 14 additions & 2 deletions
@@ -288,6 +288,13 @@ def finish_by(request: Union[
     request.set_finished_reason(reason, beam)
 
 
+def is_generation_only_request(
+    request: Union['LlmRequest',
+                   tensorrt_llm.bindings.internal.batch_manager.LlmRequest]
+) -> bool:
+    return request.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY
+
+
 class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
     """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
     but detour some features to Python implementation"""
@@ -356,8 +363,10 @@ def __init__(
             return_generation_logits,
             exclude_last_generation_logits)
 
-    def is_generation_only_request(self):
-        return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY
+    def is_generation_only(self):
+        # is_generation_only_request is a property getter at the C++ side,
+        # so here use a different name at the pytorch backend.
+        return is_generation_only_request(self)
 
     def create_child_request(self, request_id: int):
        """ Create a child request.
@@ -386,11 +395,14 @@ def create_child_request(self, request_id: int):
         child_request.py_request_id = child_request.request_id
         child_request.py_llm_request_type = child_request.llm_request_type
         child_request.py_batch_idx = None
+        child_request.py_seq_slot = None
 
         # Mimic the behavior of the original LlmRequest.
         child_request.is_attention_dp_dummy = self.is_attention_dp_dummy
         child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy
         child_request.is_dummy = self.is_dummy
+        child_request.is_generation_only = partial(is_generation_only_request,
+                                                   child_request)
         child_request.create_response = partial(create_response, child_request)
         child_request.finish_by = partial(finish_by, child_request)
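The `partial` binding above mirrors what the surrounding code already does for `create_response` and `finish_by`: the child request is a raw binding object, so the module-level helper is attached to the instance and then reads like a method call. A minimal sketch of that pattern, using a hypothetical `FakeRequest` class rather than the real binding type:

from functools import partial


class FakeRequest:
    """Hypothetical stand-in for the binding-level request object."""

    def __init__(self, request_type: str):
        self.py_llm_request_type = request_type


def is_generation_only_request(request) -> bool:
    # Module-level helper: works for any object exposing py_llm_request_type.
    return request.py_llm_request_type == 'LLMREQUEST_TYPE_GENERATION_ONLY'


child = FakeRequest('LLMREQUEST_TYPE_GENERATION_ONLY')
# Bind the free function to this instance so call sites can use it like a method.
child.is_generation_only = partial(is_generation_only_request, child)
assert child.is_generation_only() is True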

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
@@ -2081,7 +2081,7 @@ def _handle_responses(self):
                 requests_to_terminate.append(request)
                 continue
 
-            if request.is_generation_only_request():
+            if request.is_generation_only():
                 # If request is in transmission, so we don't need to emit a response
                 # Also, for the first iteration with overlap, we should skip since first
                 # token has already been emitted previously
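Call sites therefore switch to the shorter name. A hedged sketch of a response loop that skips such requests; the loop below is illustrative only, not the executor's actual `_handle_responses` logic:

def collect_responses(active_requests):
    responses = []
    for request in active_requests:
        # Generation-only requests are still in transmission, so no response
        # is emitted for them on this iteration.
        if request.is_generation_only():
            continue
        responses.append(request.create_response())
    return responses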

tests/unittest/_torch/test_llm_request.py

Lines changed: 5 additions & 0 deletions
@@ -215,6 +215,8 @@ def test_create_response():
     child_request.state = LlmRequestState.GENERATION_IN_PROGRESS
 
     response = request.create_response(use_fast_logits=True, mpi_world_rank=1)
+    # The response having non-error result contain _result.
+    response.result.deserialize()
     assert response is not None
     assert isinstance(response, LlmResponse)
     assert response.request_id == 1
@@ -224,6 +226,7 @@ def test_create_response():
     assert response.result.sequence_index == 0
 
     child_response = child_request.create_response()
+    child_response.result.deserialize()
    assert child_response is not None
    assert child_response.request_id == 2
    assert child_response.client_id == child_request.py_client_id
@@ -234,12 +237,14 @@ def test_create_response():
     child_request.state = LlmRequestState.GENERATION_COMPLETE
     # is_final=False since the parent request is not yet complete.
     child_response = child_request.create_response()
+    child_response.result.deserialize()
     assert child_response.result.is_final is False
     assert child_response.result.is_sequence_final is True
 
     # is_final=True since all requests are complete.
     request.state = LlmRequestState.GENERATION_COMPLETE
     response = request.create_response()
+    response.result.deserialize()
     assert response.result.is_final is True
     assert response.result.is_sequence_final is True
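The new `deserialize()` calls materialize the serialized result before its fields are read. A rough sketch of that pattern, with a hypothetical `FakeResult` standing in for the real binding type:

class FakeResult:
    """Hypothetical stand-in: fields become readable only after deserialize()."""

    def __init__(self, payload: dict):
        self._serialized = payload
        self.is_final = None
        self.is_sequence_final = None

    def deserialize(self) -> None:
        # Populate plain attributes from the serialized payload.
        self.is_final = self._serialized['is_final']
        self.is_sequence_final = self._serialized['is_sequence_final']


result = FakeResult({'is_final': True, 'is_sequence_final': True})
result.deserialize()
assert result.is_final is True
assert result.is_sequence_final is True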
