
Commit b519eca

fix py llm request
Signed-off-by: Jaedeok Kim <[email protected]>
1 parent d991599 commit b519eca

File tree: 2 files changed, +15 −5 lines


examples/llm-api/quickstart_advanced.py

Lines changed: 1 addition & 1 deletion

@@ -217,7 +217,7 @@ def setup_llm(args):
                 if args.use_torch_compile else None,
             moe_backend=args.moe_backend,
             enable_trtllm_sampler=args.enable_trtllm_sampler,
-            mixed_sampler=mixed_sampler,
+            enable_mixed_sampler=mixed_sampler,
             max_seq_len=args.max_seq_len,
             max_batch_size=args.max_batch_size,
             max_num_tokens=args.max_num_tokens,
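
The quickstart fix is a one-keyword rename: the mixed-sampler toggle is now passed as enable_mixed_sampler instead of mixed_sampler. A minimal sketch of the call site after this commit, assuming (as the surrounding context suggests) that these keywords feed the LLM(...) construction inside setup_llm; everything not visible in the diff is elided:

    # Sketch of setup_llm() after the rename. Only the keyword arguments
    # visible in the diff are real; the LLM(...) shape around them is an
    # assumption for illustration.
    llm = LLM(
        moe_backend=args.moe_backend,
        enable_trtllm_sampler=args.enable_trtllm_sampler,
        enable_mixed_sampler=mixed_sampler,  # renamed from mixed_sampler=
        max_seq_len=args.max_seq_len,
        max_batch_size=args.max_batch_size,
        max_num_tokens=args.max_num_tokens,
    )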

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 14 additions & 4 deletions

@@ -276,10 +276,19 @@ def create_response(
         return None
     else:
         return LlmResponse(request_id=request.py_request_id,
-                           result=LlmResult(result, request.py_result),
+                           result=LlmResult(result, request.py_result,
+                                            result.is_final),
                            client_id=request.py_client_id)


+def finish_by(request: Union[
+        'LlmRequest', tensorrt_llm.bindings.internal.batch_manager.LlmRequest],
+              reason: FinishReason, beam: int) -> None:
+    """CPP finish by reason does not support beam_width > 1"""
+    request.state = LlmRequestState.GENERATION_COMPLETE
+    request.set_finished_reason(reason, beam)
+
+
 class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
     """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
     but detour some features to Python implementation"""
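
Two changes land in this hunk: the LlmResult wrapper now receives result.is_final as a third constructor argument, and finish_by is hoisted to module level (mirroring the existing module-level create_response) so it can later be grafted onto request objects that are not instances of the Python subclass. A toy sketch of the snapshot idea behind the extra argument; this is not the real LlmResult, and the rationale in the comment is an assumption:

    # Illustrative only: capturing is_final at construction snapshots the
    # flag from the C++ result, so later readers of the Python wrapper do
    # not depend on the underlying C++ object's state (assumed rationale).
    class LlmResultSketch:
        def __init__(self, cpp_result, py_result, is_final=False):
            self._result = cpp_result
            self._py_result = py_result
            self.is_final = is_final  # value copied once, not read lazily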
@@ -298,6 +307,7 @@ def __init__(
             stop_words_list: list[list[int]] | None = None,
             is_draft: bool = False,
             **kwargs):
+
         self.py_logits_post_processors = kwargs.pop("py_logits_post_processors",
                                                     None)
         # Multimodal data
@@ -377,8 +387,9 @@ def create_child_request(self, request_id: int):
         child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy
         child_request.is_dummy = self.is_dummy

-        # Override create_response to return the child request
+        # Mimic the behavior of the original LlmRequest.
         child_request.create_response = partial(create_response, child_request)
+        child_request.finish_by = partial(finish_by, child_request)

         return child_request
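
Child requests created here come from the C++ binding, so they do not inherit methods defined on the Python LlmRequest subclass; functools.partial is used to bind the module-level helpers onto each instance. A self-contained sketch of that pattern with generic stand-in names (none of these classes are the real ones):

    from functools import partial

    # Module-level function that takes the instance as its first argument.
    def finish_by(request, reason, beam):
        request.set_finished_reason(reason, beam)

    class CppBackedRequest:
        """Stand-in for an object whose class we cannot add methods to."""
        def set_finished_reason(self, reason, beam):
            print(f"finished: reason={reason}, beam={beam}")

    child = CppBackedRequest()
    child.finish_by = partial(finish_by, child)  # reads like a bound method
    child.finish_by("LENGTH", 0)  # -> finished: reason=LENGTH, beam=0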

@@ -394,8 +405,7 @@ def is_dummy(self):

     def finish_by(self, reason: FinishReason, beam: int) -> None:
         """CPP finish by reason does not support beam_width > 1"""
-        self.state = LlmRequestState.GENERATION_COMPLETE
-        self.set_finished_reason(reason, beam)
+        finish_by(self, reason, beam)


 def convert_wordlist(word_list) -> List[List[int]]:
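
With the delegation in place, parent and child requests expose the same finish_by surface, whether it resolves to the subclass method or to the partial-bound helper. Hypothetical usage; the FinishReason member names are assumed from the executor bindings:

    # Hypothetical call sites; both end in the same module-level helper.
    request.finish_by(FinishReason.LENGTH, beam=0)     # subclass method
    child = request.create_child_request(request_id=7)
    child.finish_by(FinishReason.CANCELLED, beam=0)    # partial-bound helper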
