
Commit 654bac2

[TRTLLM-5271] feat: best_of/n for pytorch workflow
Signed-off-by: Olya Kozlova <[email protected]>
1 parent: e42f5a9

File tree: 6 files changed, +175 -42 lines


cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 6 additions & 0 deletions
@@ -467,6 +467,9 @@ class GenericLlmRequest
         initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
     }
 
+    GenericLlmRequest(GenericLlmRequest&& request) = default;
+    GenericLlmRequest(GenericLlmRequest const& request) = default;
+
     void setExcludeInputFromOutput(bool exclude)
     {
         mExcludeInputFromOutput = exclude;
@@ -2315,6 +2318,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         mKvCacheRetentionConfig = request.getKvCacheRetentionConfig();
     }
 
+    LlmRequest(LlmRequest&& request) = default;
+    LlmRequest(LlmRequest const& request) = default;
+
     /// @brief Create a Response from the current state of the request
     /// @details Note that there is some dependency on the order of operations in this method. Modify with care!
     /// @return An optional Response

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp

Lines changed: 6 additions & 1 deletion
@@ -194,6 +194,8 @@ void initBindings(pybind11::module_& m)
         .def_property_readonly("missed_blocks", &GenLlmReq::getMissedBlocksPerRequest)
         .def_property_readonly("kv_cache_hit_rate", &GenLlmReq::getKVCacheHitRatePerRequest)
         .def_property_readonly("llm_request_type", &GenLlmReq::getLlmRequestType)
+        .def_property_readonly("parent_request_id", &GenLlmReq::getParentRequestId)
+        .def_property_readonly("is_child", &GenLlmReq::isChild)
         .def_property_readonly("multimodal_hashes",
             [](GenLlmReq& self)
             {
@@ -256,7 +258,7 @@ void initBindings(pybind11::module_& m)
         .def_property_readonly("return_perf_metrics", &GenLlmReq::getReturnPerfMetrics);
 
     py::classh<tb::LlmRequest, GenLlmReq>(m, "LlmRequest", pybind11::dynamic_attr())
-        .def(py::init(
+        .def(py::init<>(
             [](tb::LlmRequest::RequestIdType request_id, tb::LlmRequest::SizeType32 max_new_tokens,
                 std::vector<tb::LlmRequest::TokenIdType> input_tokens, runtime::SamplingConfig sampling_config,
                 bool is_streaming, std::optional<tb::LlmRequest::SizeType32> end_id,
@@ -359,11 +361,14 @@ void initBindings(pybind11::module_& m)
             py::arg("return_perf_metrics") = false, py::arg("guided_decoding_params") = std::nullopt,
             py::arg("language_adapter_uid") = std::nullopt, py::arg("allotted_time_ms") = std::nullopt,
             py::arg("context_phase_params") = std::nullopt)
+        .def(py::init<tb::LlmRequest const&>())
+        //.def(py::init<tb::LlmRequest&&>())
         .def("validate", &tb::LlmRequest::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
             py::arg("max_draft_len"), py::arg("vocab_size_padded"), py::arg("max_endocer_input_len") = std::nullopt,
             py::arg("enable_kv_cache_reuse") = false)
         .def("create_response", &tb::LlmRequest::createResponse, py::arg("use_fast_logits") = false,
             py::arg("mpi_world_rank") = 0)
+        .def("create_child_request", &tb::LlmRequest::createChildRequest, py::arg("child_id"))
         .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false,
             py::arg("mpi_world_rank") = 0)
         .def("create_serialized_result",
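Note: together with the defaulted copy and move constructors added in llmRequest.h above, these bindings surface the parent/child request machinery to Python. A minimal sketch of the Python-side surface this enables (the module path is the one referenced by the PyTorch LlmRequest wrapper below; `request` is assumed to already exist, and the child id here is illustrative since real ids are reserved by the executor):

from tensorrt_llm.bindings.internal.batch_manager import LlmRequest

# `request` is assumed to be an already constructed LlmRequest instance.
copied = LlmRequest(request)  # copy binding added above: py::init<tb::LlmRequest const&>()

child = request.create_child_request(child_id=request.request_id + 1)  # illustrative id
assert child.is_child                                  # new read-only property
assert child.parent_request_id == request.request_id   # new read-only property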

examples/llm-api/quickstart_advanced.py

Lines changed: 12 additions & 8 deletions
@@ -104,6 +104,8 @@ def add_llm_args(parser):
     parser.add_argument("--top_k", type=int, default=None)
     parser.add_argument("--top_p", type=float, default=None)
     parser.add_argument('--load_format', type=str, default='auto')
+    parser.add_argument('--n', type=int, default=1)
+    parser.add_argument('--best_of', type=int, default=None)
     parser.add_argument('--max_beam_width', type=int, default=1)
 
     # Speculative decoding
@@ -229,10 +231,12 @@ def setup_llm(args):
         temperature=args.temperature,
         top_k=args.top_k,
         top_p=args.top_p,
+        best_of=args.max_beam_width
+        if args.max_beam_width > 1 else args.best_of,
         return_context_logits=args.return_context_logits,
         return_generation_logits=args.return_generation_logits,
         logprobs=args.logprobs,
-        n=args.max_beam_width,
+        n=args.n,
         use_beam_search=args.max_beam_width > 1)
     return llm, sampling_params
 
@@ -246,23 +250,23 @@ def main():
 
     for i, output in enumerate(outputs):
         prompt = output.prompt
-        for beam_idx, beam in enumerate(output.outputs):
-            generated_text = beam.text
+        for sequence_idx, sequence in enumerate(output.outputs):
+            generated_text = sequence.text
             # Skip printing the beam_idx if no beam search was used
-            beam_id_text = f"[{beam_idx}]" if args.max_beam_width > 1 else ""
+            sequence_id_text = f"[{sequence_idx}]" if args.max_beam_width > 1 or args.n > 1 else ""
             print(
-                f"[{i}]{beam_id_text} Prompt: {prompt!r}, Generated text: {generated_text!r}"
+                f"[{i}]{sequence_id_text} Prompt: {prompt!r}, Generated text: {generated_text!r}"
             )
             if args.return_context_logits:
                 print(
-                    f"[{i}]{beam_id_text} Context logits: {output.context_logits}"
+                    f"[{i}]{sequence_id_text} Context logits: {output.context_logits}"
                 )
             if args.return_generation_logits:
                 print(
-                    f"[{i}]{beam_id_text} Generation logits: {beam.generation_logits}"
+                    f"[{i}]{sequence_id_text} Generation logits: {sequence.generation_logits}"
                 )
             if args.logprobs:
-                print(f"[{i}]{beam_id_text} Logprobs: {beam.logprobs}")
+                print(f"[{i}]{sequence_id_text} Logprobs: {sequence.logprobs}")
 
 
 if __name__ == '__main__':
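The new flags feed straight into the LLM API's SamplingParams. A hedged usage sketch of the same best_of/n behaviour outside the script (the checkpoint and prompt are placeholders, not part of this commit):

from tensorrt_llm import LLM, SamplingParams

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # placeholder checkpoint

# Sample four candidate sequences per prompt and return two of them.
# With use_beam_search left at its default (False), the extra sequences are
# realized as child requests in the PyTorch executor rather than as beams.
sampling_params = SamplingParams(max_tokens=32,
                                 temperature=0.8,
                                 top_p=0.95,
                                 n=2,
                                 best_of=4)

for output in llm.generate(["The capital of France is"], sampling_params):
    for sequence_idx, sequence in enumerate(output.outputs):
        print(f"[{sequence_idx}] {sequence.text!r}")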

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 45 additions & 11 deletions
@@ -277,22 +277,28 @@ def __init__(
             exclude_last_generation_logits: bool = False,
             return_perf_metrics: bool = False,
             stop_words_list: list[list[int]] | None = None,
+            llm_request: Optional[
+                tensorrt_llm.bindings.internal.batch_manager.LlmRequest] = None,
             is_draft: bool = False,
             **kwargs):
+
         self.py_logits_post_processors = kwargs.pop("py_logits_post_processors",
                                                     None)
         # Multimodal data
         self.py_multimodal_data = kwargs.pop("py_multimodal_data", None)
-        super().__init__(
-            *args,
-            client_id=client_id,
-            return_log_probs=return_log_probs,
-            return_context_logits=False,
-            return_generation_logits=False,
-            return_perf_metrics=return_perf_metrics,
-            stop_words_list=torch.tensor(stop_words_list, dtype=torch.int32)
-            if stop_words_list else None,
-            **kwargs)
+        if llm_request is not None:
+            super().__init__(llm_request)
+        else:
+            super().__init__(
+                *args,
+                client_id=client_id,
+                return_log_probs=return_log_probs,
+                return_context_logits=False,
+                return_generation_logits=False,
+                return_perf_metrics=return_perf_metrics,
+                stop_words_list=torch.tensor(stop_words_list, dtype=torch.int32)
+                if stop_words_list else None,
+                **kwargs)
         self.py_client_id = client_id
         self.py_request_id = self.request_id
         self.py_llm_request_type = self.llm_request_type
@@ -326,6 +332,7 @@ def __init__(
                                   return_log_probs, return_context_logits,
                                   return_generation_logits,
                                   exclude_last_generation_logits)
+        self.children = []
 
     def is_generation_only_request(self):
         return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY
@@ -337,7 +344,8 @@ def create_response(
         result, is_final = super().create_serialized_result(
             use_fast_logits, mpi_world_rank)
         return LlmResponse(
-            request_id=self.py_request_id,
+            request_id=self.py_request_id
+            if self.is_child else self.parent_request_id,
             result=LlmResult(result, self.py_result, is_final),
             client_id=self.py_client_id) if len(result) > 0 else None
 
@@ -350,6 +358,26 @@ def finish_by(self, reason: FinishReason, beam: int) -> None:
         self.state = LlmRequestState.GENERATION_COMPLETE
         self.set_finished_reason(reason, beam)
 
+    def create_child_request(self, child_id):
+        child = super().create_child_request(child_id)
+        py_request = LlmRequest(llm_request=child)
+        py_request.__dict__.update(**self.__dict__)
+
+        py_request.py_result = PyResult(
+            self.py_prompt_len, self.py_max_new_tokens,
+            self.py_return_logits_device_memory, self.streaming,
+            self.py_return_log_probs, self.py_return_context_logits,
+            self.py_return_generation_logits)
+        py_request.py_request_id = child.request_id
+        py_request.children = []
+
+        assert py_request.is_child
+        assert py_request.request_id == child.request_id
+        assert py_request.parent_request_id == self.request_id
+        assert py_request.sampling_config.random_seed != self.sampling_config.random_seed
+
+        return py_request
+
 
 def convert_wordlist(word_list) -> List[List[int]]:
     """Converts a wordlist from format:
@@ -391,6 +419,7 @@ def convert_wordlist(word_list) -> List[List[int]]:
 def executor_request_to_llm_request(
         req_id: int,
         executor_request: ExecutorRequest,
+        child_req_ids: List[int],
         exclude_last_generation_logits: bool,
         input_token_ids: Optional[List] = None) -> LlmRequest:
     executor_sampling_config = executor_request.sampling_config
@@ -475,4 +504,9 @@ def executor_request_to_llm_request(
         context_phase_params=executor_request.context_phase_params,
         py_multimodal_data=getattr(executor_request, "py_multimodal_data",
                                    None))
+    if child_req_ids:
+        for child_id in child_req_ids:
+            llm_request.children.append(
+                llm_request.create_child_request(child_id))
+
     return llm_request
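With the new child_req_ids parameter, one executor request fans out into a parent LlmRequest plus one pre-built child per reserved id. A rough sketch of the resulting structure (the ExecutorRequest construction is elided; the ids and sequence count are illustrative):

# Sketch: assumes `executor_request` was built with num_return_sequences == 3
# and beam_width == 1, and that ids 43 and 44 were reserved for the children.
parent = executor_request_to_llm_request(
    req_id=42,
    executor_request=executor_request,
    child_req_ids=[43, 44],
    exclude_last_generation_logits=False)

assert [child.py_request_id for child in parent.children] == [43, 44]
assert all(child.is_child and child.parent_request_id == 42
           for child in parent.children)
# Each child carries its own PyResult and a random seed different from the
# parent's, so sampling diverges while prompt and config stay shared.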

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 53 additions & 22 deletions
@@ -57,6 +57,7 @@
 class RequestQueueItem:
     id: int
     request: Optional[ExecutorRequest] = None
+    child_req_ids: Optional[list] = None
     is_canceled_request: bool = False
     query: Optional[list] = None # only used in `StarAttention`
@@ -88,6 +89,13 @@ def _get_from_request_queue(
     return items
 
 
+def _get_num_child_requests(request: ExecutorRequest) -> int:
+    sampling_config = request.sampling_config
+    logger.info(sampling_config)
+    return 0 if sampling_config.beam_width > 1 else (
+        sampling_config.num_return_sequences or 1) - 1
+
+
 def _get_from_waiting_queue(
     waiting_queue: deque[RequestQueueItem],
     max_req_count: int,
@@ -108,8 +116,9 @@ def _get_from_waiting_queue(
     items = []
     req_count = 0
     while req_count < max_req_count and waiting_queue:
-        items.append(waiting_queue.popleft())
-        req_count += 1
+        req_item = waiting_queue.popleft()
+        items.append(req_item)
+        req_count += 1 + _get_num_child_requests(req_item.request)
     return items
 
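The helper above is the heart of the fan-out: beam search never spawns children (its beams live inside one request), while sampling with num_return_sequences = k spawns k - 1 children, the parent itself carrying the first sequence. _get_from_waiting_queue then charges those children against max_req_count, so the scheduling budget counts every sequence. A small self-contained sketch of the arithmetic (the stand-in config object carries only the two fields the helper reads):

from types import SimpleNamespace

def num_child_requests(sampling_config) -> int:
    # Mirrors _get_num_child_requests above.
    return 0 if sampling_config.beam_width > 1 else (
        sampling_config.num_return_sequences or 1) - 1

assert num_child_requests(SimpleNamespace(beam_width=1, num_return_sequences=4)) == 3
assert num_child_requests(SimpleNamespace(beam_width=1, num_return_sequences=None)) == 0
assert num_child_requests(SimpleNamespace(beam_width=4, num_return_sequences=4)) == 0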

@@ -359,9 +368,16 @@ def enqueue_requests(self, requests: List[ExecutorRequest]):
             start_time = time.time()
             for request in requests:
                 self.start_times[self.next_req_id] = start_time
-                self.request_queue.put(
-                    RequestQueueItem(self.next_req_id, request))
+                req_id = self.next_req_id
                 req_ids.append(self.next_req_id)
+
+                child_req_ids = []
+                num_child_requests = _get_num_child_requests(request)
+                for _ in range(num_child_requests):
+                    self.next_req_id += 1
+                    child_req_ids.append(self.next_req_id)
+                self.request_queue.put(
+                    RequestQueueItem(req_id, request, child_req_ids))
                 self.next_req_id += 1
         finally:
             self.enqueue_lock.release()
@@ -472,14 +488,23 @@ def enqueue_request(self,
         try:
             self.enqueue_lock.acquire()
             assert self.active, "PyExecutor has already been shutdown."
+            logger.info(
+                f"Enqueuing new Executor request with id {self.next_req_id}")
             req_id = self.next_req_id
             if self.enable_iter_perf_stats:
                 self.start_times[req_id] = time.time()
 
             if query is not None:
-                self.request_queue.put(RequestQueueItem(req_id, request, query))
+                self.request_queue.put(
+                    RequestQueueItem(req_id, request, [], False, query))
             else:
-                self.request_queue.put(RequestQueueItem(req_id, request))
+                child_req_ids = []
+                num_child_requests = _get_num_child_requests(request)
+                for _ in range(num_child_requests):
+                    self.next_req_id += 1
+                    child_req_ids.append(self.next_req_id)
+                self.request_queue.put(
+                    RequestQueueItem(req_id, request, child_req_ids))
             self.next_req_id += 1
         finally:
             self.enqueue_lock.release()
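Both enqueue paths now reserve a contiguous block of ids per incoming request: the parent keeps the id it would have received anyway, the children take the ids immediately after it, and next_req_id ends up past the whole block. A stand-in sketch of the numbering for two consecutive requests that each want three sequences (the helper is illustrative, not part of the executor):

def reserve_ids(next_req_id: int, num_children: int):
    """Illustrative stand-in for the id bookkeeping in enqueue_request(s)."""
    parent_id = next_req_id
    child_ids = [parent_id + 1 + i for i in range(num_children)]
    return parent_id, child_ids, parent_id + num_children + 1  # new next_req_id

parent_a, children_a, next_id = reserve_ids(10, 2)
parent_b, children_b, next_id = reserve_ids(next_id, 2)
assert (parent_a, children_a) == (10, [11, 12])
assert (parent_b, children_b) == (13, [14, 15])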
@@ -1506,12 +1531,15 @@ def _merge_requests(self, new_requests: list[RequestQueueItem]):
             else:
                 raise NotImplementedError(f'unsupport cp type {cp_type}')
         else:
-            return [
-                executor_request_to_llm_request(
-                    req_item.id, req_item.request,
+            req_with_children = []
+            for req_item in new_requests:
+                req = executor_request_to_llm_request(
+                    req_item.id, req_item.request, req_item.child_req_ids,
                     self._should_exclude_last_generation_logits())
-                for req_item in new_requests
-            ]
+                req_with_children.append(req)
+                for child in req.children:
+                    req_with_children.append(child)
+            return req_with_children
 
     @nvtx_range("_schedule")
     def _schedule(self):
@@ -1977,7 +2005,7 @@ def _handle_canceled_requests(self):
             if req.id not in self.canceled_req_ids)
 
         for request in self.active_requests:
-            req_id = request.py_request_id
+            req_id = request.py_request_id if not request.is_child else request.parent_request_id
             if req_id in self.canceled_req_ids:
                 # Mark requests as finished, then, we reuse all existing code
                 # to clean up the KV cache resources.
@@ -1991,7 +2019,7 @@ def _handle_canceled_requests(self):
         self.canceled_req_ids.clear()
 
     @nvtx_range("_enqueue_responses")
-    def _enqueue_responses(self, responses: Dict[int, LlmResponse]):
+    def _enqueue_responses(self, responses: List[Tuple[int, LlmResponse]]):
         if 0 not in self.dist.mapping.tp_group and not self.gather_all_responses:
             return
 
@@ -2003,18 +2031,18 @@ def _enqueue_responses(self, responses: Dict[int, LlmResponse]):
         else:
             responses_list = self.dist.allgather(responses)
             if self.dist.rank == 0 or self.gather_all_responses:
-                gather_responses = {}
+                gather_responses = []
                 if responses_list is not None:
                     for resp in responses_list:
                         if resp is not None:
-                            gather_responses.update(resp)
+                            gather_responses.append(resp)
                 responses = gather_responses
         logger.debug(
             f'after gather, rank = {self.dist.rank}, responses = {responses}')
 
         if self.dist.rank == 0 or self.gather_all_responses:
             with self.response_cv:
-                for req_id, resp in responses.items():
+                for req_id, resp in responses:
                     if req_id in self.responses.keys():
                         self.responses[req_id].append(resp)
                     else:
@@ -2023,20 +2051,20 @@ def _enqueue_responses(self, responses: Dict[int, LlmResponse]):
 
     @nvtx_range("_handle_first_token_response")
     def _handle_first_token_response(self, scheduled_batch):
-        new_responses = {}
+        new_responses = []
         for req in scheduled_batch.generation_requests:
             if req.py_decoding_iter == 1:
                 logger.debug(
                     f'Send first token response for request {req.py_request_id}'
                 )
                 response = req.create_response(False, self.dist.rank)
-                new_responses.update({req.py_request_id: response})
+                new_responses.append((req.py_request_id, response))
 
         self._enqueue_responses(new_responses)
 
     @nvtx_range("_handle_responses")
     def _handle_responses(self):
-        new_responses = {}
+        new_responses = []
         requests_to_terminate = []
         new_active_requests = []
         logger.debug(
@@ -2070,14 +2098,17 @@ def _handle_responses(self):
                     request.py_decoding_iter % self.stream_interval == 0:
                 response = request.create_response(False, self.dist.rank)
                 if response:
-                    request_done = response.result.is_final
-                    new_responses.update({req_id: response})
+                    request_done = request.is_finished
+                    new_responses.append((req_id, response))
 
                 if request_done:
                     if request.is_disagg_context_transmission_state:
                         self.ctx_in_transmission_requests.append(request)
                     else:
-                        requests_to_terminate.append(request)
+                        if response.result.is_final:
+                            requests_to_terminate.append(request)
+                        for child in request.children:
+                            requests_to_terminate.append(child)
                 else:
                     new_active_requests.append(request)
         self.active_requests = new_active_requests
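Two downstream adjustments follow from the fan-out. Responses are now carried as a list of (request_id, response) tuples instead of a dict keyed by id, which lets several responses produced in one step share an id without overwriting each other, as can happen once sibling sequences surface under the same originating request. And termination is gated on the final response: a request is only appended to requests_to_terminate when response.result.is_final, and its children are queued for termination alongside it. A tiny sketch of the dict-versus-list difference (ids and payloads are illustrative):

step_responses = [(10, "sequence 0"), (10, "sequence 1"), (10, "sequence 2")]

as_dict = {}
as_list = []
for req_id, resp in step_responses:
    as_dict[req_id] = resp          # a dict keyed by id keeps only the last entry
    as_list.append((req_id, resp))  # the list keeps every sequence

assert len(as_dict) == 1
assert len(as_list) == 3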
