Skip to content

Commit f6c7b25

Browse files
committed
fix: multi-GPU gather responses
Signed-off-by: Olya Kozlova <[email protected]>
1 parent 9c3525f commit f6c7b25

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,15 +123,15 @@ def _get_from_waiting_queue(
123123
req_item.child_req_ids) if req_item.child_req_ids else 0
124124
if (req_count + 1 + num_children) > max_req_count:
125125
break
126-
req_count += 1 + num_children
127126
req_item = waiting_queue.popleft()
128127
can_process = self._can_process_attention_dp_request(
129128
req_item, scheduling_all_ranks_num_active_requests
130129
) if enable_attention_dp else True
131130

132131
if can_process:
133132
items.append(req_item)
134-
else:
133+
req_count += 1 + num_children
134+
else:
135135
pending_requests.append(req_item)
136136

137137
# Put the pending requests back to the waiting queue

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1451,7 +1451,7 @@ def _enqueue_responses(self, responses: List[Tuple[int, LlmResponse]]):
14511451
if responses_list is not None:
14521452
for resp in responses_list:
14531453
if resp is not None:
1454-
gather_responses.append(resp)
1454+
gather_responses.extend(resp)
14551455
responses = gather_responses
14561456
logger.debug(
14571457
f'after gather, rank = {self.dist.rank}, responses = {responses}')

0 commit comments

Comments (0)