12 changes: 0 additions & 12 deletions tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
@@ -10,7 +10,6 @@
 import torch
 
 from tensorrt_llm._utils import nvtx_range
-from tensorrt_llm.bindings.executor import RequestType
 
 from ..distributed import Distributed
 from .llm_request import ExecutorRequest, executor_request_to_llm_request
@@ -61,7 +60,6 @@ def __init__(self, dist: Distributed, enable_attention_dp: bool,
         self.num_fetch_requests_cur_rank = 0
         self.expected_num_active_requests = 0
         self.new_active_requests_queue_latency_ms = 0
-        self.has_context_request = False
         self.is_shutdown = False
         self.should_exclude_last_generation_logits = False
 
@@ -318,7 +316,6 @@ def _balance_requests_across_ranks(
             self, new_requests: List[RequestQueueItem],
             all_ranks_num_active_requests: List[int]) -> List[RequestQueueItem]:
         """Balance requests across ranks for attention DP."""
-        self.has_context_request = False
         new_requests_cur_rank = []
 
         if new_requests and self.expected_num_active_requests > all_ranks_num_active_requests[
@@ -364,15 +361,6 @@ def _balance_requests_across_ranks(
                 elif val.rank == self.dist.tp_rank:
                     break
 
-        # Check for context requests
-        if self.is_disaggregated:
-            for req_item in new_requests_cur_rank:
-                if req_item.request.request_type == RequestType.REQUEST_TYPE_CONTEXT_ONLY:
-                    self.has_context_request = True
-                    break
-        else:
-            self.has_context_request = len(new_requests_cur_rank) > 0
-
         return new_requests_cur_rank
 
     def _collect_py_objects_from_requests(
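
For orientation, here is a minimal sketch of the load-balancing idea that the `_balance_requests_across_ranks` docstring refers to, assuming a simple least-loaded heuristic; the function name `balance_least_loaded` and its interface are hypothetical stand-ins, not the actual ExecutorRequestQueue logic:

import heapq
from typing import List, Tuple

def balance_least_loaded(num_new_requests: int,
                         all_ranks_num_active_requests: List[int]) -> List[int]:
    """Assign each incoming request to the rank with the fewest active requests.

    Returns the target rank chosen for each new request. Simplified
    illustration of attention-DP balancing, not the library's implementation.
    """
    # Min-heap keyed by (active request count, rank index).
    heap: List[Tuple[int, int]] = [
        (count, rank)
        for rank, count in enumerate(all_ranks_num_active_requests)
    ]
    heapq.heapify(heap)

    assignments = []
    for _ in range(num_new_requests):
        count, rank = heapq.heappop(heap)
        assignments.append(rank)
        heapq.heappush(heap, (count + 1, rank))  # rank now holds one more request
    return assignments

# Example: 4 new requests, ranks currently holding 2, 0, 1 active requests.
print(balance_least_loaded(4, [2, 0, 1]))  # [1, 1, 2, 0]
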
8 changes: 3 additions & 5 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -169,7 +169,6 @@ def __init__(self,
         self.draft_model_engine = draft_model_engine
 
         # enqueue and _fetch_new_requests used data
-        self.active = True
         self.next_req_id = max_batch_size  # The first max_batch_size request IDs are reserved for dummy requests
         self.max_beam_width = max_beam_width
         self.max_draft_len = max_draft_len
@@ -196,7 +195,6 @@ def __init__(self,
         self.max_num_active_requests = model_engine.get_max_num_sequences()
         self.active_requests: List[LlmRequest] = []
         self.expected_num_active_requests = 0
-        self.has_context_request = False
         self.ctx_in_transmission_requests = []
         self.previous_batch: Optional[BatchState] = None
         self.num_scheduled_requests: int = 0
@@ -1151,7 +1149,7 @@ def _check_disagg_gen_transfer_status(self):
     @nvtx_range("_pad_attention_dp_dummy_request")
     def _pad_attention_dp_dummy_request(self):
         """
-        Pad with a dummy request, if required, to ensure every attention_dp rank has at least one active request.
+        Pad with a generation dummy request, if required, to ensure every attention_dp rank has at least one active request.
         """
         if not self.enable_attention_dp:
             return
@@ -1169,8 +1167,8 @@ def _pad_attention_dp_dummy_request(self):
         if self.expected_num_active_requests - num_active_request > 0 and num_active_request == 0:
             llm_request = self.kv_cache_manager.add_dummy_requests(
                 request_ids=[0],
-                is_gen=not self.has_context_request,
-                prepare_resource=not self.has_context_request,
+                is_gen=True,
+                prepare_resource=True,
                 max_num_draft_tokens=self.max_draft_len,
             )[0]
             llm_request.is_attention_dp_dummy = True
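
The behavioral change above in condensed form: with has_context_request gone, the dummy request used to pad an idle attention-DP rank is always a generation dummy with its KV-cache resources prepared, rather than depending on whether the rank received context-only requests. The sketch below illustrates that decision; the helper name `pad_idle_rank` and its flat argument list are hypothetical simplifications of the PyExecutor code, though the `add_dummy_requests` keyword arguments mirror the diff:

def pad_idle_rank(kv_cache_manager, num_active_requests: int,
                  expected_num_active_requests: int, max_draft_len: int):
    """Return a dummy generation request for a rank with no work, else None.

    Previously is_gen/prepare_resource depended on whether the rank had
    received context-only requests; now a generation dummy is always used.
    """
    needs_padding = (expected_num_active_requests - num_active_requests > 0
                     and num_active_requests == 0)
    if not needs_padding:
        return None

    dummy = kv_cache_manager.add_dummy_requests(
        request_ids=[0],
        is_gen=True,            # always a generation dummy now
        prepare_resource=True,  # always reserve KV-cache resources for it
        max_num_draft_tokens=max_draft_len,
    )[0]
    dummy.is_attention_dp_dummy = True
    return dummy
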