5 changes: 5 additions & 0 deletions cpp/tensorrt_llm/batch_manager/assignReqSeqSlots.cpp
@@ -30,6 +30,11 @@ void tensorrt_llm::batch_manager::AssignReqSeqSlots::operator()(SequenceSlotMana
{
for (auto const& llmReq : requests)
{
if (llmReq->isDisaggGenerationInitState())
{
// Skip assigning sequence slot for DISAGG_GENERATION_INIT request
continue;
}
auto const isReqNew = (llmReq->isContextInitState() && !llmReq->mSeqSlot)
|| (llmReq->isDisaggGenerationTransmissionComplete());
if (isReqNew && llmReq->getReturnPerfMetrics())
8 changes: 7 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/_util.py
@@ -524,8 +524,14 @@ def create_py_executor_instance(
resource_manager.resource_managers.move_to_end(
ResourceManagerType.KV_CACHE_MANAGER, last=True)

# When scheduler_capacity == 1, the attention DP dummy request would prevent a DISAGG_GENERATION_INIT request from being scheduled.
# Enlarge the scheduler capacity so DISAGG_GENERATION_INIT requests do not get stuck in the scheduler.
scheduler_capacity = max_num_sequences
if scheduler_capacity == 1 and mapping.enable_attention_dp and kv_cache_manager:
scheduler_capacity += 1

capacity_scheduler = BindCapacityScheduler(
max_num_sequences,
scheduler_capacity,
kv_cache_manager.impl if kv_cache_manager is not None else None,
peft_cache_manager.impl if peft_cache_manager is not None else None,
executor_config.scheduler_config.capacity_scheduler_policy,
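The capacity rule added to _util.py above can be checked in isolation. Below is a minimal sketch, assuming a hypothetical helper compute_scheduler_capacity that mirrors the new logic (it is not part of the change; the real values come from max_num_sequences, mapping.enable_attention_dp and kv_cache_manager):

def compute_scheduler_capacity(max_num_sequences: int,
                               enable_attention_dp: bool,
                               has_kv_cache_manager: bool) -> int:
    # Mirrors the rule above: with attention DP enabled, a dummy request can
    # occupy the only scheduler slot and starve DISAGG_GENERATION_INIT, so one
    # extra slot is reserved when the capacity would otherwise be 1.
    capacity = max_num_sequences
    if capacity == 1 and enable_attention_dp and has_kv_cache_manager:
        capacity += 1
    return capacity

assert compute_scheduler_capacity(1, True, True) == 2   # bs1 + attention DP: bumped
assert compute_scheduler_capacity(1, False, True) == 1  # no attention DP: unchanged
assert compute_scheduler_capacity(8, True, True) == 8   # only the bs1 case changes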
6 changes: 5 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1287,7 +1287,6 @@ def _prepare_disagg_gen_init(self, fitting_disagg_gen_init_requests):

for resource_mgr_type in (
ResourceManagerType.KV_CACHE_MANAGER,
ResourceManagerType.SEQ_SLOT_MANAGER,
ResourceManagerType.SPEC_RESOURCE_MANAGER,
ResourceManagerType.DRAFT_KV_CACHE_MANAGER):
if (resource_mgr_type in self.resource_manager.resource_managers
@@ -1307,6 +1306,11 @@ def _prepare_disagg_gen_transmission_complete(self, scheduled_batch):
if req.is_disagg_generation_transmission_complete:
cache_trans_complete_requests.append(req)
if len(cache_trans_complete_requests) > 0:
trans_complete_to_prepare = ScheduledRequests()
trans_complete_to_prepare.context_requests = cache_trans_complete_requests
self.resource_manager.resource_managers[
ResourceManagerType.SEQ_SLOT_MANAGER].prepare_resources(
trans_complete_to_prepare)
self._setup_sampler_step(cache_trans_complete_requests)

for req in scheduled_batch.generation_requests:
5 changes: 5 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/seq_slot_manager.py
@@ -16,6 +16,11 @@ def get_needed_resource_to_completion(self, request: LlmRequest) -> int:

def prepare_resources(self, scheduled_batch: ScheduledRequests) -> None:
for llm_req in scheduled_batch.all_requests():
if llm_req.is_disagg_generation_init_state:
logger.info(
f"Skip assigning sequence slot for DISAGG_GENERATION_INIT request."
)
continue
if llm_req.seq_slot is None or llm_req.is_disagg_generation_transmission_complete:
llm_req.seq_slot = self.slot_manager.add_slot(
llm_req.request_id)
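To make the intended slot lifecycle concrete, here is a self-contained sketch; DummyRequest and DummySlotManager are illustrative stand-ins, not the real LlmRequest or SlotManager. A DISAGG_GENERATION_INIT request is skipped, and a slot is assigned only once transmission is complete, mirroring the C++ change in assignReqSeqSlots.cpp above.

from itertools import count

class DummySlotManager:
    # Stand-in slot manager that hands out increasing slot ids.
    def __init__(self):
        self._next_slot = count()

    def add_slot(self, request_id):
        return next(self._next_slot)

class DummyRequest:
    def __init__(self, request_id, init_state, trans_complete):
        self.request_id = request_id
        self.seq_slot = None
        self.is_disagg_generation_init_state = init_state
        self.is_disagg_generation_transmission_complete = trans_complete

def prepare_resources(requests, slot_manager):
    # Same control flow as SeqSlotManager.prepare_resources above.
    for req in requests:
        if req.is_disagg_generation_init_state:
            continue  # no slot yet: the KV cache is still being transferred
        if req.seq_slot is None or req.is_disagg_generation_transmission_complete:
            req.seq_slot = slot_manager.add_slot(req.request_id)

mgr = DummySlotManager()
init_req = DummyRequest(0, init_state=True, trans_complete=False)
done_req = DummyRequest(1, init_state=False, trans_complete=True)
prepare_resources([init_req, done_req], mgr)
assert init_req.seq_slot is None  # DISAGG_GENERATION_INIT: skipped
assert done_req.seq_slot == 0     # transmission complete: slot assigned

Once the transfer finishes, _prepare_disagg_gen_transmission_complete now routes those requests back through the SEQ_SLOT_MANAGER, so they pick up a slot at that point instead of at admission.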
37 changes: 37 additions & 0 deletions tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml
@@ -0,0 +1,37 @@
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
hostname: localhost
port: 8000
backend: "pytorch"
cuda_graph_config: null
free_gpu_memory_fraction: 0.2
context_servers:
num_instances: 1
max_batch_size: 1
max_num_tokens: 3000
max_seq_len: 4096
tensor_parallel_size: 2
pipeline_parallel_size: 1
enable_attention_dp: true
kv_cache_config:
free_gpu_memory_fraction: 0.2
enable_partial_reuse: False
disable_overlap_scheduler: True
cache_transceiver_config:
backend: default
urls:
- "localhost:8001"
generation_servers:
num_instances: 1
tensor_parallel_size: 2
pipeline_parallel_size: 1
enable_attention_dp: true
max_batch_size: 1
max_num_tokens: 4096
max_seq_len: 4096
kv_cache_config:
free_gpu_memory_fraction: 0.2
enable_partial_reuse: False
cache_transceiver_config:
backend: default
urls:
- "localhost:8002"
25 changes: 25 additions & 0 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -47,6 +47,8 @@ def get_test_config(test_desc, example_dir, test_root):
"gen_only": (2, f"{test_configs_root}/disagg_config_gen_only.yaml"),
"gen_only_trt_backend":
(2, f"{test_configs_root}/disagg_config_gen_only_trt_backend.yaml"),
"gen_only_bs1":
(4, f"{test_configs_root}/disagg_config_gen_only_bs1.yaml"),
"4_ranks": (4, f"{test_configs_root}/disagg_config_ctxtp2_gentp1.yaml"),
"4_ranks_trt_backend":
(4,
@@ -387,6 +389,29 @@ def test_disaggregated_benchmark_gen_only_trt_backend(
cwd=llm_venv.get_working_directory())


@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
indirect=True)
def test_disaggregated_genbs1(disaggregated_test_root,
disaggregated_example_root, llm_venv,
llama_model_root):
src_dst_dict = {
llama_model_root:
f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
}
for src, dst in src_dst_dict.items():
if not os.path.islink(dst):
os.makedirs(os.path.dirname(dst), exist_ok=True)
os.symlink(src, dst, target_is_directory=True)

env = llm_venv._new_env.copy()
env['TRTLLM_DISAGG_BENCHMARK_GEN_ONLY'] = '1'
run_disaggregated_test(disaggregated_example_root,
"gen_only_bs1",
env=env,
cwd=llm_venv.get_working_directory())


@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
indirect=True)
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -33,6 +33,7 @@ l0_dgx_h100:
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2_genpp2[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_gentp2[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0]
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]