1 file changed: tensorrt_llm/_torch/pyexecutor (+8, −0)

```diff
@@ -512,6 +512,10 @@ def create_py_executor_instance(
         lora_config.trtllm_modules_to_hf_modules)
 
     max_num_sequences = executor_config.max_batch_size * mapping.pp_size
+    # When max_num_sequences == 1, the attention DP dummy request prevents DISAGG_GENERATION_INIT from being scheduled.
+    # Enlarge slot and scheduler capacity so DISAGG_GENERATION_INIT does not get stuck in the scheduler.
+    if max_num_sequences == 1 and kv_cache_manager:
+        max_num_sequences += 1
 
     resources[ResourceManagerType.SEQ_SLOT_MANAGER] = SeqSlotManager(
         max_num_sequences)
@@ -564,6 +568,10 @@ def create_py_executor_instance(
 def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
                               *, max_seq_len: int, enable_mixed_sampler: bool):
     max_num_sequences = executor_config.max_batch_size * mapping.pp_size
+    # When max_num_sequences == 1, the attention DP dummy request prevents DISAGG_GENERATION_INIT from being scheduled.
+    # Enlarge the sampler size to align with the slot and scheduler capacity.
+    if max_num_sequences == 1 and executor_config.kv_cache_config:
+        max_num_sequences += 1
     max_draft_len = (0 if executor_config.speculative_config is None else
                      executor_config.speculative_config.max_draft_len)
     return TorchSampler.Args(
```
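Both hunks apply the same one-slot workaround: when `max_batch_size * pp_size == 1`, the attention DP dummy request occupies the only sequence slot, so a `DISAGG_GENERATION_INIT` request can never be admitted. Below is a minimal, self-contained sketch of that failure mode and the capacity bump; `TinyScheduler` and `padded_capacity` are illustrative stand-ins for this explanation, not TensorRT-LLM APIs.

```python
# Toy model of the patch's capacity-padding idea; names here are
# hypothetical and do not mirror TensorRT-LLM's real scheduler API.
from dataclasses import dataclass, field


@dataclass
class TinyScheduler:
    """A scheduler with a fixed number of sequence slots."""
    capacity: int
    active: list = field(default_factory=list)

    def try_schedule(self, request: str) -> bool:
        # A request is admitted only while a free slot exists.
        if len(self.active) < self.capacity:
            self.active.append(request)
            return True
        return False


def padded_capacity(max_batch_size: int, pp_size: int,
                    uses_attention_dp_dummy: bool) -> int:
    """Mirror of the patch: reserve one extra slot when a single-slot
    configuration would be permanently held by the dummy request."""
    max_num_sequences = max_batch_size * pp_size
    if max_num_sequences == 1 and uses_attention_dp_dummy:
        max_num_sequences += 1
    return max_num_sequences


# Without padding, the dummy request pins the only slot and the
# DISAGG_GENERATION_INIT request is stuck forever.
tight = TinyScheduler(capacity=1)
tight.try_schedule("attention_dp_dummy")
assert not tight.try_schedule("DISAGG_GENERATION_INIT")  # never scheduled

# With the extra slot, the init request is admitted alongside the dummy.
padded = TinyScheduler(capacity=padded_capacity(1, 1, True))
padded.try_schedule("attention_dp_dummy")
assert padded.try_schedule("DISAGG_GENERATION_INIT")  # scheduled
```

Note that the bump is applied in both `create_py_executor_instance` and `create_torch_sampler_args`: the seq-slot manager, scheduler, and sampler must agree on `max_num_sequences`, otherwise a request admitted by one component would have no corresponding slot in another.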