
Commit cecf45f

lfr-0531 authored and litaotju committed
[https://nvbugs/5481385][fix] Fix max_seq_len in cuda graph warmup and intermediate_size in fused_moe_deepgemm (NVIDIA#7345)
Signed-off-by: Fanrong Li <[email protected]>
Co-authored-by: Tao Li @ NVIDIA <[email protected]>
1 parent 6aa6dcb commit cecf45f

2 files changed: +12 −4 lines changed


tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py

Lines changed: 4 additions & 4 deletions
@@ -411,7 +411,7 @@ def __init__(
 
     def get_workspace(self, m_max: int, group_size: int):
         hidden_size = self.hidden_size
-        intermediate_size = self.intermediate_size
+        intermediate_size = self.intermediate_size_per_partition
         num_experts = self.expert_size_per_partition
 
         # create workspace
@@ -564,7 +564,7 @@ def forward_chunk(
         # grouped gemm 1
         h1 = set_strides(workspace["workspace_1"],
                          self.expert_size_per_partition, m_max,
-                         self.intermediate_size * 2)
+                         self.intermediate_size_per_partition * 2)
 
         deepgemm_fp8_group_blockwise_gemm(
             d=h1,
@@ -579,9 +579,9 @@ def forward_chunk(
         # activation and quantization
         act_input_fp8 = set_strides(workspace["workspace_0"],
                                     self.expert_size_per_partition, m_max,
-                                    self.intermediate_size)
+                                    self.intermediate_size_per_partition)
 
-        scale_k = fp8_utils.ceil_div(self.intermediate_size, 128)
+        scale_k = fp8_utils.ceil_div(self.intermediate_size_per_partition, 128)
         scale_k_padded = fp8_utils.align(scale_k, 4)
         act_input_sf = set_strides(workspace["workspace_sf"],
                                    self.expert_size_per_partition,
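
Why the per-partition size matters here: with tensor parallelism, each rank holds only a 1/TP slice of the MoE FFN intermediate dimension, so sizing the DeepGEMM workspaces with the full intermediate_size over-allocates memory and gives the per-expert workspace views the wrong strides. The sketch below illustrates only the sizing arithmetic, under assumed names (tp_size, num_local_experts, moe_workspace_elems) that are not part of the TensorRT-LLM code.

# Sketch: per-rank MoE workspace sizing under tensor parallelism.
# Illustrative only; not the fused_moe_deepgemm implementation.

def intermediate_size_per_partition(intermediate_size: int, tp_size: int) -> int:
    # Each tensor-parallel rank owns a 1/tp_size slice of the FFN
    # intermediate dimension.
    assert intermediate_size % tp_size == 0
    return intermediate_size // tp_size


def moe_workspace_elems(m_max: int, num_local_experts: int,
                        intermediate_size: int, tp_size: int) -> dict:
    inter_per_rank = intermediate_size_per_partition(intermediate_size, tp_size)
    return {
        # gemm1 output; the factor 2 mirrors the "* 2" in the diff above,
        # since gemm1 produces both the gate and up projections.
        "workspace_1": num_local_experts * m_max * inter_per_rank * 2,
        # activation output that feeds gemm2
        "workspace_0": num_local_experts * m_max * inter_per_rank,
    }


# With TP=4 and a (hypothetical) full intermediate_size of 14336, sizing the
# buffers with the full value instead of the per-rank slice would make every
# buffer, and every per-expert stride, 4x too large.
print(moe_workspace_elems(m_max=1024, num_local_experts=8,
                          intermediate_size=14336, tp_size=4))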

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 8 additions & 0 deletions
@@ -583,7 +583,15 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
 
             # Add one dummy request with the maximum possible sequence length.
            # The sequence length is limited by both the max_seq_len and the number of available blocks.
+            # Also, the sequence length is limited by the max_position_embeddings.
             token_num = max(1, min(available_tokens, self.max_seq_len - 1))
+            model_config = self.model.model_config.pretrained_config
+            max_position_embeddings = getattr(model_config,
+                                              'max_position_embeddings',
+                                              None)
+            if max_position_embeddings is not None:
+                token_num = min(token_num,
+                                max_position_embeddings - draft_len)
             max_seq_len_request = kv_cache_manager.add_dummy_requests(
                 request_ids=[batch_size - 1],
                 token_nums=[token_num],
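
The change above caps the warmup dummy request at the model's declared positional range, leaving room for draft_len speculative tokens, so CUDA graph warmup never builds a sequence longer than max_position_embeddings. Below is a standalone sketch of just that clamping logic; the function name and the _Cfg config object are illustrative stand-ins, not the real model_engine or pretrained_config classes.

# Sketch of the token-count clamp used for the warmup dummy request.
# Illustrative only; names do not match the TensorRT-LLM implementation.

def warmup_token_num(available_tokens: int, max_seq_len: int,
                     draft_len: int, pretrained_config: object) -> int:
    # Usual bound: free KV-cache capacity and max_seq_len - 1.
    token_num = max(1, min(available_tokens, max_seq_len - 1))

    # Additionally respect the model's positional range, if the config
    # declares one, reserving space for draft_len speculative tokens.
    max_position_embeddings = getattr(pretrained_config,
                                      'max_position_embeddings', None)
    if max_position_embeddings is not None:
        token_num = min(token_num, max_position_embeddings - draft_len)
    return token_num


class _Cfg:
    # Hypothetical model config with a 4096-token positional range.
    max_position_embeddings = 4096


# Without the clamp, the dummy request would be 8191 tokens long and exceed
# the 4096-position range; with it, the warmup length is 4096 - 4 = 4092.
assert warmup_token_num(available_tokens=100000, max_seq_len=8192,
                        draft_len=4, pretrained_config=_Cfg()) == 4092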
