 from ..attention_backend import AttentionMetadata
 from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams
+from ..distributed import AllReduceParams
 from ..model_config import ModelConfig
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
@@ -82,6 +83,8 @@ def __init__(
             model_config,
             layer_idx=layer_idx,
         )
+        self.mapping = model_config.mapping
+        self.enable_attention_dp = self.mapping.enable_attention_dp

         # Qwen3 has accuracy issues with deep_gemm (see: https://nvbugspro.nvidia.com/bug/5461712
         # and https://nvbugspro.nvidia.com/bug/5505402)
@@ -92,6 +95,7 @@ def __init__(
             intermediate_size=config.intermediate_size,
             bias=config.mlp_bias if hasattr(config, "mlp_bias") else False,
             dtype=config.torch_dtype,
+            overridden_tp_size=1 if self.enable_attention_dp else None,
             config=model_config,
             disable_deep_gemm=disable_deep_gemm,
         )
@@ -102,6 +106,8 @@ def __init__(
         self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                                 eps=config.rms_norm_eps,
                                                 dtype=config.torch_dtype)
+        self.disable_allreduce = (self.mapping.tp_size == 1
+                                  or self.enable_attention_dp)

     def forward(
         self,
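What the two __init__ changes buy: under attention data parallelism each rank already holds only its own tokens, so the MLP is built with its tensor parallelism overridden to 1, and the layer records that no TP allreduce is needed. A minimal sketch of that gating, with Mapping reduced to a stand-in dataclass (the real model_config.mapping carries many more fields):

# Sketch only: "Mapping" is a stand-in for model_config.mapping; real
# TensorRT-LLM mappings carry more fields than the two used here.
from dataclasses import dataclass


@dataclass
class Mapping:
    tp_size: int
    enable_attention_dp: bool


def mlp_tp_override(mapping):
    # Attention DP keeps each rank's tokens local, so the MLP is built
    # with TP=1 (no weight sharding, no collective); None leaves the
    # model-wide TP size untouched.
    return 1 if mapping.enable_attention_dp else None


def allreduce_disabled(mapping):
    # Nothing to reduce with a single TP rank, and nothing to reduce
    # when attention DP keeps activations rank-local.
    return mapping.tp_size == 1 or mapping.enable_attention_dp


for m in (Mapping(4, False), Mapping(4, True), Mapping(1, False)):
    print(m, "->", mlp_tp_override(m), allreduce_disabled(m))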
@@ -126,13 +132,22 @@ def forward(
             hidden_states=hidden_states,
             attn_metadata=attn_metadata,
             mrope_config=mrope_config,
+            all_reduce_params=AllReduceParams(
+                enable_allreduce=not self.disable_allreduce),
             **kwargs,
         )

         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(
             hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(
+            hidden_states,
+            all_rank_num_tokens=attn_metadata.all_rank_num_tokens,
+            all_rank_max_num_tokens=attn_metadata.all_rank_max_num_tokens,
+            final_all_reduce_params=AllReduceParams(
+                enable_allreduce=not self.disable_allreduce),
+            cutlass_min_latency_mode=False,
+        )

         if spec_metadata is not None:
             spec_metadata.maybe_capture_hidden_states(self.layer_idx,
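Putting the forward() changes together: the same flag is wrapped in AllReduceParams and handed to both sub-blocks (enable_allreduce=False tells them to skip their internal allreduce), and the MLP additionally receives the per-rank token counts from attn_metadata so it can size its work under attention DP. A condensed, runnable sketch of that control flow, with AllReduceParams pared down to the single flag this diff touches and the attention/MLP modules stubbed out with prints:

# Sketch only: AllReduceParams is reduced to one flag and the
# attention/MLP modules are stubs; this mirrors the call pattern of
# the patched forward(), not the real TensorRT-LLM modules.
from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class AllReduceParams:
    enable_allreduce: bool = True


class DecoderLayerSketch:

    def __init__(self, tp_size, enable_attention_dp):
        self.disable_allreduce = tp_size == 1 or enable_attention_dp

    def self_attn(self, x, all_reduce_params):
        # Stub: the real attention skips its internal TP allreduce
        # when enable_allreduce is False.
        print("attn allreduce:", all_reduce_params.enable_allreduce)
        return x

    def mlp(self, x, all_rank_num_tokens, all_rank_max_num_tokens,
            final_all_reduce_params):
        # Stub: per-rank token counts let the real MLP balance work
        # under attention DP; the final allreduce is gated the same way.
        print("mlp allreduce:", final_all_reduce_params.enable_allreduce)
        return x

    def forward(self, hidden_states, attn_metadata):
        params = AllReduceParams(enable_allreduce=not self.disable_allreduce)
        hidden_states = self.self_attn(hidden_states,
                                       all_reduce_params=params)
        return self.mlp(
            hidden_states,
            all_rank_num_tokens=attn_metadata.all_rank_num_tokens,
            all_rank_max_num_tokens=attn_metadata.all_rank_max_num_tokens,
            final_all_reduce_params=params,
        )


meta = SimpleNamespace(all_rank_num_tokens=[3, 5], all_rank_max_num_tokens=5)
DecoderLayerSketch(tp_size=4, enable_attention_dp=True).forward("h", meta)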