
Commit d80cabe

refactor MoE loading logic.
Signed-off-by: Mindy Li <[email protected]>
1 parent 9cd1f27 commit d80cabe

4 files changed, +21 -18 lines changed


tensorrt_llm/_torch/models/modeling_deepseekv3.py
Lines changed: 3 additions & 0 deletions

@@ -1344,6 +1344,9 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
         params_map = {'gate_up_proj': ['gate_proj', 'up_proj']}
         all_named_modules = dict(self.named_modules())
 
+        # moe_backend: cute_dsl_group_gemm
+        # use_cute_dsl_gemm, use_cute_dsl_bmm; use_cute_dsl
+        # attention/mla, gated_mlp, linear
         if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
         ) and get_sm_version() == 100:
             for name in list(weights.keys()):
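
The comment added here flags the cute_dsl MoE backend and related flags next to the guard below it: the weight-key rewriting only runs for FP8 block-scale checkpoints on SM 100 GPUs. A minimal sketch of that guard in isolation (the helper name and arguments are hypothetical, not the real API):

def rewrites_weight_keys(has_fp8_block_scales: bool, sm_version: int) -> bool:
    # Mirrors the condition in the diff: FP8 block scales and Blackwell (SM 100).
    return has_fp8_block_scales and sm_version == 100

assert rewrites_weight_keys(True, 100)
assert not rewrites_weight_keys(True, 90)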

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
Lines changed: 1 addition & 14 deletions

@@ -9,8 +9,7 @@
 from ...model_config import ModelConfig
 from ...utils import Fp4QuantizedTensor
 from .fused_moe_cutlass import CutlassFusedMoE
-from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethodCuteDsl,
-                           MoEWeightLoadingMode, UnquantizedFusedMoEMethod)
+from .quantization import MoEWeightLoadingMode
 from .routing import BaseMoeRoutingMethod
 
 
@@ -140,18 +139,6 @@ def __init__(
             layer_idx=layer_idx,
         )
 
-    def _get_quant_method(self):
-        if self.quant_config is not None and self.quant_config.layer_quant_mode.has_any_quant(
-                exclude_kv_cache=True):
-            if self.quant_config.layer_quant_mode.has_fp8_block_scales():
-                return DeepSeekFP8BlockScalesFusedMoEMethodCuteDsl()
-            else:
-                raise ValueError(
-                    f"Unsupported quantization mode: {self.quant_config.quant_mode}"
-                )
-        else:
-            return UnquantizedFusedMoEMethod()
-
     def forward_chunk(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],
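
With the override deleted, the CuteDsl module presumably resolves _get_quant_method through its parent CutlassFusedMoE, which is why the CuteDsl-specific quant-method import can be dropped. A minimal sketch of that fall-through, using stand-in class names (the string return value is a placeholder for the real method object):

class Base:
    def _get_quant_method(self):
        # Assumed shared default; stands in for CutlassFusedMoE's dispatch.
        return "DeepSeekFP8BlockScalesFusedMoEMethod"

class CuteDslLike(Base):
    # No _get_quant_method override: resolution falls through to Base.
    pass

assert CuteDslLike()._get_quant_method() == "DeepSeekFP8BlockScalesFusedMoEMethod"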

tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
Lines changed: 14 additions & 1 deletion

@@ -13,7 +13,8 @@
 from ...model_config import ModelConfig
 from ...utils import Fp4QuantizedTensor
 from .fused_moe_cutlass import CutlassFusedMoE
-from .quantization import MoEWeightLoadingMode
+from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm,
+                           MoEWeightLoadingMode, UnquantizedFusedMoEMethod)
 from .routing import BaseMoeRoutingMethod
 
 
@@ -340,6 +341,18 @@ def __init__(
             layer_idx=layer_idx,
         )
 
+    def _get_quant_method(self):
+        if self.quant_config is not None and self.quant_config.layer_quant_mode.has_any_quant(
+                exclude_kv_cache=True):
+            if self.quant_config.layer_quant_mode.has_fp8_block_scales():
+                return DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm()
+            else:
+                raise ValueError(
+                    f"Unsupported quantization mode: {self.quant_config.quant_mode}"
+                )
+        else:
+            return UnquantizedFusedMoEMethod()
+
     @nvtx_range("[DG] forward")
     def forward_chunk(
         self,
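
The DeepGemm module now carries its own _get_quant_method, with the same shape as the one removed from the CuteDsl module but returning the DeepGemm variant. A self-contained sketch of the dispatch, with FakeQuantMode as a hypothetical stand-in for quant_config.layer_quant_mode:

from typing import Optional

class FakeQuantMode:
    """Hypothetical stand-in for quant_config.layer_quant_mode."""

    def __init__(self, fp8_block_scales: bool):
        self.fp8_block_scales = fp8_block_scales

    def has_any_quant(self, exclude_kv_cache: bool = True) -> bool:
        return self.fp8_block_scales

    def has_fp8_block_scales(self) -> bool:
        return self.fp8_block_scales

def get_quant_method(quant_mode: Optional[FakeQuantMode]) -> str:
    # Mirrors the branching added in the diff above; strings stand in
    # for the real method objects.
    if quant_mode is not None and quant_mode.has_any_quant(exclude_kv_cache=True):
        if quant_mode.has_fp8_block_scales():
            return "DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm"
        raise ValueError("Unsupported quantization mode")
    return "UnquantizedFusedMoEMethod"

assert get_quant_method(FakeQuantMode(True)).endswith("DeepGemm")
assert get_quant_method(None) == "UnquantizedFusedMoEMethod"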

tensorrt_llm/_torch/modules/fused_moe/quantization.py
Lines changed: 3 additions & 3 deletions

@@ -430,7 +430,7 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
         module.fc31_input_dequant.data.copy_(max_fc31_input_scale)
 
 
-class DeepSeekFP8BlockScalesFusedMoEMethodCuteDsl(FusedMoEMethodBase):
+class DeepSeekFP8BlockScalesFusedMoEMethod(FusedMoEMethodBase):
 
     def create_weights(self, module: torch.nn.Module):
         weight_dtype = torch.float8_e4m3fn

@@ -553,8 +553,8 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
         })
 
 
-class DeepSeekFP8BlockScalesFusedMoEMethod(
-        DeepSeekFP8BlockScalesFusedMoEMethodCuteDsl):
+class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
+        DeepSeekFP8BlockScalesFusedMoEMethod):
 
     def load_weights(self, module: torch.nn.Module, weights: List[Dict],
                      weight_loading_mode: MoEWeightLoadingMode):
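
Net effect of the rename: the generic name now sits at the base of the FP8 block-scale hierarchy and the DeepGemm variant subclasses it, reversing the previous arrangement where the CuteDsl-suffixed class was the base. A sketch of the resulting relationships (method bodies elided):

class FusedMoEMethodBase:
    pass

# Shared FP8 block-scale implementation (create_weights, load_quant_scales).
class DeepSeekFP8BlockScalesFusedMoEMethod(FusedMoEMethodBase):
    pass

# DeepGemm-specific variant: overrides load_weights only.
class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
        DeepSeekFP8BlockScalesFusedMoEMethod):
    pass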
