Commit c26bf90

fix correctness issue
Signed-off-by: Yuening Li <[email protected]>
1 parent: bcf6514 · commit: c26bf90

File tree

6 files changed: +213 -56 lines

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 31 additions & 11 deletions

@@ -122,14 +122,15 @@ class FusedMoeRunner : public torch::CustomClassHolder
     }
 
     FusedMoeRunner(c10::ScalarType activation_dtype, c10::ScalarType weight_dtype, c10::ScalarType output_dtype,
-        bool use_deepseek_fp8_block_scale, bool use_w4a8_group_scaling, bool use_woq_group_scaling,
-        bool use_mxfp8_act_scaling)
+        bool use_deepseek_fp8_block_scale, bool use_w4a8_group_scaling, bool use_woq_per_channel,
+        bool use_woq_group_scaling, bool use_mxfp8_act_scaling)
     {
         mActivationDtype = activation_dtype;
         mWeightDtype = weight_dtype;
         mOutputDtype = output_dtype;
         mUseDeepSeekFP8BlockScaling = use_deepseek_fp8_block_scale;
         mUseW4A8GroupScaling = use_w4a8_group_scaling;
+        mUseWoqPerChannel = use_woq_per_channel;
         mUseWoqGroupScaling = use_woq_group_scaling;
         mUseMxfp8ActScaling = use_mxfp8_act_scaling;
         mInnerDimMultiplier = 1;
@@ -276,13 +277,27 @@ class FusedMoeRunner : public torch::CustomClassHolder
         }
         TORCH_CHECK(fc1_expert_weights.sizes()[0] == fc2_expert_weights.sizes()[0],
             "fc1_expert_weights and fc2_expert_weights must have the same number of experts.");
-        TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier * 2,
-            "fc1_expert_weights inter size must be 2 times fc2_expert_weights inter size.");
+
+        if (mUseWoqPerChannel)
+        {
+            TORCH_CHECK(fc1_expert_weights.sizes()[2] == fc2_expert_weights.sizes()[1] * mInnerDimMultiplier * 2,
+                "fc1_expert_weights inter size must be 2 times fc2_expert_weights inter size.");
+        }
+        else
+        {
+            TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier * 2,
+                "fc1_expert_weights inter size must be fc2_expert_weights inter size.");
+        }
 
         int experts_per_token = token_selected_experts.sizes()[1];
         int64_t num_rows = input.sizes()[0];
         int64_t hidden_size = fc2_expert_weights.sizes()[1];
         int64_t inter_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
+        if (mUseWoqPerChannel)
+        {
+            hidden_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
+            inter_size = fc2_expert_weights.sizes()[1];
+        }
 
         if (isWMxfp4AMxfp8Quant() || isWMxfp4AFp8Quant())
         {
@@ -506,9 +521,14 @@ class FusedMoeRunner : public torch::CustomClassHolder
         }
 
         int64_t const num_rows = input.sizes()[0];
-        int64_t const hidden_size = fc2_expert_weights.sizes()[1];
-        int64_t const inter_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
-        int64_t const group_size = mUseWoqGroupScaling ? 128 : -1;
+        int64_t hidden_size = fc2_expert_weights.sizes()[1];
+        int64_t inter_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
+        if (mUseWoqPerChannel)
+        {
+            hidden_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
+            inter_size = fc2_expert_weights.sizes()[1];
+        }
+        int64_t const group_size = mUseWoqGroupScaling or mUseW4A8GroupScaling ? 128 : -1;
         int const num_experts = static_cast<int>(fc2_expert_weights.sizes()[0] * ep_size);
 
         // Get specific profile configs according to the profile_id.
@@ -585,6 +605,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
 
     bool mUseDeepSeekFP8BlockScaling = false;
     bool mUseW4A8GroupScaling = false;
+    bool mUseWoqPerChannel = false;
     bool mUseWoqGroupScaling = false;
     bool mUseMxfp8ActScaling = false;
 
@@ -876,16 +897,15 @@ class FusedMoeRunner : public torch::CustomClassHolder
         else if (isWeightOnlyQuant())
         {
             TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for weight only quantization");
-            if (!mUseWoqGroupScaling)
+            if (mUseWoqPerChannel)
             {
                 TORCH_CHECK(quant_scales.value().size() == 2, "Expecting 2 quant scales for weight only quantization");
                 auto& fc1_weight_scales = quant_scales.value()[0];
                 auto& fc2_weight_scales = quant_scales.value()[1];
                 return kernels::QuantParams::Int(static_cast<float const*>(fc1_weight_scales.data_ptr()),
                     static_cast<float const*>(fc2_weight_scales.data_ptr()));
             }
-            // TODO: support groupwise quantization for int8 weight only
-            else if (isInt4Quant())
+            else if (isInt4Quant() && mUseWoqGroupScaling)
             {
                 TORCH_CHECK(quant_scales.value().size() == 8, "Expecting 8 quant scales for INT4 quantization");
                 auto& fc1_weight_scales = quant_scales.value()[0];
@@ -968,7 +988,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
 TORCH_LIBRARY(trtllm, m)
 {
     m.class_<torch_ext::FusedMoeRunner>("FusedMoeRunner")
-        .def(torch::init<c10::ScalarType, c10::ScalarType, c10::ScalarType, bool, bool, bool, bool>())
+        .def(torch::init<c10::ScalarType, c10::ScalarType, c10::ScalarType, bool, bool, bool, bool, bool>())
         .def("run_gemm_profile", &torch_ext::FusedMoeRunner::runGemmProfile)
         .def("get_tactic_num", &torch_ext::FusedMoeRunner::getTacticNum)
         .def("run_moe", &torch_ext::FusedMoeRunner::runMoe)
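
The dimension handling above is the core of the correctness fix: with per-channel weight-only quantization enabled, the preprocessed fc2 expert weights are laid out as [num_experts, inter_size, hidden_size] rather than [num_experts, hidden_size, inter_size], so hidden_size and inter_size have to be read from swapped dimensions. A minimal Python sketch of the same dimension logic (the helper name and example sizes are illustrative, not part of the commit):

import torch

def moe_dims_from_fc2(fc2_expert_weights: torch.Tensor,
                      use_woq_per_channel: bool,
                      inner_dim_multiplier: int = 1):
    # Mirrors the updated checks in moeOp.cpp. inner_dim_multiplier accounts
    # for sub-byte packing of the innermost dimension (e.g. two int4 values
    # per stored int8 element).
    if use_woq_per_channel:
        # per-channel weight-only layout: [num_experts, inter_size, hidden_size]
        inter_size = fc2_expert_weights.shape[1]
        hidden_size = fc2_expert_weights.shape[2] * inner_dim_multiplier
    else:
        # default layout: [num_experts, hidden_size, inter_size]
        hidden_size = fc2_expert_weights.shape[1]
        inter_size = fc2_expert_weights.shape[2] * inner_dim_multiplier
    return hidden_size, inter_size

# 8 experts, hidden_size = 64, inter_size = 128, per-channel weight-only layout
w2 = torch.empty(8, 128, 64, dtype=torch.int8)
assert moe_dims_from_fc2(w2, use_woq_per_channel=True) == (64, 128)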

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 13 additions & 3 deletions

@@ -40,6 +40,7 @@ def __init__(
         cluster_rank: int,
         use_deepseek_fp8_block_scale: bool,
         use_w4a8_group_scaling: bool,
+        use_woq_per_channel: bool,
         use_woq_group_scaling: bool,
         use_mxfp8_act_scaling: bool,
         min_latency_mode: bool,
@@ -58,19 +59,22 @@ def __init__(
         self.enable_alltoall = False
         self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
         self.use_w4a8_group_scaling = use_w4a8_group_scaling
+        self.use_woq_per_channel = use_woq_per_channel
         self.use_woq_group_scaling = use_woq_group_scaling
         self.use_mxfp8_act_scaling = use_mxfp8_act_scaling
         self.min_latency_mode = min_latency_mode
         instance_key = (x_dtype, weight_dtype, output_dtype,
                         use_deepseek_fp8_block_scale, use_w4a8_group_scaling,
-                        use_woq_group_scaling, use_mxfp8_act_scaling)
+                        use_woq_per_channel, use_woq_group_scaling,
+                        use_mxfp8_act_scaling)
 
         if instance_key not in MoERunner.runner_dict:
             MoERunner.runner_dict[
                 instance_key] = torch.classes.trtllm.FusedMoeRunner(
                     x_dtype, weight_dtype, output_dtype,
                     use_deepseek_fp8_block_scale, use_w4a8_group_scaling,
-                    use_woq_group_scaling, use_mxfp8_act_scaling)
+                    use_woq_per_channel, use_woq_group_scaling,
+                    use_mxfp8_act_scaling)
         self.fused_moe_runner = MoERunner.runner_dict[instance_key]
 
     def get_valid_tactics(
@@ -139,6 +143,7 @@ def fused_moe(
     enable_alltoall: bool = False,
     use_deepseek_fp8_block_scale: bool = False,
     use_w4a8_group_scaling: bool = False,
+    use_woq_per_channel: bool = False,
     use_woq_group_scaling: bool = False,
     use_mxfp8_act_scaling: bool = False,
     min_latency_mode: bool = False,
@@ -176,6 +181,7 @@ def fused_moe(
         cluster_rank=cluster_rank,
         use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
         use_w4a8_group_scaling=use_w4a8_group_scaling,
+        use_woq_per_channel=use_woq_per_channel,
         use_woq_group_scaling=use_woq_group_scaling,
         use_mxfp8_act_scaling=use_mxfp8_act_scaling,
         min_latency_mode=min_latency_mode,
@@ -249,13 +255,17 @@ def _(
     enable_alltoall: bool = False,
     use_deepseek_fp8_block_scale: bool = False,
     use_w4a8_group_scaling: bool = False,
+    use_woq_per_channel: bool = False,
    use_woq_group_scaling: bool = False,
     use_mxfp8_act_scaling: bool = False,
     min_latency_mode: bool = False,
     tune_max_num_tokens: int = 8192,
 ):
     seq_len = input.shape[0]
-    hidden_size = fc2_expert_weights.shape[1]
+    if use_woq_per_channel:
+        hidden_size = fc2_expert_weights.shape[2]
+    else:
+        hidden_size = fc2_expert_weights.shape[1]
 
     if min_latency_mode:
         num_experts_on_rank = fc2_expert_weights.shape[0]
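
Two details in this file matter for correctness: the new flag is part of instance_key, so a cached FusedMoeRunner built without per-channel weight-only support is never reused for a per-channel model, and the fake-tensor registration must infer the output hidden size from a different dimension of fc2_expert_weights. A standalone sketch of the latter, assuming a hypothetical function name and example shapes:

import torch

def fake_fused_moe_output(input: torch.Tensor,
                          fc2_expert_weights: torch.Tensor,
                          use_woq_per_channel: bool = False) -> torch.Tensor:
    # Shape-only stand-in for the registered fake op: no kernel runs here, so
    # the output shape has to be derived purely from the argument shapes.
    seq_len = input.shape[0]
    hidden_size = (fc2_expert_weights.shape[2]
                   if use_woq_per_channel else fc2_expert_weights.shape[1])
    return input.new_empty((seq_len, hidden_size))

x = torch.randn(4, 64)
w2 = torch.empty(8, 128, 64, dtype=torch.int8)  # per-channel layout
assert fake_fused_moe_output(x, w2, use_woq_per_channel=True).shape == (4, 64)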

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 14 additions & 8 deletions

@@ -144,6 +144,11 @@ def has_w4afp8(self):
         return self.quant_config and self.quant_config.quant_mode.is_int4_weight_only_per_group(
         )
 
+    @property
+    def has_woq_per_channel(self):
+        return self.quant_config.layer_quant_mode.is_weight_only(
+        ) and not self.quant_config.layer_quant_mode.has_per_group_scaling()
+
     @property
     def has_woq_per_group_scaling(self):
         return self.quant_config.layer_quant_mode.is_weight_only(
@@ -161,9 +166,7 @@ def _get_quant_method(self):
             elif self.quant_config.layer_quant_mode.is_int4_weight_only_per_group(
             ):
                 return WInt4AFP8FusedMoEMethod()
-            elif self.quant_config.layer_quant_mode.is_weight_only(
-            ) and not self.quant_config.layer_quant_mode.has_per_group_scaling(
-            ):
+            elif self.has_woq_per_channel:
                 return WeightOnlyFusedMoEMethod()
             else:
                 raise ValueError(
@@ -234,6 +237,7 @@ def forward_chunk(
         # quantize inputs
         use_deepseek_fp8_block_scale = False
         use_w4a8_group_scaling = False
+        use_woq_per_channel = False
         use_woq_group_scaling = False
         weight_dtype = self.w3_w1_weight.dtype
         x_sf = None
@@ -247,7 +251,8 @@
                 use_w4a8_group_scaling = True
                 use_woq_group_scaling = True
                 weight_dtype = torch.quint4x2
-                # TODO: add support for weight only quantization with per group scaling
+            elif self.has_woq_per_channel:
+                use_woq_per_channel = True
             elif self.has_woq_per_group_scaling:
                 use_woq_group_scaling = True
             elif self.has_nvfp4:
@@ -269,10 +274,10 @@
                 x, x_sf = torch.ops.trtllm.fp4_quantize(
                     x, self.fc31_input_scale, self.scaling_vector_size,
                     False, True)
-            # else:
-            #     raise ValueError(
-            #         f"unsupported quantization mode: {self.quant_config.quant_mode}"
-            #     )
+            else:
+                raise ValueError(
+                    f"unsupported quantization mode: {self.quant_config.quant_mode}"
+                )
 
         # gather inputs for attention dp
         if run_post_quant_allgather:
@@ -312,6 +317,7 @@
             enable_alltoall=self.enable_alltoall,
             use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
             use_w4a8_group_scaling=use_w4a8_group_scaling,
+            use_woq_per_channel=use_woq_per_channel,
             use_woq_group_scaling=use_woq_group_scaling,
             min_latency_mode=False,
             tune_max_num_tokens=self.tune_max_num_tokens,
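
The forward_chunk changes are mostly flag plumbing: exactly one weight-only path may be selected, and an unrecognized quantization mode now raises again instead of being silently ignored (the previously commented-out ValueError is restored). A simplified, self-contained sketch of that selection order, covering only the branches visible in this diff (the real method also handles FP8 block scaling, NVFP4, and MXFP8):

def select_woq_flags(has_w4afp8: bool, has_woq_per_channel: bool,
                     has_woq_per_group_scaling: bool) -> dict:
    # First matching mode wins, mirroring the if/elif chain in forward_chunk.
    flags = {
        "use_w4a8_group_scaling": False,
        "use_woq_per_channel": False,
        "use_woq_group_scaling": False,
    }
    if has_w4afp8:
        flags["use_w4a8_group_scaling"] = True
        flags["use_woq_group_scaling"] = True
    elif has_woq_per_channel:
        flags["use_woq_per_channel"] = True
    elif has_woq_per_group_scaling:
        flags["use_woq_group_scaling"] = True
    else:
        raise ValueError("unsupported quantization mode")
    return flags

assert select_woq_flags(False, True, False)["use_woq_per_channel"]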

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 34 additions & 33 deletions

@@ -671,9 +671,6 @@ def create_weights(self, module: torch.nn.Module):
                                  requires_grad=False)
         module.register_parameter("fc2_weight_scale", fc2_weight_scale)
 
-        print(f"fc31_weight_scale.shape: {fc31_weight_scale.shape}")
-        print(f"fc2_weight_scale.shape: {fc2_weight_scale.shape}")
-
         fc31_alpha = nn.Parameter(torch.empty(module.expert_size_per_partition,
                                               1,
                                               dtype=torch.float32),
@@ -866,27 +863,31 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
 
 
 class WeightOnlyFusedMoEMethod(FusedMoEMethodBase):
-    """
-    Base class for Weight Only Quantization fused MoE methods.
-    """
 
     def create_weights(self, module: torch.nn.Module):
+        module.sm_version = get_sm_version()
+        module.sm_version = 80 if module.sm_version >= 90 else module.sm_version
+        module.preprocessor = preprocess_weights_for_mixed_gemm
+
         weight_dtype = torch.int8
         # int4 weight are packed into int8
         if module.quant_config.layer_quant_mode.is_int8_weight_only():
-            weight_id = 1
+            pass
         elif module.quant_config.layer_quant_mode.is_int4_weight_only():
-            weight_id = 2
+            pass
         else:
             raise NotImplementedError(
                 f"Weight Only Quantization is unsupported on {module.quant_config.layer_quant_mode}."
             )
 
+        # notice the weight shape for weight-only is different from the original shape,
+        # since the quantized weights have their own layout
         w3_w1_weight_shape = (module.expert_size_per_partition,
-                              module.intermediate_size_per_partition * 2,
-                              module.hidden_size // weight_id)
-        w2_weight_shape = (module.expert_size_per_partition, module.hidden_size,
-                           module.intermediate_size_per_partition // weight_id)
+                              module.hidden_size,
+                              module.intermediate_size_per_partition * 2)
+        w2_weight_shape = (module.expert_size_per_partition,
+                           module.intermediate_size_per_partition,
+                           module.hidden_size)
 
         fc31_weight_scale = nn.Parameter(torch.empty(
             module.expert_size_per_partition,
@@ -938,22 +939,22 @@ def load_expert_w3_w1_weight(self, module: torch.nn.Module,
         w31_weight_shard = torch.cat([w3_weight_shard, w1_weight_shard], dim=0)
 
         # preprocess the weights for mixed gemm
-        preprocessor = preprocess_weights_for_mixed_gemm
         if module.quant_config.layer_quant_mode.is_int8_weight_only():
             weight_dtype = torch.int8
-        elif module.quant_config.layer_quant_mode.is_int4_weight_only():
-            weight_dtype = torch.quint4x2
-            packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4
-            unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8
-            w31_weight_shard = packer(
-                unpacker(w31_weight_shard.cpu()).T.contiguous()).to(
-                    w31_weight_shard.device)
+        # elif module.quant_config.layer_quant_mode.is_int4_weight_only():
+        #     weight_dtype = torch.quint4x2
+        #     packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4
+        #     unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8
+        #     w31_weight_shard = packer(
+        #         unpacker(w31_weight_shard.cpu()).T.contiguous()).to(
+        #             w31_weight_shard.device)
 
         assert module.dtype in [torch.float16, torch.bfloat16], \
             f"activation dtype should be float16 or bfloat16, got {module.dtype}"
-        w31_weight_shard = preprocessor(w31_weight_shard, weight_dtype,
-                                        module.dtype).view(
-                                            dst_w3_w1_weight.shape)
+
+        w31_weight_shard = module.preprocessor(w31_weight_shard.T.contiguous(),
+                                               weight_dtype, module.dtype,
+                                               module.sm_version).contiguous()
         dst_w3_w1_weight.copy_(w31_weight_shard.view(dst_w3_w1_weight.dtype),
                                non_blocking=True)
 
@@ -968,22 +969,22 @@ def load_expert_w2_weight(self, module: torch.nn.Module,
                               TensorParallelMode.ROW)
 
         # preprocess the weights for mixed gemm
-        preprocessor = preprocess_weights_for_mixed_gemm
         if module.quant_config.layer_quant_mode.is_int8_weight_only():
             weight_dtype = torch.int8
-        elif module.quant_config.layer_quant_mode.is_int4_weight_only():
-            weight_dtype = torch.quint4x2
-            packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4
-            unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8
-            w2_weight_shard = packer(
-                unpacker(w2_weight_shard.cpu()).T.contiguous()).to(
-                    w2_weight_shard.device)
+        # elif module.quant_config.layer_quant_mode.is_int4_weight_only():
+        #     weight_dtype = torch.quint4x2
+        #     packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4
+        #     unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8
+        #     w31_weight_shard = packer(
+        #         unpacker(w31_weight_shard.cpu()).T.contiguous()).to(
+        #             w31_weight_shard.device)
 
         assert module.dtype in [torch.float16, torch.bfloat16], \
             f"activation dtype should be float16 or bfloat16, got {module.dtype}"
-        w2_weight_shard = preprocessor(w2_weight_shard, weight_dtype,
-                                       module.dtype).view(dst_w2_weight.shape)
 
+        w2_weight_shard = module.preprocessor(w2_weight_shard.T.contiguous(),
+                                              weight_dtype, module.dtype,
+                                              module.sm_version).contiguous()
         dst_w2_weight.copy_(w2_weight_shard.view(dst_w2_weight.dtype),
                             non_blocking=True)
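
The weight-loading change is the other half of the fix: the destination parameters now keep the [hidden_size, 2 * inter_size] and [inter_size, hidden_size] per-expert shapes, and each expert shard is transposed and passed through preprocess_weights_for_mixed_gemm together with an SM version that is capped at 80 for SM90+ GPUs. A minimal sketch of that per-expert load path with the preprocessor passed in explicitly (this free-function form is illustrative; the commit attaches it to the module as module.preprocessor and module.sm_version):

import torch

def load_woq_expert_shard(dst_weight: torch.Tensor,
                          weight_shard: torch.Tensor,
                          preprocessor,  # e.g. preprocess_weights_for_mixed_gemm
                          activation_dtype: torch.dtype,
                          sm_version: int) -> None:
    # Per the diff, SM90+ reuses the SM80 preprocessing path.
    sm_version = 80 if sm_version >= 90 else sm_version
    # Transpose the shard into the layout the mixed-gemm preprocessor expects,
    # then copy the processed bytes into the destination parameter.
    processed = preprocessor(weight_shard.T.contiguous(), torch.int8,
                             activation_dtype, sm_version).contiguous()
    dst_weight.copy_(processed.view(dst_weight.dtype), non_blocking=True)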

tests/unittest/_torch/helpers.py

Lines changed: 12 additions & 0 deletions

@@ -75,6 +75,18 @@ def calc_diff(x, y):
     return 1 - sim
 
 
+def calc_woq_tolerence(x: torch.Tensor, weight_dtype: torch.dtype):
+    if weight_dtype == torch.int8:
+        bits_in_type = 8
+    elif weight_dtype == torch.quint4x2:
+        bits_in_type = 4
+    quant_range_scale = 1.0 / float(1 << (bits_in_type - 1))
+    max_val = torch.max(abs(x)).item()
+    atol = (max_val * quant_range_scale) * 1.5  # allow for rounding
+
+    return atol
+
+
 def reference_moe_torch(x: torch.Tensor, selected_experts: torch.Tensor,
                         final_scales: torch.Tensor, num_experts: int,
                         weights: Dict[str, torch.Tensor]) -> torch.Tensor:
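
calc_woq_tolerence turns the quantization step of the largest-magnitude reference value into an absolute tolerance: step = max|x| / 2^(bits - 1), relaxed by 1.5x to allow for rounding. A typical usage sketch in a test, assuming calc_woq_tolerence from the hunk above is in scope and using hypothetical reference/output tensors:

import torch

# calc_woq_tolerence is defined in tests/unittest/_torch/helpers.py (see the diff above);
# the import path depends on how the test suite is laid out.

ref = torch.randn(16, 64)
output = ref + 1e-3 * torch.randn_like(ref)     # stand-in for the fused MoE result

atol = calc_woq_tolerence(ref, torch.quint4x2)  # INT4: max|ref| / 8, relaxed by 1.5x
torch.testing.assert_close(output, ref, atol=atol, rtol=0.0)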
