From aaeb67b9d074a05d7384c15a04acfaa6738df12d Mon Sep 17 00:00:00 2001
From: Hui Gao
Date: Mon, 9 Jun 2025 04:48:11 -0700
Subject: [PATCH 1/9] Use backend instead of macro to control enabling MNNVL
 all-reduce
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Hui Gao
---
 tensorrt_llm/_torch/distributed/ops.py        | 121 ++--
 tensorrt_llm/_torch/model_config.py           |   1 +
 .../_torch/models/modeling_deepseekv3.py      |   4 +-
 .../defs/output/gpu_monitoring.csv            | 525 ++++++++++++++++++
 .../defs/output/perf_script_test_results.csv  |   3 +
 .../defs/output/session_properties.csv        |   2 +
 .../qa/trt_llm_release_perf_test.txt          |   1 +
 .../_torch/multi_gpu/test_mnnvl_allreduce.py  |  19 +-
 8 files changed, 624 insertions(+), 52 deletions(-)
 create mode 100644 tests/integration/defs/output/gpu_monitoring.csv
 create mode 100644 tests/integration/defs/output/perf_script_test_results.csv
 create mode 100644 tests/integration/defs/output/session_properties.csv
 create mode 100644 tests/integration/test_lists/qa/trt_llm_release_perf_test.txt

diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py
index 44ab7b1c8dd..085dff61171 100644
--- a/tensorrt_llm/_torch/distributed/ops.py
+++ b/tensorrt_llm/_torch/distributed/ops.py
@@ -307,14 +307,15 @@ def __init__(self, mapping: Mapping, dtype: torch.dtype):
         super().__init__()
         self.mapping = mapping
         self.dtype = dtype
-        self.enable_mnnvl = (os.environ.get("TRTLLM_MNNVL_AR_ENABLED",
-                                            "0") == "1"
-                             and dtype in [torch.bfloat16, torch.float32]
-                             and (not mapping.has_cp()))
+        assert (dtype in MNNVLAllReduce.get_supported_dtype() and not mapping.has_cp()
+                ), "MNNVLAllReduce requires a supported dtype and no context parallelism"

-        if self.enable_mnnvl:
-            self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace(
-                self.mapping, dtype)
+        self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace(
+            self.mapping, dtype)
+
+    @staticmethod
+    def get_supported_dtype():
+        return [torch.bfloat16, torch.float32]

     def forward(
         self,
@@ -330,7 +331,7 @@ def forward(
         Returns:
             Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s)
         """
-        if not self.enable_mnnvl or input.numel() > self.max_num_elements_mnnvl:
+        if input.numel() > self.max_num_elements_mnnvl:
             return None

         fusion_op = all_reduce_params.fusion_op
@@ -368,12 +369,63 @@ def forward(

         return None


+class TLLMAllReduce(nn.Module):
+    """AllReduce implementation backed by the TRT-LLM all-reduce kernels.
+
+    This class owns the workspace allocation and strategy handling for
+    torch.ops.trtllm.allreduce and serves as the fallback path when the MNNVL
+    all-reduce is unavailable or not applicable.
+    """
+
+    def __init__(self,
+                 mapping: Mapping,
+                 strategy: AllReduceStrategy = AllReduceStrategy.AUTO):
+        super().__init__()
+        self.mapping = mapping
+        self.strategy = strategy
+        self.workspace = None
+
+        self.force_low_precision_env = os.environ.get(
+            "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY")
+        # When Strategy is UB, it is guaranteed that the workspace is not used.
+        if self.strategy != AllReduceStrategy.UB:
+            if self.strategy == AllReduceStrategy.LOWPRECISION or self.force_low_precision_env is not None:
+                allocate_low_presicion_allreduce_workspace(self.mapping)
+            self.workspace = get_allreduce_workspace(self.mapping)
+
+    def forward(
+        self,
+        input: torch.Tensor,
+        all_reduce_params: AllReduceParams,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        """Forward pass for the TRT-LLM all-reduce path.
+
+        Args:
+            input (torch.Tensor): Input tensor to be reduced
+            all_reduce_params (Optional[AllReduceParams]): Parameters for fused operations
+
+        Returns:
+            Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s)
+        """
+        output = torch.ops.trtllm.allreduce(
+            input=input,
+            residual=all_reduce_params.residual,
+            norm_weight=all_reduce_params.norm_weight,
+            scale=all_reduce_params.scale,
+            bias=all_reduce_params.bias,
+            workspace=self.workspace,
+            group=self.mapping.tp_group,
+            strategy=self.strategy,
+            op=all_reduce_params.fusion_op,
+            eps=all_reduce_params.eps,
+        )
+        return output
+
+
 class AllReduce(nn.Module):

     def __init__(self,
                  mapping: Mapping,
                  strategy: AllReduceStrategy = AllReduceStrategy.AUTO,
-                 dtype: Optional[torch.dtype] = None):
+                 dtype: Optional[torch.dtype] = None,
+                 ar_backend: str = "TRTLLM"):
         super().__init__()
         """
         AllReduce is a module that performs an all-reduce operation on a tensor.
@@ -415,23 +467,23 @@ def __init__(self,
             or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY
             when using the AUTO strategy.
         """
+        self.skip_ar = mapping.tp_size == 1
+        self.mnnvl_allreduce = None
+        self._tllm_allreduce = None
+        self._create_allreduce(mapping, ar_backend, strategy, dtype)

-        self.mapping = mapping
-        self.workspace = None
-        self.strategy = strategy
+    def _create_allreduce(self, mapping, backend, strategy, dtype):
+        if mapping.tp_size == 1:
+            return

-        self.force_low_precision_env = os.environ.get(
-            "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY")
-        if self.mapping.tp_size > 1:
-            # When Strategy is UB, it is guaranteed that the workspace is not used.
-            if self.strategy != AllReduceStrategy.UB:
-                if self.strategy == AllReduceStrategy.LOWPRECISION or self.force_low_precision_env is not None:
-                    allocate_low_presicion_allreduce_workspace(self.mapping)
-                self.workspace = get_allreduce_workspace(self.mapping)
+        enable_mnnvl = (backend == "MNNVL"
+                        and (dtype
+                             and dtype in MNNVLAllReduce.get_supported_dtype())
+                        and (not mapping.has_cp()) and mapping.tp_size > 1)
+        if enable_mnnvl:
+            self.mnnvl_allreduce = MNNVLAllReduce(mapping, dtype)

-        # Initialize MNNVL AllReduce if needed
-        self.mnnvl_allreduce = MNNVLAllReduce(mapping,
-                                              dtype) if dtype else None
+        self._tllm_allreduce = TLLMAllReduce(mapping, strategy)

     def forward(
         self,
@@ -460,37 +512,26 @@ def forward(
             RESIDUAL_RMS_NORM_QUANT_FP8: [norm_quant, residual]
             RESIDUAL_RMS_NORM_OUT_QUANT_FP8: [norm, norm_quant, residual]
             RESIDUAL_RMS_NORM_QUANT_NVFP4: [norm_quant_fp4, scale_factor, residual]
             RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual]
         '''
-        if self.mapping.tp_size == 1 or (all_reduce_params is not None
-                                         and all_reduce_params.enable_allreduce
-                                         == False):
+        if self.skip_ar or (all_reduce_params is not None
+                            and all_reduce_params.enable_allreduce == False):
             return input

         if all_reduce_params is None:
             all_reduce_params = AllReduceParams()

-        # Try MNNVL AllReduce first if available
         if self.mnnvl_allreduce:
             mnnvl_output = self.mnnvl_allreduce(
                 input, all_reduce_params=all_reduce_params)
             if mnnvl_output is not None:
                 return mnnvl_output

-        # Fall back to regular AllReduce if MNNVL is not available or not applicable
-        output = torch.ops.trtllm.allreduce(
+        # MNNVL supports only a subset of AllReduceFusionOp; otherwise fall back here.
+        output = self._tllm_allreduce(
             input=input,
-            residual=all_reduce_params.residual,
-            norm_weight=all_reduce_params.norm_weight,
-            scale=all_reduce_params.scale,
-            bias=all_reduce_params.bias,
-            workspace=self.workspace,
-            group=self.mapping.tp_group,
-            strategy=self.strategy,
-            op=all_reduce_params.fusion_op,
-            eps=all_reduce_params.eps,
+            all_reduce_params=all_reduce_params,
         )
-
         return output if len(output) > 1 else output[0]

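Note: the ops.py change above replaces the TRTLLM_MNNVL_AR_ENABLED environment variable with an explicit ar_backend argument on AllReduce. A minimal usage sketch of the new call pattern follows; the Mapping values below are illustrative assumptions, not part of this patch.

    import torch
    from tensorrt_llm.mapping import Mapping
    from tensorrt_llm._torch.distributed.ops import AllReduce

    # Hypothetical tensor-parallel-2 mapping without context parallelism.
    mapping = Mapping(world_size=2, tp_size=2, rank=0)

    # Before this patch, MNNVL all-reduce was opted into via
    #   os.environ["TRTLLM_MNNVL_AR_ENABLED"] = "1"
    # After this patch, it is requested through the backend argument. AllReduce
    # falls back to the TRT-LLM kernels when MNNVL cannot be used (unsupported
    # dtype, context parallelism, tp_size == 1, or an input larger than the
    # MNNVL workspace).
    allreduce = AllReduce(mapping, dtype=torch.bfloat16, ar_backend="MNNVL")
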
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index c2f817c25a2..ba3d359a499 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -77,6 +77,7 @@ class ModelConfig(Generic[TConfig]):

     attn_backend: str = 'TRTLLM'
     moe_backend: str = 'CUTLASS'  # options can be CUTLASS, TRTLLM
+    ar_backend: str = 'TRTLLM'  # options can be MNNVL, TRTLLM

     # If true, enable min-latency mode. Currently only used for Llama4.
     enable_min_latency: bool = False

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index ff22d3717ce..21918ed655c 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -628,7 +628,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
                                         eps=config.rms_norm_eps,
                                         dtype=config.torch_dtype)
         self.layer_idx = layer_idx
-        self.allreduce = AllReduce(self.mapping, dtype=config.torch_dtype)
+        self.allreduce = AllReduce(self.mapping,
+                                   dtype=config.torch_dtype,
+                                   ar_backend=model_config.ar_backend)
         self.moe_allreduce = MoEAllReduce(self.mapping)
         self.next_layer_layernorm: RMSNorm = None

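Note: ModelConfig (above) now carries the all-reduce backend choice, and modeling code such as modeling_deepseekv3.py forwards it to AllReduce unchanged. A hedged sketch of opting a model config into MNNVL, assuming ModelConfig's remaining dataclass fields keep their defaults:

    from tensorrt_llm._torch.model_config import ModelConfig

    # ar_backend defaults to 'TRTLLM'; set it to 'MNNVL' to request the
    # multi-node NVLink all-reduce path in layers that honor the option.
    model_config = ModelConfig(ar_backend="MNNVL")
    assert model_config.ar_backend == "MNNVL"
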
diff --git a/tests/integration/defs/output/gpu_monitoring.csv b/tests/integration/defs/output/gpu_monitoring.csv
new file mode 100644
index 00000000000..f0a9de5818a
--- /dev/null
+++ b/tests/integration/defs/output/gpu_monitoring.csv
@@ -0,0 +1,525 @@
+gpu_id,timestamp,gpu_clock__MHz,memory_clock__MHz,graphics_clock__MHz,gpu_utilization__pct,memory_utilization__pct,encoder_utilization__pct,decoder_utilization__pct,gpu_temperature__C,memory_temperature__C,fan_speed__pct,perf_state,power_draw__W,process_num
+0,2025-05-22 07:19:59.254958,345,1593,345,0,0,0,0,33,,,0,49.336,0
+0,2025-05-22 07:20:00.255244,345,1593,345,0,0,0,0,33,,,0,49.341,0
[... 521 further per-second monitoring rows for GPU 0 (2025-05-22 07:20:01 through 08:00:35) omitted ...]
+0,2025-05-22 08:00:36.302155,375,1593,375,0,0,0,0,39,,,0,80.476,0
diff --git a/tests/integration/defs/output/perf_script_test_results.csv b/tests/integration/defs/output/perf_script_test_results.csv
new file mode 100644
index 00000000000..4c256eadabe
--- /dev/null
+++ b/tests/integration/defs/output/perf_script_test_results.csv
@@ -0,0 +1,3 @@
+network_name,network_hash,sm_clk,mem_clk,gpu_idx,perf_case_name,test_name,original_test_name,raw_result,perf_metric,total_time__sec,start_timestamp,end_timestamp,state,command,threshold,absolute_threshold,metric_type
+"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",1755,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:20:22] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:20:22] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:20:25] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:20:25] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:20:25] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:20:25] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:20:25] [TRT-LLM] [I] Specified dtype 'auto'; inferred dtype 'bfloat16'.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] 
[I] Set nccl_plugin to None.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:20:34] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:21:56] [TRT-LLM] [I] Quantization done. Total time used: 82.46 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:16] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/quantized-checkpoint \nTotal time used 20.49 s.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:22:17] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:22:24] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 6364, GPU 636 (MiB)\n[05/22/2025-07:22:25] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1005, GPU +6, now: CPU 7168, GPU 642 (MiB)\n[05/22/2025-07:22:25] [TRT-LLM] [I] Set nccl_plugin to 
None.\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time of constructing network from module object 8.95593547821045 seconds\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:22:26] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:22:26] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:22:30] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:22:30] [TRT] [I] Global timing cache in use. Profiling results in this builder pass will be stored.\n[05/22/2025-07:22:30] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:22:40] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:40] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:22:43] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:43] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:43] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.11304ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:22:43] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:22:46] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:46] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:47] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:47] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:47] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.4582ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:22:47] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:22:51] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:51] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:51] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:51] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:51] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.58895ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:22:51] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:22:55] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:55] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:56] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:56] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:56] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.70618ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:22:56] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:22:59] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:59] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:00] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:00] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:00] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.7238ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:23:00] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:23:05] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:23:05] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:06] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:06] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.04059ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:23:06] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:23:06] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:23:06] [TRT] [I] Engine generation completed in 36.5777 seconds.\n[05/22/2025-07:23:06] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:23:09] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:42\n[05/22/2025-07:23:09] [TRT] [I] Serialized 4959 bytes of code generator cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 1912332 bytes of compilation cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:23:09] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:23:09] [TRT-LLM] [I] Build phase peak memory: 32810.35 MB, children: 11886.42 MB\n[05/22/2025-07:23:10] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:23:29] [TRT-LLM] [I] Engine serialized. Total time: 00:00:18\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-07:23:48] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:48] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:23:50] [TRT-LLM] [W] Found worker process 94149 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-07:23:50] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:23:50] [TRT-LLM] [W] Logger level already set from environment. 
Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8791 MiB\n[TensorRT-LLM][INFO] Engine load time 3243 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.46 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28322\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.32 GiB for max tokens in paged KV cache (906304).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set layernorm_quantization_plugin 
to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",36.5777,254.122006,2025-05-22 
07:19:59,2025-05-22 07:24:13,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME +"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",375,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:56:21] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:56:21] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:56:23] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:56:23] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:56:23] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. 
Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:56:23] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:56:23] [TRT-LLM] [I] Specified dtype 'auto'; inferred dtype 'bfloat16'.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:56:31] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:57:53] [TRT-LLM] [I] Quantization done. 
Total time used: 82.30 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/quantized-checkpoint \nTotal time used 20.10 s.\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:58:15] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:58:15] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:58:28] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 6151, GPU 636 (MiB)\n[05/22/2025-07:58:29] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1220, GPU +6, now: CPU 7170, GPU 642 (MiB)\n[05/22/2025-07:58:29] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time of constructing network from module object 14.438017129898071 seconds\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:58:29] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:58:29] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:58:33] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:58:33] [TRT] [I] Global timing cache in use. 
Profiling results in this builder pass will be stored.\n[05/22/2025-07:58:33] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:58:43] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:58:43] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:58:45] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:58:45] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:58:45] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 5.97675ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:58:45] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:59:00] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:00] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:01] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:01] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:01] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.52666ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:59:01] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:59:04] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:04] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:05] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:05] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:05] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.61347ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:59:05] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:59:08] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:08] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:09] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:09] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:09] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.88478ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:59:09] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:59:12] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:12] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:13] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:13] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:13] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.79197ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:59:13] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:59:18] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:18] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:18] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:18] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:18] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.09646ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:59:18] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:59:19] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:59:19] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:59:19] [TRT] [I] Engine generation completed in 46.117 seconds.\n[05/22/2025-07:59:19] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:59:21] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:51\n[05/22/2025-07:59:21] [TRT] [I] Serialized 5010 bytes of code generator cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 1654870 bytes of compilation cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:59:21] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:59:21] [TRT-LLM] [I] Build phase peak memory: 32983.74 MB, children: 11888.58 MB\n[05/22/2025-07:59:23] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:59:39] [TRT-LLM] [I] Engine serialized. 
Total time: 00:00:16\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-08:00:05] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:05] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-08:00:07] [TRT-LLM] [W] Found worker process 98701 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-08:00:07] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:07] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8786 MiB\n[TensorRT-LLM][INFO] Engine load time 3261 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.47 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28327\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.33 GiB for max tokens in paged KV cache (906464).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_plugin to 
None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set layernorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: 
/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",46.117,268.112764,2025-05-22 07:56:08,2025-05-22 08:00:36,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME diff --git a/tests/integration/defs/output/session_properties.csv b/tests/integration/defs/output/session_properties.csv new file mode 100644 index 00000000000..b86ff5e75bf --- /dev/null +++ b/tests/integration/defs/output/session_properties.csv @@ -0,0 +1,2 @@ +username,start_timestamp,hostname,ip,nvidia_driver_version,nvidia_device_count,os_properties,cpu_properties,gpu_properties,trt_change_id,trt_branch,commit_timestamp,cuda_version,cublas_version,cudnn_version,end_timestamp +,2025-05-22 07:19:47,ipp2-1606.nvidia.com,10.176.4.8,575.57.05,1,"{'os_name': 'posix', 'platform': 'Linux', 'platform_version': '#144-Ubuntu SMP Fri Feb 7 20:47:38 UTC 2025'}","{'cpu_count': 32, 'cpu_freq': {'current': 1500.167875, 'min': 1500.0, 'max': 3000.0}}","{'device_product_name': 'H100 PCIe', 'pci_device_id': 590418142}",,,,,,,2025-05-22 07:55:34 diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt b/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt new file mode 100644 index 00000000000..57e076ffb02 --- /dev/null +++ b/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt @@ -0,0 +1 @@ +perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8] diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index a2902ede1a8..e26946b1fb0 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import pickle import sys import traceback @@ -97,16 +96,14 @@ def row_linear_residual_norm_fusion_forward( reference_output = tuple(t.cuda() for t in reference_output) MPI.COMM_WORLD.barrier() - os.environ["TRTLLM_MNNVL_AR_ENABLED"] = "1" - - allreduce = AllReduce( - mapping=Mapping( - world_size=tensor_parallel_size, - tp_size=tensor_parallel_size, - rank=tensor_parallel_rank, - ), - dtype=dtype, - ) + + allreduce = AllReduce(mapping=Mapping( + world_size=tensor_parallel_size, + tp_size=tensor_parallel_size, + rank=tensor_parallel_rank, + ), + dtype=dtype, + ar_backend="MNVL") # Since all the modules here are provided by TRT-LLM, # so it has to be fullgraph compatible From 6130848acd42508757545e6ec13c9d77fc0dc2fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hui=20Gao=C3=A2=C2=80?= Date: Mon, 9 Jun 2025 04:48:45 -0700 Subject: [PATCH 2/9] Revert some change in tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hui Gao Signed-off-by: Hui Gao†--- .../defs/output/gpu_monitoring.csv | 525 ------------------ .../defs/output/perf_script_test_results.csv | 3 - .../defs/output/session_properties.csv | 2 - .../qa/trt_llm_release_perf_test.txt | 1 - 4 files changed, 531 deletions(-) delete mode 100644 tests/integration/defs/output/gpu_monitoring.csv delete mode 100644 tests/integration/defs/output/perf_script_test_results.csv delete mode 100644 tests/integration/defs/output/session_properties.csv delete mode 100644 tests/integration/test_lists/qa/trt_llm_release_perf_test.txt diff --git a/tests/integration/defs/output/gpu_monitoring.csv b/tests/integration/defs/output/gpu_monitoring.csv deleted file mode 100644 index f0a9de5818a..00000000000 --- a/tests/integration/defs/output/gpu_monitoring.csv +++ /dev/null @@ -1,525 +0,0 @@ -gpu_id,timestamp,gpu_clock__MHz,memory_clock__MHz,graphics_clock__MHz,gpu_utilization__pct,memory_utilization__pct,encoder_utilization__pct,decoder_utilization__pct,gpu_temperature__C,memory_temperature__C,fan_speed__pct,perf_state,power_draw__W,process_num -0,2025-05-22 07:19:59.254958,345,1593,345,0,0,0,0,33,,,0,49.336,0 -0,2025-05-22 07:20:00.255244,345,1593,345,0,0,0,0,33,,,0,49.341,0 -0,2025-05-22 07:20:01.255586,345,1593,345,0,0,0,0,33,,,0,49.335,0 -0,2025-05-22 07:20:02.255856,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:03.256133,345,1593,345,0,0,0,0,33,,,0,49.338,0 -0,2025-05-22 07:20:04.256400,345,1593,345,0,0,0,0,33,,,0,49.329,0 -0,2025-05-22 07:20:05.256668,345,1593,345,0,0,0,0,33,,,0,49.333,0 -0,2025-05-22 07:20:06.256911,345,1593,345,0,0,0,0,33,,,0,49.335,0 -0,2025-05-22 07:20:07.257181,345,1593,345,0,0,0,0,33,,,0,49.341,0 -0,2025-05-22 07:20:08.257467,345,1593,345,0,0,0,0,33,,,0,49.331,0 -0,2025-05-22 07:20:09.257742,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:10.258030,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:11.258311,345,1593,345,0,0,0,0,33,,,0,49.329,0 -0,2025-05-22 07:20:12.258595,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:13.258881,345,1593,345,0,0,0,0,33,,,0,49.327,0 -0,2025-05-22 07:20:14.259151,345,1593,345,0,0,0,0,33,,,0,49.336,0 -0,2025-05-22 07:20:15.259451,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:16.259675,345,1593,345,0,0,0,0,33,,,0,49.339,0 -0,2025-05-22 07:20:17.259991,345,1593,345,0,0,0,0,33,,,0,49.349,0 -0,2025-05-22 07:20:18.260332,345,1593,345,0,0,0,0,33,,,0,49.343,0 -0,2025-05-22 07:20:19.260653,345,1593,345,0,0,0,0,33,,,0,49.34,0 -0,2025-05-22 07:20:20.260928,345,1593,345,0,0,0,0,33,,,0,49.327,0 
-0,2025-05-22 07:20:21.261204,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:22.261520,345,1593,345,0,0,0,0,33,,,0,49.327,0 -0,2025-05-22 07:20:23.261836,345,1593,345,0,0,0,0,33,,,0,49.329,0 -0,2025-05-22 07:20:24.262109,345,1593,345,0,0,0,0,33,,,0,49.332,0 -0,2025-05-22 07:20:25.262378,345,1593,345,0,0,0,0,33,,,0,49.327,0 -0,2025-05-22 07:20:26.262645,345,1593,345,0,0,0,0,33,,,0,49.334,0 -0,2025-05-22 07:20:27.263164,1755,1593,1755,4,0,0,0,33,,,0,56.903,1 -0,2025-05-22 07:20:28.263556,1755,1593,1755,47,1,0,0,34,,,0,83.141,1 -0,2025-05-22 07:20:29.263933,1755,1593,1755,0,0,0,0,34,,,0,82.042,1 -0,2025-05-22 07:20:30.264296,1755,1593,1755,0,0,0,0,34,,,0,81.62,1 -0,2025-05-22 07:20:31.264652,1755,1593,1755,0,0,0,0,34,,,0,81.685,1 -0,2025-05-22 07:20:32.265029,1755,1593,1755,0,0,0,0,34,,,0,81.517,1 -0,2025-05-22 07:20:33.265392,1755,1593,1755,0,0,0,0,34,,,0,81.539,1 -0,2025-05-22 07:20:34.265751,1755,1593,1755,0,0,0,0,34,,,0,81.602,1 -0,2025-05-22 07:20:35.266153,1755,1593,1755,47,18,0,0,36,,,0,149.079,1 -0,2025-05-22 07:20:36.266565,1755,1593,1755,46,18,0,0,36,,,0,160.12,1 -0,2025-05-22 07:20:37.266972,1755,1593,1755,46,18,0,0,37,,,0,159.799,1 -0,2025-05-22 07:20:38.267387,1755,1593,1755,46,18,0,0,37,,,0,161.614,1 -0,2025-05-22 07:20:39.267802,1755,1593,1755,46,18,0,0,37,,,0,161.828,1 -0,2025-05-22 07:20:40.268223,1755,1593,1755,46,18,0,0,37,,,0,163.822,1 -0,2025-05-22 07:20:41.268630,1755,1593,1755,46,19,0,0,38,,,0,165.912,1 -0,2025-05-22 07:20:42.269043,1755,1593,1755,47,19,0,0,38,,,0,164.7,1 -0,2025-05-22 07:20:43.269479,1755,1593,1755,46,18,0,0,38,,,0,164.466,1 -0,2025-05-22 07:20:44.269904,1755,1593,1755,46,18,0,0,38,,,0,166.363,1 -0,2025-05-22 07:20:45.270322,1755,1593,1755,47,19,0,0,38,,,0,167.174,1 -0,2025-05-22 07:20:46.270754,1755,1593,1755,46,19,0,0,39,,,0,166.63,1 -0,2025-05-22 07:20:47.271183,1755,1593,1755,47,19,0,0,39,,,0,166.363,1 -0,2025-05-22 07:20:48.271607,1755,1593,1755,46,18,0,0,39,,,0,163.311,1 -0,2025-05-22 07:20:49.272020,1755,1593,1755,46,18,0,0,39,,,0,160.703,1 -0,2025-05-22 07:20:50.272437,1755,1593,1755,46,18,0,0,39,,,0,160.035,1 -0,2025-05-22 07:20:51.272861,1755,1593,1755,46,18,0,0,40,,,0,160.304,1 -0,2025-05-22 07:20:52.273307,1755,1593,1755,45,17,0,0,40,,,0,162.585,1 -0,2025-05-22 07:20:53.273747,1755,1593,1755,46,18,0,0,40,,,0,163.577,1 -0,2025-05-22 07:20:54.274167,1755,1593,1755,46,18,0,0,40,,,0,165.493,1 -0,2025-05-22 07:20:55.274583,1755,1593,1755,46,18,0,0,40,,,0,166.608,1 -0,2025-05-22 07:20:56.275023,1755,1593,1755,46,18,0,0,41,,,0,167.712,1 -0,2025-05-22 07:20:57.275448,1755,1593,1755,46,19,0,0,41,,,0,164.796,1 -0,2025-05-22 07:20:58.275873,1755,1593,1755,46,18,0,0,41,,,0,161.867,1 -0,2025-05-22 07:20:59.276240,1755,1593,1755,46,18,0,0,41,,,0,168.464,1 -0,2025-05-22 07:21:00.276665,1755,1593,1755,46,18,0,0,41,,,0,168.308,1 -0,2025-05-22 07:21:01.277080,1755,1593,1755,46,19,0,0,41,,,0,167.946,1 -0,2025-05-22 07:21:02.277514,1755,1593,1755,47,19,0,0,42,,,0,170.932,1 -0,2025-05-22 07:21:03.277944,1755,1593,1755,46,18,0,0,42,,,0,170.862,1 -0,2025-05-22 07:21:04.278368,1755,1593,1755,46,18,0,0,42,,,0,169.522,1 -0,2025-05-22 07:21:05.278789,1755,1593,1755,46,18,0,0,42,,,0,165.573,1 -0,2025-05-22 07:21:06.279217,1755,1593,1755,47,19,0,0,42,,,0,165.344,1 -0,2025-05-22 07:21:07.279646,1755,1593,1755,46,19,0,0,42,,,0,167.941,1 -0,2025-05-22 07:21:08.280085,1755,1593,1755,46,18,0,0,42,,,0,166.655,1 -0,2025-05-22 07:21:09.280498,1755,1593,1755,46,18,0,0,43,,,0,165.308,1 -0,2025-05-22 07:21:10.280920,1755,1593,1755,47,19,0,0,43,,,0,168.2,1 -0,2025-05-22 
07:21:11.281342,1755,1593,1755,46,18,0,0,43,,,0,166.143,1 -0,2025-05-22 07:21:12.281782,1755,1593,1755,46,18,0,0,43,,,0,164.653,1 -0,2025-05-22 07:21:13.282199,1755,1593,1755,46,36,0,0,43,,,0,165.197,1 -0,2025-05-22 07:21:14.282624,1755,1593,1755,46,18,0,0,44,,,0,165.117,1 -0,2025-05-22 07:21:15.283055,1755,1593,1755,46,18,0,0,43,,,0,164.62,1 -0,2025-05-22 07:21:16.283479,1755,1593,1755,46,18,0,0,44,,,0,165.582,1 -0,2025-05-22 07:21:17.283906,1755,1593,1755,47,18,0,0,44,,,0,168.788,1 -0,2025-05-22 07:21:18.284331,1755,1593,1755,47,18,0,0,44,,,0,166.465,1 -0,2025-05-22 07:21:19.284757,1755,1593,1755,45,18,0,0,44,,,0,163.746,1 -0,2025-05-22 07:21:20.285181,1755,1593,1755,45,18,0,0,44,,,0,163.653,1 -0,2025-05-22 07:21:21.285625,1755,1593,1755,45,18,0,0,44,,,0,163.048,1 -0,2025-05-22 07:21:22.286048,1755,1593,1755,46,18,0,0,44,,,0,162.94,1 -0,2025-05-22 07:21:23.286485,1755,1593,1755,47,19,0,0,44,,,0,163.415,1 -0,2025-05-22 07:21:24.286905,1755,1593,1755,46,18,0,0,44,,,0,164.032,1 -0,2025-05-22 07:21:25.287338,1755,1593,1755,46,18,0,0,45,,,0,163.911,1 -0,2025-05-22 07:21:26.287772,1755,1593,1755,46,18,0,0,45,,,0,164.336,1 -0,2025-05-22 07:21:27.288204,1755,1593,1755,47,18,0,0,45,,,0,165.044,1 -0,2025-05-22 07:21:28.288625,1755,1593,1755,46,18,0,0,45,,,0,168.746,1 -0,2025-05-22 07:21:29.289053,1755,1593,1755,46,18,0,0,45,,,0,172.765,1 -0,2025-05-22 07:21:30.289496,1755,1593,1755,46,18,0,0,45,,,0,171.735,1 -0,2025-05-22 07:21:31.289927,1755,1593,1755,46,18,0,0,45,,,0,170.906,1 -0,2025-05-22 07:21:32.290358,1755,1593,1755,46,18,0,0,45,,,0,170.166,1 -0,2025-05-22 07:21:33.290777,1755,1593,1755,47,18,0,0,45,,,0,167.227,1 -0,2025-05-22 07:21:34.291194,1755,1593,1755,46,18,0,0,46,,,0,163.288,1 -0,2025-05-22 07:21:35.291620,1755,1593,1755,47,18,0,0,46,,,0,163.8,1 -0,2025-05-22 07:21:36.292050,1755,1593,1755,47,19,0,0,46,,,0,164.799,1 -0,2025-05-22 07:21:37.292474,1755,1593,1755,47,19,0,0,46,,,0,168.345,1 -0,2025-05-22 07:21:38.292900,1755,1593,1755,46,18,0,0,46,,,0,169.427,1 -0,2025-05-22 07:21:39.293340,1755,1593,1755,47,18,0,0,46,,,0,168.9,1 -0,2025-05-22 07:21:40.293802,1755,1593,1755,47,19,0,0,46,,,0,169.208,1 -0,2025-05-22 07:21:41.294219,1755,1593,1755,47,19,0,0,46,,,0,168.596,1 -0,2025-05-22 07:21:42.294645,1755,1593,1755,46,18,0,0,46,,,0,166.093,1 -0,2025-05-22 07:21:43.295066,1755,1593,1755,47,18,0,0,46,,,0,169.899,1 -0,2025-05-22 07:21:44.295498,1755,1593,1755,47,19,0,0,46,,,0,171.042,1 -0,2025-05-22 07:21:45.295924,1755,1593,1755,47,18,0,0,47,,,0,172.313,1 -0,2025-05-22 07:21:46.296353,1755,1593,1755,46,18,0,0,47,,,0,171.179,1 -0,2025-05-22 07:21:47.296778,1755,1593,1755,46,18,0,0,47,,,0,173.428,1 -0,2025-05-22 07:21:48.297203,1755,1593,1755,46,18,0,0,47,,,0,172.265,1 -0,2025-05-22 07:21:49.297592,1755,1593,1755,46,18,0,0,47,,,0,169.976,1 -0,2025-05-22 07:21:50.298010,1755,1593,1755,46,18,0,0,47,,,0,167.299,1 -0,2025-05-22 07:21:51.298436,1755,1593,1755,46,18,0,0,47,,,0,169.135,1 -0,2025-05-22 07:21:52.298858,1755,1593,1755,46,18,0,0,47,,,0,168.709,1 -0,2025-05-22 07:21:53.299286,1755,1593,1755,47,18,0,0,47,,,0,172.096,1 -0,2025-05-22 07:21:54.299709,1755,1593,1755,47,18,0,0,47,,,0,169.99,1 -0,2025-05-22 07:21:55.300131,1755,1593,1755,46,18,0,0,47,,,0,170.417,1 -0,2025-05-22 07:21:56.300561,1755,1593,1755,44,17,0,0,47,,,0,168.859,1 -0,2025-05-22 07:21:57.300950,1755,1593,1755,21,2,0,0,46,,,0,110.054,1 -0,2025-05-22 07:21:58.301328,1755,1593,1755,22,2,0,0,46,,,0,95.543,1 -0,2025-05-22 07:21:59.301714,1755,1593,1755,0,0,0,0,45,,,0,90.746,1 -0,2025-05-22 
[... several hundred removed per-second GPU monitoring rows from gpu_monitoring.csv ("-0,<timestamp>,<clocks>,<utilization>,<temperature>,<power>,..." samples recorded on 2025-05-22 between 07:22 and 08:00) elided for brevity ...]
diff --git a/tests/integration/defs/output/perf_script_test_results.csv b/tests/integration/defs/output/perf_script_test_results.csv
deleted file mode 100644
index 4c256eadabe..00000000000
--- a/tests/integration/defs/output/perf_script_test_results.csv
+++ 
/dev/null @@ -1,3 +0,0 @@ -network_name,network_hash,sm_clk,mem_clk,gpu_idx,perf_case_name,test_name,original_test_name,raw_result,perf_metric,total_time__sec,start_timestamp,end_timestamp,state,command,threshold,absolute_threshold,metric_type -"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",1755,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:20:22] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:20:22] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:20:25] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:20:25] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:20:25] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:20:25] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:20:25] [TRT-LLM] [I] Specified dtype 'auto'; inferred 
dtype 'bfloat16'.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:20:34] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:21:56] [TRT-LLM] [I] Quantization done. Total time used: 82.46 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:16] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/quantized-checkpoint \nTotal time used 20.49 s.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:22:17] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:22:24] [TRT] [I] [MemUsageChange] 
Init CUDA: CPU -2, GPU +0, now: CPU 6364, GPU 636 (MiB)\n[05/22/2025-07:22:25] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1005, GPU +6, now: CPU 7168, GPU 642 (MiB)\n[05/22/2025-07:22:25] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time of constructing network from module object 8.95593547821045 seconds\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:22:26] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:22:26] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:22:30] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:22:30] [TRT] [I] Global timing cache in use. Profiling results in this builder pass will be stored.\n[05/22/2025-07:22:30] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:22:40] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:40] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:22:43] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:43] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:43] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.11304ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:22:43] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:22:46] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:46] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:47] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:47] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:47] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.4582ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:22:47] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:22:51] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:51] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:51] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:51] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:51] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.58895ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:22:51] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:22:55] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:55] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:56] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:56] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:56] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.70618ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:22:56] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:22:59] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:59] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:00] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:00] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:00] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.7238ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:23:00] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:23:05] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:23:05] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:06] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:06] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.04059ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:23:06] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:23:06] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:23:06] [TRT] [I] Engine generation completed in 36.5777 seconds.\n[05/22/2025-07:23:06] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:23:09] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:42\n[05/22/2025-07:23:09] [TRT] [I] Serialized 4959 bytes of code generator cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 1912332 bytes of compilation cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:23:09] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:23:09] [TRT-LLM] [I] Build phase peak memory: 32810.35 MB, children: 11886.42 MB\n[05/22/2025-07:23:10] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:23:29] [TRT-LLM] [I] Engine serialized. Total time: 00:00:18\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-07:23:48] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:48] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:23:50] [TRT-LLM] [W] Found worker process 94149 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-07:23:50] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:23:50] [TRT-LLM] [W] Logger level already set from environment. 
Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8791 MiB\n[TensorRT-LLM][INFO] Engine load time 3243 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.46 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28322\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.32 GiB for max tokens in paged KV cache (906304).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set layernorm_quantization_plugin 
to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",36.5777,254.122006,2025-05-22 
07:19:59,2025-05-22 07:24:13,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME -"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",375,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:56:21] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:56:21] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:56:23] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:56:23] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:56:23] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. 
Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:56:23] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:56:23] [TRT-LLM] [I] Specified dtype 'auto'; inferred dtype 'bfloat16'.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:56:31] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:57:53] [TRT-LLM] [I] Quantization done. 
Total time used: 82.30 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/quantized-checkpoint \nTotal time used 20.10 s.\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:58:15] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:58:15] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:58:28] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 6151, GPU 636 (MiB)\n[05/22/2025-07:58:29] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1220, GPU +6, now: CPU 7170, GPU 642 (MiB)\n[05/22/2025-07:58:29] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time of constructing network from module object 14.438017129898071 seconds\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:58:29] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:58:29] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:58:33] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:58:33] [TRT] [I] Global timing cache in use. 
Profiling results in this builder pass will be stored.\n[05/22/2025-07:58:33] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:58:43] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:58:43] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:58:45] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:58:45] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:58:45] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 5.97675ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:58:45] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:59:00] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:00] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:01] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:01] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:01] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.52666ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:59:01] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:59:04] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:04] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:05] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:05] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:05] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.61347ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:59:05] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:59:08] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:08] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:09] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:09] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:09] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.88478ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:59:09] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:59:12] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:12] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:13] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:13] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:13] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.79197ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:59:13] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:59:18] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:18] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:18] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:18] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:18] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.09646ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:59:18] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:59:19] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:59:19] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:59:19] [TRT] [I] Engine generation completed in 46.117 seconds.\n[05/22/2025-07:59:19] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:59:21] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:51\n[05/22/2025-07:59:21] [TRT] [I] Serialized 5010 bytes of code generator cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 1654870 bytes of compilation cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:59:21] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:59:21] [TRT-LLM] [I] Build phase peak memory: 32983.74 MB, children: 11888.58 MB\n[05/22/2025-07:59:23] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:59:39] [TRT-LLM] [I] Engine serialized. 
Total time: 00:00:16\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-08:00:05] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:05] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-08:00:07] [TRT-LLM] [W] Found worker process 98701 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-08:00:07] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:07] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8786 MiB\n[TensorRT-LLM][INFO] Engine load time 3261 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.47 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28327\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.33 GiB for max tokens in paged KV cache (906464).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_plugin to 
None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set layernorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: 
/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",46.117,268.112764,2025-05-22 07:56:08,2025-05-22 08:00:36,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME diff --git a/tests/integration/defs/output/session_properties.csv b/tests/integration/defs/output/session_properties.csv deleted file mode 100644 index b86ff5e75bf..00000000000 --- a/tests/integration/defs/output/session_properties.csv +++ /dev/null @@ -1,2 +0,0 @@ -username,start_timestamp,hostname,ip,nvidia_driver_version,nvidia_device_count,os_properties,cpu_properties,gpu_properties,trt_change_id,trt_branch,commit_timestamp,cuda_version,cublas_version,cudnn_version,end_timestamp -,2025-05-22 07:19:47,ipp2-1606.nvidia.com,10.176.4.8,575.57.05,1,"{'os_name': 'posix', 'platform': 'Linux', 'platform_version': '#144-Ubuntu SMP Fri Feb 7 20:47:38 UTC 2025'}","{'cpu_count': 32, 'cpu_freq': {'current': 1500.167875, 'min': 1500.0, 'max': 3000.0}}","{'device_product_name': 'H100 PCIe', 'pci_device_id': 590418142}",,,,,,,2025-05-22 07:55:34 diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt b/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt deleted file mode 100644 index 57e076ffb02..00000000000 --- a/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt +++ /dev/null @@ -1 +0,0 @@ -perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8] From 225c1b0f9ea347f29fb7065d7471b6248c429eaf Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Mon, 26 May 2025 01:13:20 +0000 Subject: [PATCH 3/9] Fix MNNVL name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hui Gao Signed-off-by: Hui Gao†--- tensorrt_llm/_torch/distributed/ops.py | 24 +++++++++++++++++------- tensorrt_llm/_torch/model_config.py | 2 +- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 085dff61171..5155e423d4f 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -307,8 
+307,10 @@ def __init__(self, mapping: Mapping, dtype: torch.dtype): super().__init__() self.mapping = mapping self.dtype = dtype - assert (dtype in MNNVLAllReduce.get_supported_dtype() - and (not mapping.has_cp())), "" + assert ( + dtype in MNNVLAllReduce.get_supported_dtype() + and (not mapping.has_cp()) + ), f"MNNVL all reduce only supports dtype {MNNVLAllReduce.get_supported_dtype()} and requires no cp." self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace( self.mapping, dtype) @@ -331,6 +333,9 @@ def forward( Returns: Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) """ + if self.mapping == 1: + return input + if input.numel() > self.max_num_elements_mnnvl: return None @@ -376,7 +381,9 @@ class TLLMAllReduce(nn.Module): for certain operations when using NVLink for multi-node communication. """ - def __init__(self, mapping: Mapping, strategy: AllReduceStrategy = AllReduceStrategy.AUTO): + def __init__(self, + mapping: Mapping, + strategy: AllReduceStrategy = AllReduceStrategy.AUTO): super().__init__() self.mapping = mapping self.strategy = strategy @@ -404,6 +411,9 @@ def forward( Returns: Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) """ + if self.mapping == 1: + return input + output = torch.ops.trtllm.allreduce( input=input, residual=all_reduce_params.residual, @@ -467,7 +477,7 @@ def __init__(self, or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY when using the AUTO strategy. """ - self.skip_ar = self.mapping.tp_size == 1 + self.skip_ar = mapping.tp_size == 1 self._mnvl_allreduce = None self._tllm_allreduce = None self._create_allreduce(mapping, ar_backend, strategy, dtype) @@ -521,13 +531,13 @@ def forward( if all_reduce_params is None: all_reduce_params = AllReduceParams() - if self.mnnvl_allreduce: - mnnvl_output = self.mnnvl_allreduce( + if self._mnvl_allreduce: + mnnvl_output = self._mnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output - # MNVL only support part of AllReduceFusionOp provided in params. + # MNNVL only support part of AllReduceFusionOp provided in params. output = self._tllm_allreduce( input=input, all_reduce_params=all_reduce_params, diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index ba3d359a499..e5b5ce048e8 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -77,7 +77,7 @@ class ModelConfig(Generic[TConfig]): attn_backend: str = 'TRTLLM' moe_backend: str = 'CUTLASS' # options can be CUTLASS, TRTLLM - ar_backend: str = 'TRTLLM' # options can be MNVL, TRTLLM + ar_backend: str = 'TRTLLM' # options can be MNNVL, TRTLLM # If true, enable min-latency mode. Currently only used for Llama4.
enable_min_latency: bool = False From d5b372cb53d320c642d83c90f03a5635e5064762 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 04:33:43 +0000 Subject: [PATCH 4/9] Address comments Signed-off-by: Hui Gao --- tensorrt_llm/_torch/distributed/ops.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 5155e423d4f..43b15a4d9f8 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -10,7 +10,12 @@ from tensorrt_llm._utils import mpi_barrier from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, +<<<<<<< HEAD AllReduceStrategy, MoEAllReduceParams) +======= + AllReduceStrategy) +from tensorrt_llm.logger import logger +>>>>>>> 7f3955b17 (Address comments) from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper @@ -478,7 +483,7 @@ def __init__(self, the AUTO strategy. """ self.skip_ar = mapping.tp_size == 1 - self._mnvl_allreduce = None + self._mnnvl_allreduce = None self._tllm_allreduce = None self._create_allreduce(mapping, ar_backend, strategy, dtype) @@ -491,7 +496,7 @@ def _create_allreduce(self, mapping, backend, strategy, dtype): and dtype in MNNVLAllReduce.get_supported_dtype()) and (not mapping.has_cp()) and mapping.tp_size > 1) if enable_mnnvl: - self._mnvl_allreduce = MNNVLAllReduce(mapping, dtype) + self._mnnvl_allreduce = MNNVLAllReduce(mapping, dtype) self._tllm_allreduce = TLLMAllReduce(mapping, strategy) @@ -522,7 +527,7 @@ def forward( RESIDUAL_RMS_NORM_QUANT_FP8: [norm_quant, residual] RESIDUAL_RMS_NORM_OUT_QUANT_FP8: [norm, norm_quant, residual] RESIDUAL_RMS_NORM_QUANT_NVFP4: [norm_quant_fp4, scale_factor, residual] - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual]P + RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual] ''' if self.skip_ar or (all_reduce_params is not None and all_reduce_params.enable_allreduce == False): @@ -531,11 +536,12 @@ def forward( if all_reduce_params is None: all_reduce_params = AllReduceParams() - if self._mnvl_allreduce: - mnnvl_output = self._mnvl_allreduce( + if self._mnnvl_allreduce: + mnnvl_output = self._mnnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output + logger.info(f"Fallback to tllm_allreduce.") # MNNVL only support part of AllReduceFusionOp provided in params. 
output = self._tllm_allreduce( From bd1183d4c7b76ecdfafd1eb2228e6f5e33277da2 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 04:35:17 +0000 Subject: [PATCH 5/9] Add strategy support in extra llm api config Signed-off-by: Hui Gao --- cpp/tensorrt_llm/thop/allreduceOp.cpp | 9 +- tensorrt_llm/_torch/distributed/ops.py | 165 +- tensorrt_llm/_torch/model_config.py | 21 +- .../_torch/models/modeling_deepseekv3.py | 7 +- tensorrt_llm/_torch/models/modeling_llama.py | 4 +- .../_torch/models/modeling_qwen3_moe.py | 4 +- tensorrt_llm/_torch/modules/fused_moe.py | 2513 +++++++++++++++++ .../modules/fused_moe/fused_moe_vanilla.py | 3 +- .../_torch/modules/fused_moe/interface.py | 3 +- tensorrt_llm/_torch/modules/linear.py | 4 +- tensorrt_llm/_torch/pyexecutor/config.py | 1 + tensorrt_llm/functional.py | 1 + .../_torch/multi_gpu/test_allreduce.py | 2 +- .../multi_gpu/test_lowprecision_allreduce.py | 6 +- .../_torch/multi_gpu/test_mnnvl_allreduce.py | 18 +- .../_torch/multi_gpu/test_user_buffers.py | 16 +- 16 files changed, 2636 insertions(+), 141 deletions(-) create mode 100755 tensorrt_llm/_torch/modules/fused_moe.py diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp index 25af1222aa6..d86a841fab9 100644 --- a/cpp/tensorrt_llm/thop/allreduceOp.cpp +++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp @@ -621,14 +621,12 @@ class AllreduceOp AllReduceStrategyType getRuntimeStrategy(size_t seq_len, size_t size) { - static char* force_nccl_all_reduce_strategy_char = std::getenv("FORCE_NCCL_ALL_REDUCE_STRATEGY"); - bool force_nccl_all_reduce_strategy = (force_nccl_all_reduce_strategy_char != nullptr); AllReduceStrategyType runtime_strategy; if (mStrategy == AllReduceStrategyType::UB) { runtime_strategy = AllReduceStrategyType::UB; } - else if (force_nccl_all_reduce_strategy || mStrategy == AllReduceStrategyType::NCCL) + else if (mStrategy == AllReduceStrategyType::NCCL) { runtime_strategy = AllReduceStrategyType::NCCL; } @@ -936,10 +934,7 @@ class AllreduceOp bool isUsingLowPrecision(size_t message_size) const noexcept { - static char* force_low_precision_allreduce_strategy_char - = std::getenv("FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY"); - bool force_low_precision = (force_low_precision_allreduce_strategy_char != nullptr) - || (mStrategy == AllReduceStrategyType::LOWPRECISION); + bool force_low_precision = mStrategy == AllReduceStrategyType::LOWPRECISION; #ifdef ENABLE_FP8 // Use LowPrecision if PCIe and p2p support and message size is larger than 2MB diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 43b15a4d9f8..7e18458a0f6 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -10,15 +10,13 @@ from tensorrt_llm._utils import mpi_barrier from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, -<<<<<<< HEAD AllReduceStrategy, MoEAllReduceParams) -======= - AllReduceStrategy) from tensorrt_llm.logger import logger ->>>>>>> 7f3955b17 (Address comments) from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper +from ..model_config import ModelConfig + _thread_local = threading.local() @@ -338,9 +336,6 @@ def forward( Returns: Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) """ - if self.mapping == 1: - return input - if input.numel() > self.max_num_elements_mnnvl: return None @@ -379,97 +374,41 @@ def forward( return None -class 
TLLMAllReduce(nn.Module): - """A specialized AllReduce implementation for Multi-Node NVLink communication. - - This class handles the MNNVL-specific allreduce operations, which can be more efficient - for certain operations when using NVLink for multi-node communication. - """ - - def __init__(self, - mapping: Mapping, - strategy: AllReduceStrategy = AllReduceStrategy.AUTO): - super().__init__() - self.mapping = mapping - self.strategy = strategy - self.workspace = None - - self.force_low_precision_env = os.environ.get( - "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY") - # When Strategy is UB, it is guaranteed that the workspace is not used. - if self.strategy != AllReduceStrategy.UB: - if self.strategy == AllReduceStrategy.LOWPRECISION or self.force_low_precision_env is not None: - allocate_low_presicion_allreduce_workspace(self.mapping) - self.workspace = get_allreduce_workspace(self.mapping) - - def forward( - self, - input: torch.Tensor, - all_reduce_params: AllReduceParams, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: - """Forward pass for MNNVL AllReduce. - - Args: - input (torch.Tensor): Input tensor to be reduced - all_reduce_params (Optional[AllReduceParams]): Parameters for fused operations - - Returns: - Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) - """ - if self.mapping == 1: - return input - - output = torch.ops.trtllm.allreduce( - input=input, - residual=all_reduce_params.residual, - norm_weight=all_reduce_params.norm_weight, - scale=all_reduce_params.scale, - bias=all_reduce_params.bias, - workspace=self.workspace, - group=self.mapping.tp_group, - strategy=self.strategy, - op=all_reduce_params.fusion_op, - eps=all_reduce_params.eps, - ) - return output - - class AllReduce(nn.Module): def __init__(self, - mapping: Mapping, - strategy: AllReduceStrategy = AllReduceStrategy.AUTO, dtype: Optional[torch.dtype] = None, - ar_backend: str = "TRTLLM"): + model_config: ModelConfig = ModelConfig()): super().__init__() """ AllReduce is a module that performs an all-reduce operation on a tensor. Args: - mapping (Mapping): The parallel mapping config. - strategy (AllReduceStrategy): - The following all-reduce strategies are supported: + model_config (ModelConfig): mapping and strategy in it are used. + mapping (Mapping): The parallel mapping config. + strategy (AllReduceStrategy): + The following all-reduce strategies are supported: - - UB: AllReduce uses user-buffer based all-reduce kernel. + - UB: AllReduce uses user-buffer based all-reduce kernel. - - NCCL: Use NCCL allreduce. + - NCCL: Use NCCL allreduce. - - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. + - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. - - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. + - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. - - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. - Should only be used on topologies with PCIe switches and without NVLink. - This strategy may result in some precision loss but can improve performance - on specific hardware configurations. + - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. + Should only be used on topologies with PCIe switches and without NVLink. + This strategy may result in some precision loss but can improve performance + on specific hardware configurations. 
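For illustration, a minimal sketch of how the strategy is now meant to be chosen through the model config rather than passed to AllReduce directly. The string-to-enum mapping is added to ModelConfig.__post_init__ further down in this patch; the snippet assumes a default-constructible ModelConfig (as used elsewhere in this series) and is not part of the patch itself:

    from tensorrt_llm._torch.model_config import ModelConfig
    from tensorrt_llm.functional import AllReduceStrategy

    # Sketch only: "allreduce_backend" accepts a strategy name such as "AUTO",
    # "NCCL", "UB", "MIN_LATENCY", "ONESHOT", "TWOSHOT", "LOWPRECISION" or
    # "MNNVL"; __post_init__ normalizes it to an AllReduceStrategy member and
    # falls back to AUTO for unrecognized names.
    cfg = ModelConfig(allreduce_backend="MNNVL")
    assert cfg.allreduce_backend == AllReduceStrategy.MNNVL

    # Modules then read the strategy from the config, e.g.
    #     AllReduce(dtype=torch.bfloat16, model_config=cfg)
    # instead of receiving a separate `strategy` / `ar_backend` argument.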
- All strategies support the following operations: - - NONE (AllReduce only) - - RESIDUAL_RMS_NORM - - RESIDUAL_RMS_NORM_QUANT_FP8 - - RESIDUAL_RMS_NORM_QUANT_NVFP4 - - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 - - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 + All strategies support the following operations: + - NONE (AllReduce only) + - RESIDUAL_RMS_NORM + - RESIDUAL_RMS_NORM_QUANT_FP8 + - RESIDUAL_RMS_NORM_QUANT_NVFP4 + - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 + - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 Note: NCCL, UB, and LOWPRECISION strategies only support consequent kernel calls instead of fused operations. @@ -482,23 +421,27 @@ def __init__(self, or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY when using the AUTO strategy. """ - self.skip_ar = mapping.tp_size == 1 - self._mnnvl_allreduce = None - self._tllm_allreduce = None - self._create_allreduce(mapping, ar_backend, strategy, dtype) - def _create_allreduce(self, mapping, backend, strategy, dtype): - if mapping.tp_size == 1: - return - - enable_mnnvl = (backend == "MNNVL" - and (dtype - and dtype in MNNVLAllReduce.get_supported_dtype()) - and (not mapping.has_cp()) and mapping.tp_size > 1) - if enable_mnnvl: - self._mnnvl_allreduce = MNNVLAllReduce(mapping, dtype) + self.mapping = model_config.mapping + self.workspace = None + self.strategy = model_config.allreduce_backend + self.mnnvl_allreduce = None - self._tllm_allreduce = TLLMAllReduce(mapping, strategy) + self.force_low_precision_env = os.environ.get( + "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY") + if self.mapping.tp_size > 1: + # When Strategy is UB, it is guaranteed that the workspace is not used. + if self.strategy != AllReduceStrategy.UB: + if self.strategy == AllReduceStrategy.LOWPRECISION: + allocate_low_presicion_allreduce_workspace(self.mapping) + self.workspace = get_allreduce_workspace(self.mapping) + + # Initialize MNNVL AllReduce if needed + if self.strategy == AllReduceStrategy.MNNVL and ( + dtype and dtype in MNNVLAllReduce.get_supported_dtype() + ) and (not self.mapping.has_cp()): + self.mnnvl_allreduce = MNNVLAllReduce(self.mapping, + dtype) if dtype else None def forward( self, @@ -529,25 +472,35 @@ def forward( RESIDUAL_RMS_NORM_QUANT_NVFP4: [norm_quant_fp4, scale_factor, residual] RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual] ''' - if self.skip_ar or (all_reduce_params is not None - and all_reduce_params.enable_allreduce == False): + if self.mapping.tp_size == 1 or (all_reduce_params is not None + and all_reduce_params.enable_allreduce + == False): return input if all_reduce_params is None: all_reduce_params = AllReduceParams() - if self._mnnvl_allreduce: - mnnvl_output = self._mnnvl_allreduce( + # Try MNNVL AllReduce first if available + if self.mnnvl_allreduce: + mnnvl_output = self.mnnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output - logger.info(f"Fallback to tllm_allreduce.") - # MNNVL only support part of AllReduceFusionOp provided in params. 
- output = self._tllm_allreduce( + # Fall back to regular AllReduce if MNNVL is not available or not applicable + output = torch.ops.trtllm.allreduce( input=input, - all_reduce_params=all_reduce_params, + residual=all_reduce_params.residual, + norm_weight=all_reduce_params.norm_weight, + scale=all_reduce_params.scale, + bias=all_reduce_params.bias, + workspace=self.workspace, + group=self.mapping.tp_group, + strategy=self.strategy, + op=all_reduce_params.fusion_op, + eps=all_reduce_params.eps, ) + return output if len(output) > 1 else output[0] diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index e5b5ce048e8..0b0e8a9210e 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -8,6 +8,8 @@ from tensorrt_llm import logger from tensorrt_llm._utils import torch_dtype_to_binding +from tensorrt_llm.functional import AllReduceStrategy +from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization.mode import QuantAlgo @@ -77,7 +79,7 @@ class ModelConfig(Generic[TConfig]): attn_backend: str = 'TRTLLM' moe_backend: str = 'CUTLASS' # options can be CUTLASS, TRTLLM - ar_backend: str = 'TRTLLM' # options can be MNNVL, TRTLLM + allreduce_backend: AllReduceStrategy = AllReduceStrategy.AUTO # If true, enable min-latency mode. Currently only used for Llama4. enable_min_latency: bool = False @@ -107,6 +109,23 @@ def __post_init__(self): self.is_generation = self.is_generation_model( self.pretrained_config.architectures) + def map_ar_strategy(strategy: str = "AUTO"): + maps = { + "AUTO": AllReduceStrategy.AUTO, + "NCCL": AllReduceStrategy.NCCL, + "UB": AllReduceStrategy.UB, + "MIN_LATENCY": AllReduceStrategy.MIN_LATENCY, + "ONESHOT": AllReduceStrategy.ONESHOT, + "TWOSHOT": AllReduceStrategy.TWOSHOT, + "LOWPRECISION": AllReduceStrategy.LOWPRECISION, + "MNNVL": AllReduceStrategy.MNNVL + } + key = strategy.upper() + return maps[key] if key in maps else AllReduceStrategy.AUTO + + if isinstance(self.allreduce_backend, str): + self.allreduce_backend = map_ar_strategy(self.allreduce_backend) + @property def fuse_pos_embd(self): if self.attn_backend == 'TRTLLM': diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 21918ed655c..67973dc90ba 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -399,7 +399,7 @@ def __init__(self, overridden_tp_size=shared_tp_size, reduce_output=False) - self.allreduce = AllReduce(self.mapping) + self.allreduce = AllReduce(model_config=model_config) self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared] self.event_dict = { key: torch.cuda.Event() @@ -628,9 +628,8 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], eps=config.rms_norm_eps, dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(self.mapping, - dtype=config.torch_dtype, - ar_backend=model_config.ar_backend) + self.allreduce = AllReduce(dtype=config.torch_dtype, + model_config=model_config) self.moe_allreduce = MoEAllReduce(self.mapping) self.next_layer_layernorm: RMSNorm = None diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 600808c6b61..a852560af10 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -282,7 +282,7 @@ def __init__( 
quant_config=None) self.mapping = model_config.mapping - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(model_config=model_config) self.moe_event = [torch.cuda.Event(), torch.cuda.Event()] self.aux_stream = aux_stream @@ -414,7 +414,7 @@ def __init__( dtype=config.torch_dtype) self.mapping = model_config.mapping - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(model_config=model_config) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index f15a21df31d..6a1e13b1467 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -89,7 +89,7 @@ def __init__( self.top_k = config.num_experts_per_tok self.enable_attention_dp = model_config.mapping.enable_attention_dp self.mapping = model_config.mapping - self.allreduce = AllReduce(self.mapping) + self.allreduce = AllReduce(model_config=model_config) self.enable_alltoall = Qwen3MoE.should_enable_alltoall( model_config, self.top_k) if self.enable_alltoall: @@ -202,7 +202,7 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig], dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(self.mapping) + self.allreduce = AllReduce(model_config=model_config) self.next_layer_layernorm: RMSNorm = None self.fusion_config = EagerFusionConfig() diff --git a/tensorrt_llm/_torch/modules/fused_moe.py b/tensorrt_llm/_torch/modules/fused_moe.py new file mode 100755 index 00000000000..7df752814e2 --- /dev/null +++ b/tensorrt_llm/_torch/modules/fused_moe.py @@ -0,0 +1,2513 @@ +import copy +import math +import os +import threading +from enum import Enum, IntEnum +from typing import Dict, List, NamedTuple, Optional, Union + +import torch +from torch import nn + +from tensorrt_llm._mnnvl_utils import MnnvlMoe, MoEAlltoallInfo +from tensorrt_llm._utils import get_sm_version, logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.quantization.utils import fp4_utils +from tensorrt_llm.quantization.utils.fp4_utils import ( + get_reorder_rows_for_gated_act_gemm_row_indices, + get_shuffle_matrix_a_row_indices, get_shuffle_matrix_sf_a_row_indices, + shuffle_matrix_a, shuffle_matrix_sf_a) + +from ...quantization.utils.fp4_utils import float4_sf_dtype +from ..distributed import allgather, reducescatter +from ..expert_statistic import ExpertStatistic +from ..model_config import ModelConfig, MoeLoadBalancerConfig +from ..utils import (EventType, Fp4QuantizedTensor, disable_fp4_allgather, + reswizzle_sf, swizzle_sf, unswizzle_sf) +from .gated_mlp import GatedMLP +from .linear import TensorParallelMode, load_weight_shard +from .moe_load_balancer import MoeLoadBalancer + +# The declarations aligns with moe_kernels.h +# pack inputs into int64, e.g. 4 x bf16 input values +FUSED_MOE_NVFP4_INPUT_DTYPE = torch.int64 +# pack weights into int64, e.g. 16 x nvfp4 weight values +FUSED_MOE_NVFP4_WEIGHT_DTYPE = torch.int64 +# pack weight block scales into int32, e.g. 
4 x fp8 weight values +FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE = torch.int32 + + +# The type of method in top-K routing, for use in torch custom op +# Please keep this in sync with the counterpart defined in cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +class RoutingMethodType(IntEnum): + # Default: Softmax -> TopK + Default = 0, + # Renormalize: TopK -> Softmax + Renormalize = 1, + # DeepSeekV3: Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts from the Top4 groups + DeepSeekV3 = 2, + # Llama4: Top1 -> Sigmoid + Llama4 = 3, + # Qwen3: Softmax -> TopK -> Renormalize + Qwen3 = 4, + # Unspecified + Unspecified = 5. + + +class BaseMoeRoutingMethod(nn.Module): + + def apply(self, _router_logits) -> (torch.Tensor, torch.Tensor): + """ + Applies the routing method to the router logits. + Router logits are usually the output of the router Linear layer, but can be any type for more complex routing methods. + Returns (token_selected_experts: torch.Tensor, token_final_scales: torch.Tensor): + token_selected_experts: shape (num_tokens, experts_per_token). + It is a list of selected expert indices for each token + token_final_scales: shape (num_tokens, experts_per_token). May be None + It contains a final scaling/weighting factor applied to the output of each selected expert before summing the results + """ + raise NotImplementedError("Subclasses must implement this method") + + def get_experts_per_token(self): + return self.top_k + + @property + def experts_per_token(self): + return self.get_experts_per_token() + + @property + def routing_method_type(self): + return RoutingMethodType.Unspecified + + +class DefaultMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + topk_values, topk_indices = torch.topk(torch.nn.functional.softmax( + router_logits.float(), dim=-1), + k=self.top_k, + dim=-1) + return topk_indices.to(torch.int32), topk_values + + @property + def routing_method_type(self): + return RoutingMethodType.Default + + +class DeepSeekV3MoeRoutingMethod(BaseMoeRoutingMethod): + + # Intentionally leave apply() unimplemented. + # See comments in DeepseekV3Gate on why routing is done by DeepseekV3Gate. + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + @property + def routing_method_type(self): + return RoutingMethodType.DeepSeekV3 + + +class RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__( + self, + top_k: int, + ): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + topk_values, topk_indices = torch.topk(router_logits, + k=self.top_k, + dim=-1) + return topk_indices.to(torch.int32), torch.nn.functional.softmax( + topk_values.float(), dim=-1) + + @property + def routing_method_type(self): + return RoutingMethodType.Renormalize + + +class Llama4RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + topk_values, topk_indices = torch.topk(router_logits, + k=self.top_k, + dim=-1) + return topk_indices.to(torch.int32), torch.sigmoid(topk_values.float()) + + @property + def routing_method_type(self): + return RoutingMethodType.Llama4 + + +# TODO: re-enable this once the custom op is working. 
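As a side-by-side illustration of the two orderings implemented above (softmax-then-topk in DefaultMoeRoutingMethod versus topk-then-softmax in RenormalizeMoeRoutingMethod), here is a small, self-contained sketch using plain torch ops; it is a toy example and not part of the patch:

    import torch

    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])

    # Default routing: softmax over all experts first, then take the top-k.
    probs = torch.nn.functional.softmax(logits, dim=-1)
    default_scales, default_experts = torch.topk(probs, k=2, dim=-1)

    # Renormalize routing: take the top-k raw logits first, then softmax over
    # just the selected experts.
    topk_logits, renorm_experts = torch.topk(logits, k=2, dim=-1)
    renorm_scales = torch.nn.functional.softmax(topk_logits, dim=-1)

    # Both pick the same experts here, but the scales differ: default_scales
    # leaves probability mass on the unselected experts (sums to < 1), while
    # renorm_scales always sums to 1 across the selected experts.
    print(default_experts, default_scales.sum(dim=-1))
    print(renorm_experts, renorm_scales.sum(dim=-1))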
+# class Llama4RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): + +# def __init__(self, top_k: int, num_experts_total: int, ep_size: int, +# ep_rank: int): +# super().__init__() +# self.top_k = top_k +# self.num_experts_total = num_experts_total +# self.num_experts_per_node = self.num_experts_total // ep_size +# self.start_expert = self.num_experts_per_node * ep_rank +# self.end_expert = self.start_expert + self.num_experts_per_node + +# def apply(self, +# router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): +# unpermuted_scales, indices = torch.ops.trtllm.fused_topk_softmax( +# router_logits, self.top_k, self.num_experts_total, +# self.start_expert, self.end_expert) +# return indices, unpermuted_scales + + +# TODO Test this for Phi models +class SparseMixerMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int, eps: float): + super().__init__() + self.top_k = top_k + self.eps = eps + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + router_logits = router_logits.float() + topk_values = torch.empty(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.float32) + topk_indices = torch.empty(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.int32) + for i in range(self.top_k): + if i > 0: + max_elem = torch.argmax(router_logits, dim=-1) + # Mask out the previously selected indices to negative infinity + router_logits.scatter_(-1, max_elem.unsqueeze(-1), + -float('inf')) + # Get the max value of the remaining indices + max_values, max_indices = torch.max(router_logits, + dim=-1, + keepdim=True) + assert torch.all(max_values != -float('inf')) + + topk_indices[:, i] = max_indices.squeeze(-1) + + # Mask out any values that fail the condition '(max - value) / std::max(abs(value), max) > 2 * epsilon' + mask = ( + (max_values - router_logits) / + torch.max(torch.abs(router_logits), max_values)) > 2 * self.eps + masked_logits = torch.where(mask, -float('inf'), router_logits) + softmax_masked_logits = torch.nn.functional.softmax(masked_logits, + dim=-1) + selected_values = torch.gather(softmax_masked_logits, -1, + max_indices) + topk_values[:, i] = selected_values.squeeze(-1) + + return topk_indices.to(torch.int32), topk_values + + +class StaticMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, + routing_tensor: torch.Tensor, + routing_scales: Optional[torch.Tensor] = None): + super().__init__() + assert routing_tensor.dtype == torch.int32 + if routing_scales is not None: + assert routing_tensor.shape[0] == routing_scales.shape[0] + assert routing_tensor.shape[1] == routing_scales.shape[1] + assert routing_scales.dtype == torch.float32 + self.routing_tensor = routing_tensor + self.routing_scales = routing_scales + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + return self.routing_tensor, self.routing_scales + + def get_experts_per_token(self): + return self.routing_tensor.shape[1] + + +class LoadBalancedMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + balanced_values = torch.ones(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.float32) + balanced_indices = torch.empty(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.int32) + + # Fill the balanced_indices with each expert in round-robin fashion + final_size = 
router_logits.shape[0] * self.top_k + repeat_count = math.ceil(final_size / router_logits.shape[1]) + indices = torch.arange(router_logits.shape[1], + device=router_logits.device, + dtype=torch.int32) + indices = indices.repeat(repeat_count) + indices = indices[:final_size] + balanced_indices = indices.view(router_logits.shape[0], + self.top_k).contiguous() + + return balanced_indices, balanced_values + + +class Qwen3MoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + + routing_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float) + topk_values, topk_indices = torch.topk(routing_weights, + k=self.top_k, + dim=-1) + topk_values /= topk_values.sum(dim=-1, keepdim=True) + return topk_indices.to(torch.int32), topk_values + + @property + def routing_method_type(self) -> RoutingMethodType: + return RoutingMethodType.Qwen3 + + +class MoEWeightLoadingMode(Enum): + VANILLA = 0 + FUSED_GATE_UP_PROJ = 1 + + +class VanillaMoE(nn.ModuleList): + + def __init__( + self, + *, + routing_method: BaseMoeRoutingMethod, + num_experts: int, + hidden_size: int, + intermediate_size: int, + dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + model_config: ModelConfig = ModelConfig(), + aux_stream: Optional[torch.cuda.Stream] = None, + weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode. + VANILLA, + apply_router_weight_on_input: bool = False, + enable_alltoall: bool = False, + pack_weights: bool = False, + ): + from ..distributed import AllReduce + + super().__init__() + self.routing_method = routing_method + self.num_experts = num_experts + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.weight_loading_mode = weight_loading_mode + self.pack_weights = pack_weights + + self.dtype = dtype + self.reduce_results = reduce_results + self.model_config = model_config + # could be modified later + self.quant_config = model_config.quant_config + + self.cluster_rank = model_config.mapping.moe_cluster_rank + self.cluster_size = model_config.mapping.moe_cluster_size + self.smart_router = True if self.cluster_size > 1 else False + assert not self.smart_router, ( + "Smart router is not supported in vanilla MoE, " + "please set moe_cluster_size to 1.") + + self.rank = model_config.mapping.rank + + self.tp_rank = model_config.mapping.moe_tp_rank + self.tp_size = model_config.mapping.moe_tp_size + + self.ep_size = model_config.mapping.moe_ep_size + self.ep_rank = model_config.mapping.moe_ep_rank + self.moe_backend = model_config.moe_backend + self.use_dp = model_config.mapping.enable_attention_dp + + # All ranks participate in allreduce regardless of EP/TP combination + self.mapping = model_config.mapping + self.parallel_size = self.mapping.tp_size + + self.all_reduce = AllReduce(self.mapping) + + self.intermediate_size_per_partition = intermediate_size // self.tp_size + + self.expert_size_per_partition = num_experts // self.ep_size + self.expert_start = self.ep_rank * self.expert_size_per_partition + self.expert_end = min( + self.expert_start + self.expert_size_per_partition, + self.num_experts) + self.expert_size_per_partition = self.expert_end - self.expert_start + + max_num_tokens = model_config.max_num_tokens + # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled + if self.use_dp: + max_num_tokens *= model_config.mapping.world_size + 
self.moe_max_num_tokens = model_config.moe_max_num_tokens if model_config.moe_max_num_tokens is not None else max_num_tokens + + self.enable_alltoall = False + + self._weights_created = False + if not model_config.skip_create_weights_in_init: + self.create_weights() + + # If True, the router weight will be multiplied on the input rather than at the end of FC2 + self.apply_router_weight_on_input = apply_router_weight_on_input + + def create_experts(self, module_list: nn.ModuleList = None): + if module_list is None: + module_list = self + model_config = copy.copy(self.model_config) + model_config.mapping = Mapping( + world_size=self.mapping.moe_tp_size, + tp_size=self.mapping.moe_tp_size, + rank=self.mapping.moe_tp_rank, + ) + model_config.quant_config = self.quant_config + model_config.skip_create_weights_in_init = False + for expert_idx in range(self.num_experts): + if self.expert_start <= expert_idx < self.expert_end: + module_list[expert_idx] = GatedMLP( + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + bias=False, + dtype=self.dtype, + config=model_config, + reduce_output=False, + ) + else: + # use identity as placeholder for unused experts + module_list[expert_idx] = nn.Identity() + + def create_weights(self): + if self._weights_created: + return + self._weights_created = True + + if not self.pack_weights: + self.create_experts() + return + + self.has_any_quant = False + self.has_fp8_qdq = False + self.has_fp8_block_scales = False + self.has_nvfp4 = False + gate_up_proj_shape = ( + self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size, + ) + down_proj_shape = ( + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition, + ) + if self.quant_config and self.quant_config.layer_quant_mode.has_any_quant( + exclude_kv_cache=True): + self.has_any_quant = True + qc = self.quant_config + if qc.layer_quant_mode.has_fp8_qdq(): + self.has_fp8_qdq = True + + self.gate_up_proj_weight = nn.Parameter( + torch.empty( + gate_up_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + self.gate_up_proj_weight_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + self.down_proj_weight = nn.Parameter( + torch.empty( + down_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + self.down_proj_weight_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + elif qc.layer_quant_mode.has_fp8_block_scales(): + self.has_fp8_block_scales = True + + self.gate_up_proj_weight = nn.Parameter( + torch.empty( + gate_up_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + gate_up_proj_scale_shape = ( + self.expert_size_per_partition, + math.ceil(self.intermediate_size_per_partition * 2 / 128), + 
math.ceil(self.hidden_size / 128), + ) + self.gate_up_proj_weight_scale = nn.Parameter( + torch.empty( + gate_up_proj_scale_shape, + dtype=torch.float32, + ), + requires_grad=False, + ) + # Not really used for Gemm now. + # Only used to quantize output of FP8 attention. + self.gate_up_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + self.down_proj_weight = nn.Parameter( + torch.empty( + down_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + down_proj_scale_shape = ( + self.expert_size_per_partition, + math.ceil(self.hidden_size / 128), + math.ceil(self.intermediate_size_per_partition / 128), + ) + self.down_proj_weight_scale = nn.Parameter( + torch.empty( + down_proj_scale_shape, + dtype=torch.float32, + ), + requires_grad=False, + ) + # Not really used for Gemm now. + # Only used to quantize output of FP8 attention. + self.down_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + elif qc.layer_quant_mode.has_nvfp4(): + self.has_nvfp4 = True + self.scaling_vector_size = 16 + + assert self.hidden_size % self.scaling_vector_size == 0, f"hidden_size {self.hidden_size} must be divisible by scaling_vector_size {self.scaling_vector_size}" + + # Quantized weights + self.gate_up_proj_weight = nn.Parameter( + torch.empty( + [ + self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // 2, + ], + dtype=fp4_utils.float4_e2m1x2, + ), + requires_grad=False, + ) + + # FP8 per-block scaling factors. dtype must be aligned with SF_DTYPE + # Padding is required. See computeSFSize in quantization.h + nrows = fp4_utils.pad_up( + self.intermediate_size_per_partition * 2, 128) + ncols = fp4_utils.pad_up( + self.hidden_size // self.scaling_vector_size, 4) + self.gate_up_proj_weight_scale = nn.Parameter( + torch.empty( + [self.expert_size_per_partition, nrows * ncols], + dtype=fp4_utils.float4_sf_dtype, + ), + requires_grad=False, + ) + + # FP32 per-tensor global scaling factor = 448*6/amax_input + self.gate_up_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + # (amax_input*amax_weight) / (448*6*448*6) + self.gate_up_proj_alpha = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + assert self.intermediate_size_per_partition % self.scaling_vector_size == 0, f"intermediate_size_per_partition {self.intermediate_size_per_partition} must be divisible by scaling_vector_size {self.scaling_vector_size}" + + # Quantized weights + self.down_proj_weight = nn.Parameter( + torch.empty( + [ + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // 2, + ], + dtype=fp4_utils.float4_e2m1x2, + ), + requires_grad=False, + ) + + # FP8 per-block scaling factors. dtype must be aligned with SF_DTYPE + # Padding is required. 
See computeSFSize in quantization.h + nrows = fp4_utils.pad_up(self.hidden_size, 128) + ncols = fp4_utils.pad_up( + self.intermediate_size_per_partition // + self.scaling_vector_size, 4) + self.down_proj_weight_scale = nn.Parameter( + torch.empty( + [self.expert_size_per_partition, nrows * ncols], + dtype=fp4_utils.float4_sf_dtype, + ), + requires_grad=False, + ) + + # FP32 per-tensor global scaling factor = 448*6/amax_input + self.down_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + # (amax_input*amax_weight) / (448*6*448*6) + self.down_proj_alpha = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + else: + raise ValueError(f'unsupported quant mode: {qc.quant_mode}') + else: + self.gate_up_proj_weight = nn.Parameter( + torch.empty(gate_up_proj_shape, dtype=self.dtype), + requires_grad=False, + ) + self.down_proj_weight = nn.Parameter( + torch.empty(down_proj_shape, dtype=self.dtype), + requires_grad=False, + ) + + def pack_params(self, experts, module_name: str, weight_name: str): + weights = [] + for expert_idx in range(self.expert_start, self.expert_end): + weights.append( + getattr(getattr(experts[expert_idx], module_name), weight_name)) + packed_weight = torch._utils._flatten_dense_tensors(weights) + weights_data = torch._utils._unflatten_dense_tensors( + packed_weight, weights) + for weight, data in zip(weights, weights_data): + weight.data = data + packed_weight = packed_weight.view(len(weights), *weights_data[0].shape) + getattr(self, f"{module_name}_{weight_name}").data = packed_weight + + def load_weights(self, weights: List[Dict]): + from ..models.modeling_utils import filter_weights + + assert self._weights_created + assert len(weights) == 1 + weights = weights[0] + + if self.pack_weights: + experts = nn.ModuleList([None] * self.num_experts) + self.create_experts(experts) + experts.to("cuda") + else: + experts = self + + for expert_idx in range(self.expert_start, self.expert_end): + experts[expert_idx].gate_up_proj.load_weights([ + filter_weights(f"{expert_idx}.w1", weights), + filter_weights(f"{expert_idx}.w3", weights), + ]) + experts[expert_idx].down_proj.load_weights([ + filter_weights(f"{expert_idx}.w2", weights), + ]) + + if self.pack_weights: + for module_name in ["gate_up_proj", "down_proj"]: + for weight_name, _ in getattr(experts[self.expert_start], + module_name).named_parameters(): + self.pack_params(experts, module_name, weight_name) + + def reducescatter_or_allreduce( + self, + inputs, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ): + outputs = inputs + if self.parallel_size > 1 and not self.enable_alltoall: + if self.use_dp: + outputs = reducescatter( + inputs, + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + elif self.reduce_results: + outputs = self.all_reduce(inputs) + return outputs + + def run_experts( + self, + input: torch.Tensor, + expanded_inputs: torch.Tensor, + expanded_scales: torch.Tensor, + sorted_experts: torch.Tensor, + batch_indices: torch.Tensor, + ) -> torch.Tensor: + final_hidden_states = torch.zeros( + input.shape, + dtype=input.dtype, + device=input.device, + ) + for expert_idx in range(self.expert_start, self.expert_end): + expert_mask = 
sorted_experts == expert_idx + if not torch.any(expert_mask): + continue + expanded_input = expanded_inputs[expert_mask] + batch_idx = batch_indices[expert_mask] + expanded_scale = expanded_scales[expert_mask] + + output = self[expert_idx](expanded_input) + final_hidden_states[batch_idx] += output * expanded_scale + return final_hidden_states + + def forward( + self, + x: torch.Tensor, + router_logits: torch.Tensor, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + **kwargs, + ) -> torch.Tensor: + assert x.shape[-1] == self.hidden_size + x = x.view(-1, self.hidden_size) + + token_selected_experts, token_final_scales = self.routing_method.apply( + router_logits) + + if self.use_dp and self.parallel_size > 1: + x, token_selected_experts, token_final_scales = allgather( + [x, token_selected_experts, token_final_scales], + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + + expert_masks = ((token_selected_experts >= self.expert_start) + & (token_selected_experts < self.expert_end)) + local_selected_experts = token_selected_experts[expert_masks] + sort_indices = torch.argsort(local_selected_experts) + sorted_experts = local_selected_experts[sort_indices] + + batch_indices, nth_experts = torch.where(expert_masks) + batch_indices = batch_indices[sort_indices] + nth_experts = nth_experts[sort_indices] + expanded_inputs = x[batch_indices] + expanded_scales = token_final_scales[batch_indices, nth_experts, None] + + final_hidden_states = self.run_experts( + x, + expanded_inputs, + expanded_scales, + sorted_experts, + batch_indices, + ) + + final_hidden_states = self.reducescatter_or_allreduce( + final_hidden_states, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=use_dp_padding, + ) + return final_hidden_states + + +class FusedMoE(nn.Module): + """ + Fused Mixture of Experts (MoE) Layer with performance tuning. + + Args: + num_experts (int): Number of experts in the MoE layer. + top_k (int): Number of top experts to select for each input token. + hidden_size (int): Size of the hidden state. + intermediate_size (int): Size of the intermediate state. + aux_stream (Optional[torch.cuda.Stream]): Auxiliary CUDA stream to overlap chunks. + dtype (Optional[torch.dtype]): Data type for the weights. + reduce_results (bool): Whether to reduce the results across devices. + model_config (ModelConfig): Configuration object for the model. + enable_alltoall (bool): whether to enable alltoall instead of allgather/reducescatter + + MoE torch custom op: + cutlass Backend + In min-latency mode: + Quant: + fp8 block scales (SM90 Hopper only): + FusedMoE Op: dynamic quant + gemm1 + swiglu + gemm2 (return tensor list). + fp8 qdq, nvfp4: + FusedMoE Op: gemm1 + swiglu + gemm2 (return tensor list). + + In max-throughput mode: + Quant: + fp8 block scales (SM90 Hopper only): + FusedMoE Op: dynamic quant + scatter + gemm1 + swiglu + gemm2 + finalizeMoeRoute (return one tensor) + p8 qdq, nvfp4: + FusedMoE Op: scatter + gemm1 + swiglu + gemm2 + finalizeMoeRoute (return one tensor) + + trtllm_gen backend: + Only support min-latency mode now (SM100 Blackwell only). + Quant: fp8 block scales quant and nvfp4 quant + FusedMoE Op: routing(topK, etc.) + scatter + gemm1 + swiglu + gemm2 + finalize MoeRoute + + FusedMoE module: + cutlass Backend (moe_backend="CUTLASS"): + min-latency mode: + routing(topK, etc.) + FusedMoE Op + equals to: routing(topK, etc.) 
[+ dynamic quant fp8 qdq | optional dynamic quant nvfp4] + gemm1 + swiglu + gemm2 + + max-throughput mode: + routing(topK, etc.) [+ dynamic quant for fp8 qdq and nvfp4 ] [+ fp4_allgather] + FusedMoe Op[no allreduce] + reducescatter, with AttentionDP on + equals to: dynamic quant + routing(topK, etc.) [+ fp4_allgather] + scatter + gemm1 + swiglu + gemm2 + finalizeMoeRoute [no allreduce] + reducescatter + + trtllm_gen backend (moe_backend="TRTLLM"): + min-latency mode (cutlass_min_latency_mode flag of forward has no effect when trtllm_gen is used): + dynamic quant + FusedMoe Op + equals to: dynamic quant + routing(topK, etc.) + scatter + gemm1 + swiglu + gemm2 + finalize MoeRoute + + In min-latency mode, setting `reduce_results=False` disables the AllReduce in the FusedMoE module, so any necessary AllReduce operations must be added explicitly in the model definition. + AttentionDP should be turned off for min-latency mode. + + When we have redundant experts, we have more weight slots than `num_experts`; in that case, we separate the concepts of expert and slot. + Expert is the concept from the model's perspective, while slot is the concept from the model engine's perspective. + There should be at least `num_experts` slots in the model engine. More than that is OK; in that case, some experts may have multiple replicas. + """ + + def __init__( + self, + *, + routing_method: BaseMoeRoutingMethod, + num_experts: int, + hidden_size: int, + intermediate_size: int, + dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + model_config: ModelConfig = ModelConfig(), + aux_stream: Optional[torch.cuda.Stream] = None, + weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode. + VANILLA, + apply_router_weight_on_input: bool = False, + enable_alltoall: bool = False, + moe_load_balancer: Optional[MoeLoadBalancer] = None, + layer_idx: Optional[int] = None, + ): + from ..distributed import AllReduce + + super().__init__() + self.routing_method = routing_method + self.num_experts = num_experts + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.weight_loading_mode = weight_loading_mode + + self.dtype = dtype + self.reduce_results = reduce_results + # could be modified later + self.quant_config = model_config.quant_config + + self.cluster_rank = model_config.mapping.moe_cluster_rank + self.cluster_size = model_config.mapping.moe_cluster_size + self.smart_router = True if self.cluster_size > 1 else False + + self.rank = model_config.mapping.rank + + self.tp_rank = model_config.mapping.moe_tp_rank + self.tp_size = model_config.mapping.moe_tp_size + + self.ep_size = model_config.mapping.moe_ep_size + self.ep_rank = model_config.mapping.moe_ep_rank + self.moe_backend = model_config.moe_backend + self.use_dp = model_config.mapping.enable_attention_dp + + # All ranks participate in allreduce regardless of EP/TP combination + self.mapping = model_config.mapping + self.parallel_size = self.mapping.tp_size + + self.all_reduce = AllReduce(model_config=model_config) + + self.intermediate_size_per_partition = intermediate_size // self.tp_size + + self.layer_idx = layer_idx + moe_load_balancer_config = model_config.moe_load_balancer + if moe_load_balancer_config is None: + assert moe_load_balancer is None + # A dummy MoeLoadBalancerConfig to generate default initial_global_assignments + moe_load_balancer_config = MoeLoadBalancerConfig() + moe_load_balancer_config.setup(num_experts=num_experts, + ep_rank=self.ep_rank, + ep_size=self.ep_size) + else: + assert moe_load_balancer
is not None + + self.num_slots = moe_load_balancer_config.num_slots + if self.smart_router: + assert self.num_slots == self.num_experts, "Smart router should not have redundant slots" + + self.initial_global_assignments = moe_load_balancer_config.get_layer_initial_global_assignments( + layer_idx) + self.expert_size_per_partition = moe_load_balancer_config.num_local_slots + self.slot_start = moe_load_balancer_config.slot_start + self.slot_end = moe_load_balancer_config.slot_end + self.initial_local_expert_ids = self.initial_global_assignments[ + self.slot_start:self.slot_end] + assert len( + self.initial_local_expert_ids) == self.expert_size_per_partition + + self.balancer_layer = None + if moe_load_balancer is not None: + self.balancer_layer = moe_load_balancer.add_layer( + expert_count=num_experts, + top_k=routing_method.experts_per_token, + slot_count_per_rank=self.expert_size_per_partition, + ) + self.balancer_layer.set_initial_weight_assignments( + self.initial_global_assignments) + logger.info( + f"MoE load balancer enabled. num_experts = {num_experts}, num_slots = {self.num_slots}, ep_size = {self.ep_size}" + ) + logger.info( + f"initial_global_assignments (layer {layer_idx}) = {self.initial_global_assignments}" + ) + + max_num_tokens = model_config.max_num_tokens + # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled + if self.use_dp: + max_num_tokens *= model_config.mapping.world_size + self.moe_max_num_tokens = model_config.moe_max_num_tokens if model_config.moe_max_num_tokens is not None else max_num_tokens + # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied + if self.moe_max_num_tokens < max_num_tokens: + self.aux_stream = aux_stream if aux_stream is not None else torch.cuda.Stream( + ) + self.event_dict = { + key: torch.cuda.Event() + for key in [EventType.Main, EventType.MoeChunkingOverlap] + } + else: + self.aux_stream = None + self.event_dict = None + + # The profiler converges on the same best tactic when the number of tokens is large enough. + # To avoid long profiling time, the max number of tokens used in the profiling is capped to + # around 16k tokens per expert, which is well into the compute bound domain. 
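+ # Illustration with hypothetical sizes: for 256 slots and top-8 routing, the cap below is 16384 * 256 // 8 = 524288, so tune_max_num_tokens = min(moe_max_num_tokens, 524288).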
+ self.tune_max_num_tokens = min( + self.moe_max_num_tokens, + 16384 * self.num_slots // routing_method.get_experts_per_token(), + ) + self.has_been_profiled = False + self.has_been_profiled_min_latency = False + + self.enable_alltoall = enable_alltoall + self.use_postquant_alltoall = False + if self.enable_alltoall: + assert self.use_dp and self.parallel_size > 1,\ + "alltoall should only be enabled with attention dp and parallel_size > 1" + qm = self.quant_config.quant_mode + self.use_postquant_alltoall = (os.environ.get( + "TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") + == "1") and qm.has_nvfp4() + self.alltoall_workspace = MnnvlMoe.get_moe_workspaces( + model_config.mapping) if enable_alltoall else None + + self._weights_created = False + if not model_config.skip_create_weights_in_init: + self.create_weights() + + # If True, the router weight will be multiplied on the input rather than at the end of FC2 + self.apply_router_weight_on_input = apply_router_weight_on_input + self._check_configs() + + @property + def has_any_quant(self): + return self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True) + + def _check_configs(self): + if self.enable_alltoall: + assert self.use_dp and self.parallel_size > 1,\ + "alltoall should only be enabled with attention dp and parallel_size > 1" + + if self.is_trtllm(): + # trtllm_gen backend only supports min-latency mode now + assert not self.apply_router_weight_on_input, "TRTLLM backend does not support applying router weight on input yet." + assert not self.reduce_results + assert self.quant_config and ( + self.quant_config.quant_mode.has_nvfp4() + | self.quant_config.quant_mode.has_fp8_block_scales() + ), "The TRTLLM backend of FusedMoE only supports fp8_block_scaling and nvfp4 dtypes." + else: + if self.apply_router_weight_on_input: + assert self.routing_method.top_k == 1, "Current workaround only supports top-1 routing" + if self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True): + if not (self.quant_config.quant_mode.has_nvfp4() + | self.quant_config.quant_mode.has_fp8_block_scales() + | self.quant_config.quant_mode.has_fp8_qdq() + | self.quant_config.quant_mode.
+ is_int4_weight_only_per_group()): + raise ValueError( + f"unsupported quantization mode: {self.quant_config.quant_mode}" + ) + + def setup_quant_scales(self): + self.quant_scales = None + if not self.has_any_quant: + return + if self.has_fp8_qdq: + self.quant_scales = FusedMoEQuantScalesFP8( + fc1_dequant=self.fc31_dequant, + fc2_quant=self.fc2_quant, + fc2_dequant=self.fc2_dequant, + fc1_input_dequant=self.fc31_input_dequant, + ) + elif self.has_fp8_block_scales: + self.quant_scales = FusedMoEQuantScalesFP8BlockScales( + fc_weight_scales=self.w3_w1_weight_scaling_factor, + proj_weight_scales=self.w2_weight_scaling_factor, + ) + elif self.has_nvfp4: + self.quant_scales = FusedMoEQuantScalesNVFP4( + fc1_act_global=self.fc31_input_scale, + fc1_weight_block=self.w3_w1_weight_scale, + fc1_global=self.fc31_alpha, + fc2_act_global=self.fc2_input_scale, + fc2_weight_block=self.w2_weight_scale, + fc2_global=self.fc2_alpha, + ) + elif self.has_w4afp8: + self.quant_scales = FusedMoEQuantScalesW4A8( + scale_1_interleaved=self.fc31_weight_scale, + scale_2_interleaved=self.fc2_weight_scale, + pre_quant_scale_1=self.fc31_act_scale, + pre_quant_scale_2=self.fc2_act_scale, + zero_1=torch.Tensor(), + zero_2=torch.Tensor(), + alpha_1=self.fc31_alpha, + alpha_2=self.fc2_alpha, + ) + + def is_trtllm(self): + return self.moe_backend == "TRTLLM" and self.has_any_quant + + def is_cutlass(self): + return not self.is_trtllm() + + def get_quant_scales(self, slot_start, slot_end): + assert self.smart_router + + if self.has_fp8_block_scales: + return FusedMoEQuantScalesFP8BlockScales( + fc_weight_scales=self.w3_w1_weight_scaling_factor.narrow( + 0, slot_start, slot_end - slot_start), + proj_weight_scales=self.w2_weight_scaling_factor.narrow( + 0, slot_start, slot_end - slot_start), + ) + elif self.has_nvfp4: + return FusedMoEQuantScalesNVFP4( + fc1_act_global=self.fc31_input_scale, + fc1_weight_block=self.w3_w1_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + fc1_global=self.fc31_alpha.narrow(0, slot_start, + slot_end - slot_start), + fc2_act_global=self.fc2_input_scale, + fc2_weight_block=self.w2_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + fc2_global=self.fc2_alpha.narrow(0, slot_start, + slot_end - slot_start), + ) + elif self.has_w4afp8: + return FusedMoEQuantScalesW4A8( + scale_1_interleaved=self.fc31_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + scale_2_interleaved=self.fc2_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + pre_quant_scale_1=self.fc31_act_scale.narrow( + 0, slot_start, slot_end - slot_start), + pre_quant_scale_2=self.fc2_act_scale.narrow( + 0, slot_start, slot_end - slot_start), + zero_1=torch.Tensor(), + zero_2=torch.Tensor(), + alpha_1=self.fc31_alpha.narrow(0, slot_start, + slot_end - slot_start), + alpha_2=self.fc2_alpha.narrow(0, slot_start, + slot_end - slot_start), + ) + else: + return self.quant_scales + + def create_weights(self): + if self._weights_created: + return + weight_dtype = self.dtype + w3_w1_weight_shape = (self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size) + w2_weight_shape = ( + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition, + ) + + self.quant_scales = [] + self.has_fp8_qdq = False + self.has_fp8_block_scales = False + self.has_nvfp4 = False + self.has_w4afp8 = False + if self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True): + qc = self.quant_config + if 
qc.quant_mode.has_fp8_qdq(): + self.has_fp8_qdq = True + weight_dtype = torch.float8_e4m3fn + + fc31_dequant = nn.Parameter(torch.empty( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_dequant", fc31_dequant) + + fc2_dequant = nn.Parameter(torch.empty( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_dequant", fc2_dequant) + + fc2_quant = nn.Parameter(torch.tensor(1., dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_quant", fc2_quant) + + fc31_input_dequant = nn.Parameter(torch.tensor( + 1., dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_input_dequant", + fc31_input_dequant) + elif qc.quant_mode.has_fp8_block_scales(): + self.has_fp8_block_scales = True + weight_dtype = torch.float8_e4m3fn + cell_div = lambda x, y: (x + y - 1) // y + w3_w1_weight_scaling_factor = nn.Parameter(torch.empty( + (self.expert_size_per_partition, + cell_div(self.intermediate_size_per_partition, 128) * 2, + cell_div(w3_w1_weight_shape[2], 128)), + dtype=torch.float32), + requires_grad=False) + self.register_parameter("w3_w1_weight_scaling_factor", + w3_w1_weight_scaling_factor) + + w2_weight_scaling_factor = nn.Parameter(torch.empty( + (self.expert_size_per_partition, + cell_div(w2_weight_shape[1], + 128), cell_div(w2_weight_shape[2], 128)), + dtype=torch.float32), + requires_grad=False) + self.register_parameter("w2_weight_scaling_factor", + w2_weight_scaling_factor) + elif qc.quant_mode.is_int4_weight_only_per_group(): + self.has_w4afp8 = True + self.sm_version = get_sm_version() + if self.sm_version == 89: + self.interleave = [1, 1] + elif self.sm_version == 90: + self.interleave = [] + for k_shape in [ + self.hidden_size, + self.intermediate_size_per_partition + ]: + if k_shape % 512 == 0: + self.interleave.append(4) + elif k_shape % 256 == 0: + self.interleave.append(2) + elif k_shape % 128 == 0: + self.interleave.append(1) + else: + raise NotImplementedError( + f"K shape is required to be multiple of 128, received {k_shape}." 
+ ) + else: + raise NotImplementedError( + f"W4AFP8 MoE is unsupported on SM{self.sm_version}.") + weight_dtype = torch.int8 + w3_w1_weight_shape = (self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // 2) + w2_weight_shape = (self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // 2) + + fc31_act_scale = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc31_act_scale", fc31_act_scale) + + fc2_act_scale = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc2_act_scale", fc2_act_scale) + + # col parallel + fc31_weight_scale = nn.Parameter( + torch.empty(self.expert_size_per_partition, + self.hidden_size // (128 * self.interleave[0]), + self.intermediate_size_per_partition * 2 * + self.interleave[0], + dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc31_weight_scale", fc31_weight_scale) + + # row parallel + fc2_weight_scale = nn.Parameter( + torch.empty(self.expert_size_per_partition, + self.intermediate_size_per_partition // + (128 * self.interleave[1]), + self.hidden_size * self.interleave[1], + dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc2_weight_scale", fc2_weight_scale) + + fc31_alpha = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_alpha", fc31_alpha) + + fc2_alpha = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_alpha", fc2_alpha) + elif qc.quant_mode.has_nvfp4(): + self.has_nvfp4 = True + if self.is_trtllm(): + weight_dtype = float4_sf_dtype + weight_vec_size = torch.iinfo(weight_dtype).bits // 4 + block_scales_dtype = torch.float8_e4m3fn + block_scales_vec_size = 1 + else: + weight_dtype = FUSED_MOE_NVFP4_WEIGHT_DTYPE + weight_vec_size = torch.iinfo(weight_dtype).bits // 4 + block_scales_dtype = FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE + block_scales_vec_size = torch.iinfo( + block_scales_dtype).bits // 8 + + self.scaling_vector_size = 16 + # Divide by 16 because we use int64 to pack 16 fp4 values + w3_w1_weight_shape = (self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // weight_vec_size) + w2_weight_shape = (self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // + weight_vec_size) + + # Divide by 4 because we use int32 to pack 4 fp8 values + # column parallel + w3_w1_weight_scale = nn.Parameter( + torch.ones(self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // self.scaling_vector_size // + block_scales_vec_size, + dtype=block_scales_dtype), + requires_grad=False) + self.register_parameter("w3_w1_weight_scale", + w3_w1_weight_scale) + + # row parallel + w2_weight_scale = nn.Parameter(torch.ones( + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // + self.scaling_vector_size // block_scales_vec_size, + dtype=block_scales_dtype), + requires_grad=False) + self.register_parameter("w2_weight_scale", w2_weight_scale) + + fc31_input_scale = nn.Parameter(torch.tensor( + 1., dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_input_scale", fc31_input_scale) + + fc2_input_scale = nn.Parameter(torch.tensor( + 1., 
dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_input_scale", fc2_input_scale) + + fc31_alpha = nn.Parameter(torch.ones( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_alpha", fc31_alpha) + + fc2_alpha = nn.Parameter(torch.ones( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_alpha", fc2_alpha) + + if self.is_trtllm(): + fc31_scale_c = nn.Parameter(torch.ones( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_scale_c", fc31_scale_c) + + else: + # TODO: support other quant mode + raise ValueError( + f"unsupported quantization mode: {qc.quant_mode}") + self.setup_quant_scales() + + # Fused gate_up_proj (column parallel) + w3_w1_weight = nn.Parameter(torch.empty(w3_w1_weight_shape, + dtype=weight_dtype), + requires_grad=False) + self.register_parameter("w3_w1_weight", w3_w1_weight) + + # down_proj (row parallel) + w2_weight = nn.Parameter(torch.empty(w2_weight_shape, + dtype=weight_dtype), + requires_grad=False) + self.register_parameter("w2_weight", w2_weight) + self._weights_created = True + + def reducescatter_or_allreduce( + self, + inputs, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ): + outputs = inputs + if self.parallel_size > 1 and not self.enable_alltoall: + if self.use_dp: + outputs = reducescatter( + inputs, + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + elif self.reduce_results: + outputs = self.all_reduce(inputs) + return outputs + + def forward_chunk( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + cutlass_min_latency_mode: bool = False, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ) -> torch.Tensor: + if isinstance(x, Fp4QuantizedTensor): + assert output_dtype is not None + output_dtype = output_dtype + else: + output_dtype = x.dtype + + use_fp8_block_scaling = False + use_w4a8_group_scaling = False + weight_dtype = self.w3_w1_weight.dtype + + token_selected_experts, token_final_scales = self.routing_method.apply( + router_logits) + if self.balancer_layer is None: + token_selected_slots = token_selected_experts + else: + # If attention DP is enabled, token_selected_experts is a local rank tensor, + # so we need to offset the round robin position by ep_rank + token_selected_slots = self.balancer_layer.route( + token_selected_experts, offset_by_ep_rank=self.use_dp) + + # If load balancer is disabled, the statistics are collected from expert IDs. + # If load balancer is enabled, the statistics are collected from expert slot IDs. 
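+ # token_selected_slots and token_final_scales are both [num_tokens, experts_per_token]; slot IDs coincide with expert IDs unless the load balancer remaps them (hypothetically, two replicas of expert 7 would occupy two distinct slots).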
+ ExpertStatistic.set_layer(self.layer_idx) + ExpertStatistic.maybe_add_info(self.num_slots, token_selected_slots) + + assert token_selected_slots.shape[ + 1] == self.routing_method.experts_per_token + assert token_selected_slots.shape == token_final_scales.shape + assert token_selected_slots.shape[0] == router_logits.shape[0] + assert token_final_scales.dtype == torch.float32 + assert token_selected_slots.dtype == torch.int32 + + if self.apply_router_weight_on_input: + assert self.routing_method.top_k == 1, "Current workaround only supports top-1 routing" + assert x.dtype != torch.float8_e4m3fn, "Current workaround for apply_router_weight_on_input does not support fp8 input" + x = x * token_final_scales.to(x.dtype) + # TODO: remove this once we have correct fusedmoe kernel ready + token_final_scales = None + + token_count = x.shape[0] + + alltoall_info = None + + if self.enable_alltoall: + x, token_selected_slots, token_final_scales, alltoall_info = \ + self.alltoall_prepare_maybe_dispatch(all_rank_num_tokens, + x, + token_selected_slots, + token_final_scales) + + x_sf = None + if self.has_any_quant: + if self.has_fp8_qdq: + x, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor( + x, self.fc31_input_dequant) + elif self.has_nvfp4: + if not disable_fp4_allgather() or self.use_postquant_alltoall: + if isinstance(x, Fp4QuantizedTensor): + x, x_sf = x.fp4_tensor, x.scaling_factor + x_row = x.shape[0] + # note: we use uint8 to store 2 fp4 values + x_col = x.shape[1] * 2 + else: + x_row = x.shape[0] + x_col = x.shape[1] + x, x_sf = torch.ops.trtllm.fp4_quantize( + x, self.fc31_input_scale, self.scaling_vector_size, + False) + + elif self.has_fp8_block_scales: + use_fp8_block_scaling = True + elif self.has_w4afp8: + use_w4a8_group_scaling = True + weight_dtype = torch.quint4x2 + else: + raise ValueError( + f"unsupported quantization mode: {self.quant_config.quant_mode}" + ) + + if self.use_dp and self.parallel_size > 1 and not disable_fp4_allgather( + ) and not self.enable_alltoall: + x, x_sf, token_selected_slots, token_final_scales = allgather( + [x, x_sf, token_selected_slots, token_final_scales], + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + # Fp4 gemm has extra scaling factor + if x_sf is not None: + x_sf = reswizzle_sf(x_sf, x_row, x_col, + self.scaling_vector_size) + + if self.smart_router and not cutlass_min_latency_mode: + ep_size = self.cluster_size + ep_rank = self.cluster_rank + expert_start = ep_rank * self.num_experts // ep_size + expert_end = min(self.num_experts, + (ep_rank + 1) * self.num_experts // ep_size) + w3_w1_weight = self.w3_w1_weight.narrow(0, expert_start, + expert_end - expert_start) + w2_weight = self.w2_weight.narrow(0, expert_start, + expert_end - expert_start) + cluster_size = self.ep_size + cluster_rank = self.ep_rank + quant_scales = self.get_quant_scales(expert_start, expert_end) + else: + ep_size = self.ep_size + ep_rank = self.ep_rank + w3_w1_weight = self.w3_w1_weight + w2_weight = self.w2_weight + cluster_size = self.cluster_size + cluster_rank = self.cluster_rank + quant_scales = self.quant_scales + + if self.use_postquant_alltoall: + x, x_sf = self.alltoall_postquant_dispatch(x, x_sf, x_row, x_col, + alltoall_info) + + final_hidden_states = torch.ops.trtllm.fused_moe( + x, + token_selected_slots, + token_final_scales, + w3_w1_weight.view(weight_dtype), + w2_weight.view(weight_dtype), + output_dtype, + quant_scales=quant_scales, + input_sf=x_sf, + tp_size=self.tp_size, + tp_rank=self.tp_rank, + ep_size=ep_size, 
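+ # Note: under smart routing, the ep_* and cluster_* values were swapped above, so experts are sliced by cluster rank and the original EP group acts as the cluster here.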
+ ep_rank=ep_rank, + cluster_size=cluster_size, + cluster_rank=cluster_rank, + use_fp8_block_scaling=use_fp8_block_scaling, + use_w4a8_group_scaling=use_w4a8_group_scaling, + min_latency_mode=cutlass_min_latency_mode, + tune_max_num_tokens=self.tune_max_num_tokens, + ) + + if cutlass_min_latency_mode: + assert not self.reduce_results + return final_hidden_states + else: + # Custom op requires all inputs are in the same type. + # Only in cutlass_min_latency_mode, the output is a list of tensors. + # Otherwise, the output should be unpacked as a single tensor. + final_hidden_states = final_hidden_states[0] + + if not self.enable_alltoall: + return final_hidden_states + else: + return self.alltoall_combine(final_hidden_states, alltoall_info, + token_count) + + def forward( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + cutlass_min_latency_mode: bool = False, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ) -> torch.Tensor: + """ + cutlass_min_latency_mode has no effect when trtllm_gen backend is enabled. + """ + if self.is_cutlass(): + return self.forward_cutlass(x, router_logits, + cutlass_min_latency_mode, output_dtype, + all_rank_num_tokens, use_dp_padding) + elif self.is_trtllm(): + return self.forward_trtllmgen(x, router_logits) + else: + raise NotImplementedError( + f"FusedMoE only supports CUTLASS or TRTLLM backends, not {self.moe_backend}" + ) + + def forward_cutlass( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + cutlass_min_latency_mode: bool = False, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ) -> torch.Tensor: + assert self.is_cutlass() + + if self.use_dp: + assert all_rank_num_tokens is not None + assert use_dp_padding is not None + num_rows = sum(all_rank_num_tokens) + else: + num_rows = x.shape[0] + + # in case of num_rows is larger than max_chunk_size, we need to split the input into multiple chunks + num_chunks = (num_rows + self.moe_max_num_tokens - + 1) // self.moe_max_num_tokens + + if cutlass_min_latency_mode: + assert num_chunks == 1 and ( + not self.reduce_results + ), "cutlass_min_latency_mode must be used with a single chunk and reduce_results must be False" + + if use_dp_padding: + all_rank_num_tokens_padded = [max(all_rank_num_tokens) + ] * len(all_rank_num_tokens) + else: + all_rank_num_tokens_padded = all_rank_num_tokens + if num_chunks == 1: + outputs = self.forward_chunk( + x, + router_logits, + cutlass_min_latency_mode, + output_dtype, + all_rank_num_tokens=all_rank_num_tokens_padded, + use_dp_padding=use_dp_padding) + outputs = self.reducescatter_or_allreduce( + outputs, + all_rank_num_tokens=all_rank_num_tokens_padded, + use_dp_padding=use_dp_padding) + else: + + def split_chunk(split_token_num: int, split_num_chunks: int): + val_div = split_token_num // split_num_chunks + val_mod = split_token_num % split_num_chunks + split_chunk_size_list = [val_div + 1] * val_mod + [val_div] * ( + split_num_chunks - val_mod) + return split_chunk_size_list + + if self.use_dp: + all_rank_chunk_size_list = [ + split_chunk(val, num_chunks) + for val in all_rank_num_tokens_padded + ] + all_rank_num_tokens_list = [[ + val[idx_chunk] for val in all_rank_chunk_size_list + ] for idx_chunk in range(num_chunks)] + chunk_size_list = all_rank_chunk_size_list[self.rank] + if self.enable_alltoall: + 
all_rank_num_tokens_list = [[ + 1 if val == 0 else val for val in val_list + ] for val_list in all_rank_num_tokens_list] + else: + all_rank_num_tokens_list = [None] * num_chunks + chunk_size_list = split_chunk(x.shape[0], num_chunks) + + x_list = x.split(chunk_size_list) + router_logits_list = router_logits.split(chunk_size_list) + + if not self.enable_alltoall: + self.event_dict[EventType.Main].record() + with torch.cuda.stream(self.aux_stream): + self.event_dict[EventType.Main].wait() + + outputs_list = [] + # Postpone reduce-scatter/all-reduce to the next iteration to achieve better overlap + for idx_chunk, (x, router_logits) in enumerate( + zip(x_list, router_logits_list)): + if not self.enable_alltoall: + if idx_chunk % 2 == 0: + with torch.cuda.stream(self.aux_stream): + outputs = self.forward_chunk( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk] if self.use_dp else None, + use_dp_padding=use_dp_padding) + if idx_chunk > 0: + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk - 1], + use_dp_padding=use_dp_padding) + else: + outputs = self.forward_chunk( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk] if self.use_dp else None, + use_dp_padding=use_dp_padding) + with torch.cuda.stream(self.aux_stream): + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk - 1], + use_dp_padding=use_dp_padding) + else: + outputs = self.forward_chunk( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens_list[idx_chunk] + if self.use_dp else None) + + outputs_list.append(outputs) + if not self.enable_alltoall: + if num_chunks % 2 == 0: + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[-1], + use_dp_padding=use_dp_padding) + else: + with torch.cuda.stream(self.aux_stream): + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[-1], + use_dp_padding=use_dp_padding) + with torch.cuda.stream(self.aux_stream): + self.event_dict[EventType.MoeChunkingOverlap].record() + self.event_dict[EventType.MoeChunkingOverlap].wait() + outputs = torch.cat(outputs_list) + if self.use_dp: + rank = self.mapping.tp_rank + outputs = outputs[:all_rank_num_tokens[rank]] + return outputs + + def forward_trtllmgen(self, x: torch.Tensor, + router_logits: torch.Tensor) -> torch.Tensor: + assert self.is_trtllm() + assert x.dtype == torch.bfloat16 + + # DeepSeekV3 style routing + if isinstance(self.routing_method, DeepSeekV3MoeRoutingMethod): + top_k = self.routing_method.routing_impl.top_k + routing_bias = self.routing_method.e_score_correction_bias + n_group = self.routing_method.routing_impl.n_group + topk_group = self.routing_method.routing_impl.topk_group + routed_scaling_factor = self.routing_method.routing_impl.routed_scaling_factor + else: + top_k = self.routing_method.top_k + routing_bias = None + n_group = None + topk_group = None + routed_scaling_factor = None + + # TODO: since routing kernel is integrated into moe_runner for fp8, + # here we just route the I/Os for moe_runner + if self.quant_config and self.quant_config.quant_mode.has_fp8_block_scales( + ): + x_val, x_scale = torch.ops.trtllm.fp8_quantize_1x128(x) + + final_hidden_states = torch.ops.trtllm.fp8_block_scale_moe_runner( + router_logits, + routing_bias, + x_val, + x_scale, + 
self.w3_w1_weight, + self.w3_w1_weight_scaling_factor, + self.w2_weight, + self.w2_weight_scaling_factor, + self.num_slots, + top_k, + n_group, + topk_group, + self.intermediate_size_per_partition, + self. + slot_start, # local_expert_start; use ep_rank if stride!=1 + self.expert_size_per_partition, # local_expert_size + routed_scaling_factor, + self.routing_method.routing_method_type, + ) + elif self.quant_config and self.quant_config.quant_mode.has_nvfp4(): + scale_factor_use_ue8m0 = False + is_scale_factor_swizzled = False # use linear layout here + hidden_states_fp4, hidden_states_scale_linear_fp4 = torch.ops.trtllm.fp4_quantize( + x, self.fc31_input_scale, 16, scale_factor_use_ue8m0, + is_scale_factor_swizzled) + + final_hidden_states = torch.ops.trtllm.fp4_block_scale_moe_runner( + router_logits, + routing_bias, + hidden_states_fp4, + hidden_states_scale_linear_fp4.view(torch.float8_e4m3fn), + self.w3_w1_weight, + self.w3_w1_weight_scale.view(torch.float8_e4m3fn), + self.w2_weight, + self.w2_weight_scale.view(torch.float8_e4m3fn), + self.fc31_scale_c.data, + self.fc31_alpha.data, + self.fc2_alpha.data, + self.num_slots, + top_k, + n_group, + topk_group, + self.intermediate_size_per_partition, + self. + slot_start, # local_expert_start; use ep_rank if stride!=1 + self.expert_size_per_partition, # local_expert_size + routed_scaling_factor, + self.routing_method.routing_method_type, + ) + else: + raise NotImplementedError( + "The TRTLLM backend of FusedMoE only supports fp8_block_scaling and nvfp4 dtypes." + ) + + if self.reduce_results and self.parallel_size > 1: + final_hidden_states = self.all_reduce(final_hidden_states) + + return final_hidden_states + + def alltoall_prepare_maybe_dispatch(self, all_rank_num_tokens: list, + x: torch.Tensor, + token_selected_slots: torch.Tensor, + token_final_scales: torch.Tensor): + top_k = self.routing_method.experts_per_token + expert_count = self.num_experts + # gather router info + max_num_token = max(all_rank_num_tokens) + token_selected_slots = torch.nn.functional.pad( + token_selected_slots, + (0, 0, 0, max_num_token - token_selected_slots.shape[0]), + 'constant', self.num_experts) + token_final_scales = torch.nn.functional.pad( + token_final_scales, + (0, 0, 0, max_num_token - token_final_scales.shape[0])) + gathered_token_selected_slots, gathered_token_final_scales = allgather( + [token_selected_slots, token_final_scales], self.mapping, dim=0) + gathered_token_selected_slots = torch.flatten( + gathered_token_selected_slots.contiguous(), start_dim=0, end_dim=-2) + gathered_token_final_scales = torch.flatten( + gathered_token_final_scales.contiguous(), start_dim=0, end_dim=-2) + gathered_target_rank_ids = MnnvlMoe.compute_target_rank_id( + gathered_token_selected_slots, self.num_experts, self.ep_size) + alltoall_info, token_selected_slots, token_final_scales = MnnvlMoe.mnnvl_moe_alltoallv_prepare( + gathered_target_rank_ids, None, gathered_token_selected_slots, + gathered_token_final_scales, max_num_token, expert_count, top_k, + self.ep_rank, self.ep_size) + + if not self.use_postquant_alltoall: + assert not isinstance( + x, Fp4QuantizedTensor + ), "pre-quant alltoall doesn't support fp4 tensor" + x = MnnvlMoe.mnnvl_moe_alltoallv(x, alltoall_info, + self.alltoall_workspace, + self.ep_rank, self.ep_size) + + return x, token_selected_slots, token_final_scales, alltoall_info + + def alltoall_postquant_dispatch(self, x: torch.Tensor, x_sf: torch.Tensor, + x_row: int, x_col: int, + alltoall_info: MoEAlltoallInfo): + x = 
MnnvlMoe.mnnvl_moe_alltoallv(x, alltoall_info, + self.alltoall_workspace, self.ep_rank, + self.ep_size) + + if x_sf is not None: + if self.has_nvfp4: + x_sf = unswizzle_sf(x_sf, x_row, x_col, + self.scaling_vector_size) + + x_sf = MnnvlMoe.mnnvl_moe_alltoallv(x_sf, alltoall_info, + self.alltoall_workspace, + self.ep_rank, self.ep_size) + + if self.has_nvfp4: + x_sf = swizzle_sf(x_sf, x.shape[0], x.shape[1] * 2, + self.scaling_vector_size) + + return x, x_sf + + def alltoall_combine(self, final_hidden_states: torch.Tensor, + alltoall_info: MoEAlltoallInfo, token_count: int): + top_k = self.routing_method.experts_per_token + if isinstance(final_hidden_states, list): + final_hidden_states = final_hidden_states[0] + final_hidden_states = MnnvlMoe.mnnvl_moe_alltoallv_combine( + final_hidden_states, + alltoall_info, + self.alltoall_workspace, + ep_rank=self.ep_rank, + ep_size=self.ep_size, + top_k=top_k, + token_count=token_count) + + return final_hidden_states + + def load_weights(self, weights: List[Dict]): + assert self._weights_created + assert len(weights) == 1 + weights = weights[0] + + def load_expert_w3_w1_weight(w1_weight, + w3_weight, + dst_w3_w1_weight: torch.Tensor, + is_trtllm: bool = False): + w1_weight_shard = load_weight_shard(w1_weight, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + w3_weight_shard = load_weight_shard(w3_weight, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + + if is_trtllm: + # FIXME: this depends on the kernel internals + epilogue_tile_m = 128 + + # Keep weights in device buffer + dst_w3_weight = dst_w3_w1_weight.narrow( + dim=0, start=0, length=self.intermediate_size_per_partition) + dst_w1_weight = dst_w3_w1_weight.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition) + dst_w3_weight.copy_(w3_weight_shard.view(dst_w3_weight.dtype)) + dst_w1_weight.copy_(w1_weight_shard.view(dst_w1_weight.dtype)) + + # Get permute indices and chain them together + permute0 = get_reorder_rows_for_gated_act_gemm_row_indices( + dst_w3_w1_weight) + permute1 = get_shuffle_matrix_a_row_indices( + dst_w3_w1_weight, epilogue_tile_m) + permute = permute0[permute1] + + # Shuffle the weight according to permute indices + processed_w31_weight_shard = torch.ops.trtllm.shuffle_matrix( + dst_w3_w1_weight, permute.to(dst_w3_w1_weight.device)) + # Copy the result into device buffer + dst_w3_w1_weight.copy_(processed_w31_weight_shard.view( + dst_w3_w1_weight.dtype), + non_blocking=True) + # We are done here so do not continue + return + + w31_weight_shard = torch.cat([w3_weight_shard, w1_weight_shard], + dim=0) + + if self.has_w4afp8 and self.sm_version == 89: + import tensorrt_llm.quantization.functional + preprocessor = tensorrt_llm.quantization.functional.preprocess_weights_for_mixed_gemm + packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4 + unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8 + w31_weight_shard = packer( + unpacker(w31_weight_shard.cpu()).T.contiguous()).to( + w31_weight_shard.device) + w31_weight_shard = preprocessor(w31_weight_shard, + torch.quint4x2, + torch.float8_e4m3fn, + 89).view(dst_w3_w1_weight.shape) + dst_w3_w1_weight.copy_(w31_weight_shard.view( + dst_w3_w1_weight.dtype), + non_blocking=True) + + def load_expert_w2_weight(w2_weight, + dst_w2_weight: torch.Tensor, + is_trtllm: bool = False): + w2_weight_shard = load_weight_shard(w2_weight, self.tp_size, + self.tp_rank, + TensorParallelMode.ROW) + if is_trtllm: + # FIXME: this depends on the kernel internals + 
epilogue_tile_m = 128 + + # Keep weights in device buffer + dst_w2_weight.copy_(w2_weight_shard.view(dst_w2_weight.dtype), + non_blocking=True) + # Get permuted result + processed_w2_weight = shuffle_matrix_a(dst_w2_weight, + epilogue_tile_m) + # Copy the result into device buffer + dst_w2_weight.copy_(processed_w2_weight.view( + dst_w2_weight.dtype), + non_blocking=True) + # We are done here so do not continue + return + + if self.has_w4afp8 and self.sm_version == 89: + import tensorrt_llm.quantization.functional + preprocessor = tensorrt_llm.quantization.functional.preprocess_weights_for_mixed_gemm + packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4 + unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8 + w2_weight_shard = packer( + unpacker(w2_weight_shard.cpu()).T.contiguous()).to( + w2_weight_shard.device) + w2_weight_shard = preprocessor(w2_weight_shard, torch.quint4x2, + torch.float8_e4m3fn, + 89).view(dst_w2_weight.shape) + + dst_w2_weight.copy_(w2_weight_shard.view(dst_w2_weight.dtype), + non_blocking=True) + + # Use multi-threading to load expert weights in parallel. + # Even though CPython has global interpreter lock (GIL), + # it's still faster to load weights in parallel because it can utilize + # CPU memory bandwidth better. + threads = [] + + for local_slot_id, expert_id in enumerate( + self.initial_local_expert_ids): + # expert_idx is the local slot index of current rank + expert_idx = local_slot_id + + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_weight = weights[f"{expert_id}.w1.weight"] + w3_weight = weights[f"{expert_id}.w3.weight"] + w2_weight = weights[f"{expert_id}.w2.weight"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_w3_weight = weights["gate_up_proj"][expert_id].transpose( + 0, 1) + w1_weight, w3_weight = w1_w3_weight.chunk(2, dim=0) + w2_weight = weights["down_proj"][expert_id].transpose( + 0, 1).contiguous() + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + is_trtllm_nvfp4 = self.is_trtllm( + ) and self.quant_config.quant_mode.has_nvfp4() + + thread = threading.Thread(target=load_expert_w3_w1_weight, + args=(w1_weight, w3_weight, + self.w3_w1_weight.data[expert_idx], + is_trtllm_nvfp4)) + thread.start() + threads.append(thread) + + thread = threading.Thread(target=load_expert_w2_weight, + args=(w2_weight, + self.w2_weight.data[expert_idx], + is_trtllm_nvfp4)) + thread.start() + threads.append(thread) + + for thread in threads: + thread.join() + + if self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True): + if self.quant_config.quant_mode.has_fp8_qdq(): + self._load_fp8_qdq_scales(weights) + elif self.quant_config.quant_mode.has_nvfp4(): + self._load_nvfp4_scales(weights) + elif self.quant_config.quant_mode.has_fp8_block_scales(): + self._load_fp8_block_scales_scales(weights) + elif self.quant_config.quant_mode.is_int4_weight_only_per_group(): + self._load_int4_groupwise_scales(weights) + else: + raise ValueError( + f"unsupported quantization mode: {self.quant_config.quant_mode}" + ) + # Re-setup quant scales after loading weights as the tensors may have been modified. 
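+ # setup_quant_scales() below rebuilds the per-mode scale bundle (e.g. FusedMoEQuantScalesNVFP4) so it reflects the freshly loaded values.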
+ self.setup_quant_scales() + + def _load_fp8_block_scales_scales(self, weights: Dict): + all_w2_scales = [ + load_weight_shard(weights[f"{expert_id}.w2.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.ROW) + for expert_id in self.initial_local_expert_ids + ] + + w2_scales = torch.stack(all_w2_scales) + self.w2_weight_scaling_factor.data.copy_(w2_scales) + + all_w3_scales = [ + load_weight_shard(weights[f"{expert_id}.w3.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + + all_w1_scales = [ + load_weight_shard(weights[f"{expert_id}.w1.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + + w3_w1_scales = torch.cat( + [torch.stack(all_w3_scales), + torch.stack(all_w1_scales)], dim=-2) + self.w3_w1_weight_scaling_factor.data.copy_(w3_w1_scales) + + def _load_fp8_qdq_scales(self, weights: Dict): + # Step1: Load input scales. + def load_expert_fc31_input_scale_fp8_qdq( + w1_input_scale, w3_input_scale, + dst_fc31_input_scale: torch.Tensor): + dst_fc31_input_scale.copy_( + max(w1_input_scale[...].reshape([]), + w3_input_scale[...].reshape([]))) + + def load_expert_fc2_input_scale_fp8_qdq( + w2_input_scale, dst_fc2_input_scale: torch.Tensor): + dst_fc2_input_scale.copy_(w2_input_scale[...].reshape([])) + + tmp_fc31_input_scale = torch.empty(self.num_experts, + dtype=torch.float32) + tmp_fc2_input_scale = torch.empty(self.num_experts, dtype=torch.float32) + for expert_id in range(self.num_experts): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_input_scale = weights[f"{expert_id}.w1.input_scale"] + w3_input_scale = weights[f"{expert_id}.w3.input_scale"] + w2_input_scale = weights[f"{expert_id}.w2.input_scale"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_input_scale = weights[f"gate_up_proj_input_scale"] + w3_input_scale = weights[f"gate_up_proj_input_scale"] + w2_input_scale = weights[f"down_proj_input_scale"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + load_expert_fc31_input_scale_fp8_qdq( + w1_input_scale, w3_input_scale, tmp_fc31_input_scale[expert_id]) + + load_expert_fc2_input_scale_fp8_qdq(w2_input_scale, + tmp_fc2_input_scale[expert_id]) + + # max_fc31_input_scale is the maximum of all w1 input scales and w3 input scales. + # It's used to quantize fc31 input inside the MOE op + max_fc31_input_scale = tmp_fc31_input_scale.max() + # max_fc2_input_scale is the maximum of all w2 input scales. + max_fc2_input_scale = tmp_fc2_input_scale.max() + + # Step2: Load weight scales and requantize w3_w1_weight. 
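+ # Requantization sketch: each shard is dequantized with its own scale and requantized with the shared maximum, e.g. requant_w3 = (w3_fp8 * w3_scale) / max(w1_scale, w3_scale), cast back to float8_e4m3fn (see requantize_expert_w3_w1_weight_fp8_qdq below).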
+ tmp_w3_w1_weight_scale = torch.empty(self.expert_size_per_partition, + dtype=torch.float32) + tmp_w2_weight_scale = torch.empty(self.expert_size_per_partition, + dtype=torch.float32) + + def load_expert_w3_w1_weight_scale_fp8_qdq( + w1_weight_scale, w3_weight_scale, + dst_w3_w1_weight_scale: torch.Tensor): + w1_weight_scale = w1_weight_scale[...].reshape([]) + w3_weight_scale = w3_weight_scale[...].reshape([]) + dst_w3_w1_weight_scale.copy_(max(w1_weight_scale, w3_weight_scale)) + + def requantize_expert_w3_w1_weight_fp8_qdq( + w1_weight_scale, w3_weight_scale, + dst_w3_w1_weight: torch.Tensor): + w1_weight_scale = w1_weight_scale[...].reshape([]) + w3_weight_scale = w3_weight_scale[...].reshape([]) + max_w3_w1_weight_scale = max(w1_weight_scale, w3_weight_scale) + + w3_weight = dst_w3_w1_weight.narrow( + dim=0, start=0, length=self.intermediate_size_per_partition).to( + dtype=self.dtype) + w1_weight = dst_w3_w1_weight.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition).to( + dtype=self.dtype) + dequant_w3_weight = w3_weight * w3_weight_scale + dequant_w1_weight = w1_weight * w1_weight_scale + requant_w3_weight = (dequant_w3_weight / max_w3_w1_weight_scale).to( + torch.float8_e4m3fn) + requant_w1_weight = (dequant_w1_weight / max_w3_w1_weight_scale).to( + torch.float8_e4m3fn) + + dst_w3_w1_weight.narrow( + dim=0, start=0, + length=self.intermediate_size_per_partition).copy_( + requant_w3_weight) + dst_w3_w1_weight.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition).copy_( + requant_w1_weight) + + def load_expert_w2_weight_scale_fp8(w2_weight_scale, + dst_w2_weight_scale: torch.Tensor): + dst_w2_weight_scale.copy_(w2_weight_scale[...].reshape([])) + + for local_slot_id, expert_id in enumerate( + self.initial_local_expert_ids): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_weight_scale = weights[f"{expert_id}.w1.weight_scale"] + w3_weight_scale = weights[f"{expert_id}.w3.weight_scale"] + w2_weight_scale = weights[f"{expert_id}.w2.weight_scale"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_weight_scale = weights[f"gate_up_proj_weight_scale"] + w3_weight_scale = weights[f"gate_up_proj_weight_scale"] + w2_weight_scale = weights[f"down_proj_weight_scale"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + expert_idx = local_slot_id + + load_expert_w3_w1_weight_scale_fp8_qdq( + w1_weight_scale, w3_weight_scale, + tmp_w3_w1_weight_scale[expert_idx]) + + requantize_expert_w3_w1_weight_fp8_qdq( + w1_weight_scale, w3_weight_scale, + self.w3_w1_weight.data[expert_idx]) + + load_expert_w2_weight_scale_fp8(w2_weight_scale, + tmp_w2_weight_scale[expert_idx]) + + # Step3: calculate and store final loaded weights + self.fc31_dequant.data.copy_(tmp_w3_w1_weight_scale * + max_fc31_input_scale) + self.fc2_quant.data.copy_(max_fc2_input_scale.reciprocal()) + self.fc2_dequant.data.copy_(tmp_w2_weight_scale * max_fc2_input_scale) + self.fc31_input_dequant.data.copy_(max_fc31_input_scale) + + def _load_nvfp4_scales(self, weights: Dict): + # Step1: Load input scales. 
+ tmp_fc31_input_scale = torch.empty(self.num_experts, + dtype=torch.float32) + tmp_fc2_input_scale = torch.empty(self.num_experts, dtype=torch.float32) + + def load_expert_fc31_input_scale_nvfp4( + w1_input_scale, w3_input_scale, + dst_fc31_input_scale: torch.Tensor): + w1_input_scale = w1_input_scale[...].reshape([]) + w3_input_scale = w3_input_scale[...].reshape([]) + assert torch.allclose( + w1_input_scale, + w3_input_scale), "w1_input_scale != w3_input_scale" + dst_fc31_input_scale.copy_(w1_input_scale) + + def load_expert_fc2_input_scale_nvfp4( + w2_input_scale, dst_fc2_input_scale: torch.Tensor): + dst_fc2_input_scale.copy_(w2_input_scale[...].reshape([])) + + for expert_id in range(self.num_experts): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_input_scale = weights[f"{expert_id}.w1.input_scale"] + w3_input_scale = weights[f"{expert_id}.w3.input_scale"] + w2_input_scale = weights[f"{expert_id}.w2.input_scale"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_input_scale = weights["gate_up_proj_input_scale"] + w3_input_scale = weights["gate_up_proj_input_scale"] + w2_input_scale = weights["down_proj_input_scale"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + load_expert_fc31_input_scale_nvfp4(w1_input_scale, w3_input_scale, + tmp_fc31_input_scale[expert_id]) + load_expert_fc2_input_scale_nvfp4(w2_input_scale, + tmp_fc2_input_scale[expert_id]) + + # fc31_input_scale is the reciprocal of the maximum of all w1 input scales and w3 input scales. + self.fc31_input_scale.data.copy_( + tmp_fc31_input_scale.max().reciprocal()) + # fc2_input_scale is the reciprocal of the maximum of all w2 input scales. + self.fc2_input_scale.data.copy_(tmp_fc2_input_scale.max().reciprocal()) + + if self.is_trtllm(): + block_scales_dtype = torch.float8_e4m3fn + else: + block_scales_dtype = FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE + + # Step2: Load weight block scales and alphas. 
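+ # Rough flow per expert: shard the w3/w1 block-scale tensors column-wise, copy them into the fused buffer, then reorder into the kernel's layout (shuffle_matrix + nvfp4_block_scale_interleave on the TRTLLM path, plain block-scale interleave for CUTLASS).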
+ def load_expert_w3_w1_weight_scale_nvfp4( + w1_weight_scale, w3_weight_scale, + dst_w3_w1_weight_scale: torch.Tensor, is_trtllm: bool): + w1_weight_scale = load_weight_shard(w1_weight_scale, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + w3_weight_scale = load_weight_shard(w3_weight_scale, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + # Keep weights in device buffer + # w3 + dst_w3_weight_scale = dst_w3_w1_weight_scale.narrow( + dim=0, start=0, length=self.intermediate_size_per_partition) + dst_w3_weight_scale.copy_( + w3_weight_scale.view(dst_w3_weight_scale.dtype)) + + # w1 + dst_w1_weight_scale = dst_w3_w1_weight_scale.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition) + dst_w1_weight_scale.copy_( + w1_weight_scale.view(dst_w1_weight_scale.dtype)) + + orig_shape = dst_w3_w1_weight_scale.shape + + if is_trtllm: + # FIXME + epilogue_tile_m = 128 + + # Get permute indices and chain them together + permute0 = get_reorder_rows_for_gated_act_gemm_row_indices( + dst_w3_w1_weight_scale) + permute1 = get_shuffle_matrix_sf_a_row_indices( + dst_w3_w1_weight_scale.view(float4_sf_dtype), + epilogue_tile_m, 16) + permute = permute0[permute1] + + # Shuffle the weight according to permute indices + w3_w1_weight_scale = torch.ops.trtllm.shuffle_matrix( + dst_w3_w1_weight_scale.view(float4_sf_dtype), + permute.cuda()) + # Assert should only be removed during debugging + assert w3_w1_weight_scale.is_cuda, "w3_w1_weight_scale.is_cuda should be true or suffer from slow speed" + # Interleave the weight. + processed_w3_w1_weight_scale = torch.ops.tensorrt_llm.nvfp4_block_scale_interleave( + w3_w1_weight_scale.view(float4_sf_dtype).reshape( + orig_shape)) + # Copy the result into device buffer + dst_w3_w1_weight_scale.copy_( + processed_w3_w1_weight_scale.view( + block_scales_dtype).reshape(orig_shape)) + else: + dst_w3_w1_weight_scale.copy_( + torch.ops.tensorrt_llm.nvfp4_block_scale_interleave( + dst_w3_w1_weight_scale.view(float4_sf_dtype)).view( + block_scales_dtype).reshape(orig_shape)) + + def load_expert_w2_weight_scale_nvfp4(w2_weight_scale, + dst_w2_weight_scale: torch.Tensor, + is_trtllm: bool): + w2_weight_scale = load_weight_shard(w2_weight_scale, self.tp_size, + self.tp_rank, + TensorParallelMode.ROW) + # Keep weights in device buffer + dst_w2_weight_scale.copy_( + w2_weight_scale.view(dst_w2_weight_scale.dtype)) + + orig_shape = dst_w2_weight_scale.shape + if is_trtllm: + epilogue_tile_m = 128 # FIXME: read from kernel + # Assert should only be removed during debugging + assert dst_w2_weight_scale.is_cuda, "dst_w2_weight_scale.is_cuda should be true or suffer from slow speed" + # Interleave the weight and copy + dst_w2_weight_scale.copy_( + shuffle_matrix_sf_a( + dst_w2_weight_scale.view(float4_sf_dtype), + epilogue_tile_m, + 16).view(block_scales_dtype).reshape(orig_shape)) + else: + dst_w2_weight_scale.copy_( + torch.ops.tensorrt_llm.nvfp4_block_scale_interleave( + dst_w2_weight_scale.view(float4_sf_dtype)).view( + block_scales_dtype).reshape(orig_shape)) + + def load_expert_fc31_alpha_nvfp4(w1_weight_scale_2, w3_weight_scale_2, + final_fc31_input_scale: torch.Tensor, + dst_fc31_alpha: torch.Tensor): + w1_weight_scale_2 = w1_weight_scale_2[...].reshape([]) + w3_weight_scale_2 = w3_weight_scale_2[...].reshape([]) + assert torch.allclose( + w1_weight_scale_2, + w3_weight_scale_2), "w1_weight_scale_2 != w3_weight_scale_2" + + w3_w1_weight_scale_2 = 1.0 / w1_weight_scale_2 + dst_fc31_alpha.copy_( + 1.0 / 
(final_fc31_input_scale * w3_w1_weight_scale_2)) + + def load_expert_fc2_alpha_nvfp4(w2_weight_scale_2, + final_fc2_input_scale: torch.Tensor, + dst_w2_alpha: torch.Tensor): + w2_weight_scale_2 = 1.0 / w2_weight_scale_2[...].reshape([]) + dst_w2_alpha.copy_(1.0 / + (final_fc2_input_scale * w2_weight_scale_2)) + + for local_slot_id, expert_id in enumerate( + self.initial_local_expert_ids): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_weight_scale = weights[f"{expert_id}.w1.weight_scale"] + w3_weight_scale = weights[f"{expert_id}.w3.weight_scale"] + w2_weight_scale = weights[f"{expert_id}.w2.weight_scale"] + w1_weight_scale_2 = weights[f"{expert_id}.w1.weight_scale_2"] + w3_weight_scale_2 = weights[f"{expert_id}.w3.weight_scale_2"] + w2_weight_scale_2 = weights[f"{expert_id}.w2.weight_scale_2"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_w3_weight_scale = weights["gate_up_proj_weight_scale"][ + expert_id].transpose(0, 1).contiguous() + w1_weight_scale, w3_weight_scale = w1_w3_weight_scale.chunk( + 2, dim=0) + w2_weight_scale = weights["down_proj_weight_scale"][ + expert_id].transpose(0, 1).contiguous() + w1_weight_scale_2 = weights["gate_up_proj_weight_scale_2"] + w3_weight_scale_2 = weights["gate_up_proj_weight_scale_2"] + w2_weight_scale_2 = weights["down_proj_weight_scale_2"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + expert_idx = local_slot_id + + load_expert_w3_w1_weight_scale_nvfp4( + w1_weight_scale, w3_weight_scale, + self.w3_w1_weight_scale.data[expert_idx], self.is_trtllm()) + load_expert_w2_weight_scale_nvfp4( + w2_weight_scale, self.w2_weight_scale.data[expert_idx], + self.is_trtllm()) + + load_expert_fc31_alpha_nvfp4(w1_weight_scale_2, w3_weight_scale_2, + self.fc31_input_scale.data, + self.fc31_alpha.data[expert_idx]) + load_expert_fc2_alpha_nvfp4(w2_weight_scale_2, + self.fc2_input_scale.data, + self.fc2_alpha.data[expert_idx]) + if self.is_trtllm(): + self.fc31_scale_c.data.copy_(self.fc2_input_scale.data * + self.fc31_alpha.data, + non_blocking=True) + + def _load_int4_groupwise_scales(self, weights: Dict): + # fc31 scales + assert (len(self.interleave) == 2) + all_w3_input_scales = [ + load_weight_shard(weights[f"{expert_id}.w3.input_scale"]) + for expert_id in self.initial_local_expert_ids + ] + all_w1_input_scales = [ + load_weight_shard(weights[f"{expert_id}.w1.input_scale"]) + for expert_id in self.initial_local_expert_ids + ] + all_w3_w1_input_scales = torch.max(torch.stack(all_w3_input_scales), + torch.stack(all_w1_input_scales)) + all_w3_w1_input_scales = torch.ones_like( + all_w3_w1_input_scales) * all_w3_w1_input_scales.max() + self.fc31_act_scale.data.copy_(1 / all_w3_w1_input_scales) + self.fc31_alpha.data.copy_(all_w3_w1_input_scales.float()) + + all_w3_scales = [ + load_weight_shard(weights[f"{expert_id}.w3.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + all_w1_scales = [ + load_weight_shard(weights[f"{expert_id}.w1.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + all_w3_w1_scales = torch.cat( + [torch.stack(all_w3_scales), + torch.stack(all_w1_scales)], dim=-2) + if self.sm_version == 89: + w3_w1_scales = all_w3_w1_scales.to(torch.float16).view(self.dtype) + else: + w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16).view(self.dtype) + w3_w1_s_shape = 
w3_w1_scales.shape + w3_w1_scales_interleaved = w3_w1_scales.reshape( + w3_w1_s_shape[0], w3_w1_s_shape[1], + (w3_w1_s_shape[2] // self.interleave[0]), self.interleave[0]) + w3_w1_scales_interleaved = w3_w1_scales_interleaved.permute(0, 2, 1, 3) + w3_w1_scales_interleaved = w3_w1_scales_interleaved.reshape( + w3_w1_s_shape[0], w3_w1_s_shape[2] // self.interleave[0], + w3_w1_s_shape[1] * self.interleave[0]) + self.fc31_weight_scale.data.copy_(w3_w1_scales_interleaved.contiguous()) + + # fc2 scales + all_w2_input_scales = [ + load_weight_shard(weights[f"{expert_id}.w2.input_scale"]) + for expert_id in self.initial_local_expert_ids + ] + all_w2_input_scales = torch.stack(all_w2_input_scales).to(self.dtype) + all_w2_input_scales = torch.ones_like( + all_w2_input_scales) * all_w2_input_scales.max() + self.fc2_act_scale.data.copy_(1 / all_w2_input_scales) + self.fc2_alpha.data.copy_(all_w2_input_scales.float()) + + all_w2_scales = [ + load_weight_shard(weights[f"{expert_id}.w2.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.ROW) + for expert_id in self.initial_local_expert_ids + ] + if self.sm_version == 89: + w2_scales = torch.stack(all_w2_scales).to(torch.float16).view( + self.dtype) + else: + w2_scales = torch.stack(all_w2_scales).to(torch.bfloat16).view( + self.dtype) + w2_s_shape = w2_scales.shape + w2_scales_interleaved = w2_scales.reshape( + w2_s_shape[0], w2_s_shape[1], (w2_s_shape[2] // self.interleave[1]), + self.interleave[1]) + w2_scales_interleaved = w2_scales_interleaved.permute(0, 2, 1, 3) + w2_scales_interleaved = w2_scales_interleaved.reshape( + w2_s_shape[0], w2_s_shape[2] // self.interleave[1], + w2_s_shape[1] * self.interleave[1]) + self.fc2_weight_scale.data.copy_(w2_scales_interleaved.contiguous()) + + +class FusedMoEQuantScalesFP8(NamedTuple): + fc1_dequant: torch.Tensor + fc2_quant: torch.Tensor + fc2_dequant: torch.Tensor + fc1_input_dequant: torch.Tensor + + +class FusedMoEQuantScalesNVFP4(NamedTuple): + fc1_act_global: torch.Tensor + fc1_weight_block: torch.Tensor + # fc1_global_scale = 1.0 / (fc1_weight_global_scale * fc1_act_global_scale) + fc1_global: torch.Tensor + + fc2_act_global: torch.Tensor + fc2_weight_block: torch.Tensor + # fc2_global_scale = 1.0 / (fc2_weight_global_scale * fc2_act_global_scale) + fc2_global: torch.Tensor + + +class FusedMoEQuantScalesFP8BlockScales(NamedTuple): + fc_weight_scales: torch.Tensor + proj_weight_scales: torch.Tensor + + +class FusedMoEQuantScalesW4A8(NamedTuple): + scale_1_interleaved: torch.Tensor + scale_2_interleaved: torch.Tensor + pre_quant_scale_1: torch.Tensor + pre_quant_scale_2: torch.Tensor + zero_1: torch.Tensor + zero_2: torch.Tensor + alpha_1: torch.Tensor + alpha_2: torch.Tensor diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py index e65d96daafb..f6a0e9323f7 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py @@ -69,7 +69,8 @@ def __init__( self.mapping = model_config.mapping self.parallel_size = self.mapping.tp_size - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py index bcf51067a72..3cc73d15dd8 100644 --- 
a/tensorrt_llm/_torch/modules/fused_moe/interface.py +++ b/tensorrt_llm/_torch/modules/fused_moe/interface.py @@ -78,7 +78,8 @@ def __init__( self.parallel_size = self.mapping.tp_size self.intermediate_size_per_partition = intermediate_size // self.tp_size - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) @abstractmethod def create_weights(self): diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index b0062d043e9..bd554b51089 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -17,6 +17,7 @@ from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig +from ..model_config import ModelConfig from ..utils import Fp4QuantizedTensor @@ -694,7 +695,8 @@ def __init__( self.in_features = local_in_features self.out_features = local_out_features - self.all_reduce = AllReduce(self.mapping) if reduce_output else None + self.all_reduce = AllReduce(model_config=ModelConfig( + mapping=self.mapping)) if reduce_output else None self._weights_created = False self.reduce_output = reduce_output self.use_custom_cublas_mm = use_custom_cublas_mm diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py index 533b21b0502..041ee1f6dad 100644 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ b/tensorrt_llm/_torch/pyexecutor/config.py @@ -86,6 +86,7 @@ class PyTorchConfig: # If true, enable min-latency mode. Currently only used for Llama4. enable_min_latency: bool = False + allreduce_strategy: str = "AUTO" EXETENDED_EXECUTOR_CONFIG_FIELDS = [ diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index c15e00c8568..e67156ec1ab 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -3880,6 +3880,7 @@ class AllReduceStrategy(IntEnum): ONESHOT = 4 TWOSHOT = 5 LOWPRECISION = 6 + MNNVL = 7 class AllReduceFusionOp(IntEnum): diff --git a/tests/unittest/_torch/multi_gpu/test_allreduce.py b/tests/unittest/_torch/multi_gpu/test_allreduce.py index fec42bc3384..820fbb0b684 100644 --- a/tests/unittest/_torch/multi_gpu/test_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_allreduce.py @@ -124,7 +124,7 @@ def e2m1_and_ufp8sf_scale_to_float_v2(e2m1_tensor, ).cuda() norm = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() - allreduce = AllReduce(mapping=mapping).cuda() + allreduce = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() scale = torch.tensor(1.0, dtype=torch.float32).cuda() linear.load_weights([dict(weight=weights[0])]) diff --git a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py index 3aa6871fbe4..3d8933f4115 100644 --- a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py @@ -22,6 +22,7 @@ from mpi4py.futures import MPIPoolExecutor from tensorrt_llm._torch.distributed import AllReduceStrategy +from tensorrt_llm._torch.model_config import ModelConfig cloudpickle.register_pickle_by_value(sys.modules[__name__]) MPI.pickle.__init__( @@ -87,8 +88,9 @@ def __init__(self, rank=self.rank, ) - self.allreduce = AllReduce(mapping=self.mapping, - strategy=self.strategy).cuda() + self.allreduce = AllReduce(model_config=ModelConfig( + mapping=self.mapping, + allreduce_backend=self.strategy), ).cuda() self.input_tensors = [] for i in range(self.world_size): diff --git 
a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index e26946b1fb0..17e8c2636fd 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -26,6 +26,8 @@ import tensorrt_llm from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams) +from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping cloudpickle.register_pickle_by_value(sys.modules[__name__]) @@ -97,13 +99,15 @@ def row_linear_residual_norm_fusion_forward( MPI.COMM_WORLD.barrier() - allreduce = AllReduce(mapping=Mapping( - world_size=tensor_parallel_size, - tp_size=tensor_parallel_size, - rank=tensor_parallel_rank, - ), - dtype=dtype, - ar_backend="MNVL") + allreduce = AllReduce( + model_config=ModelConfig(mapping=Mapping( + world_size=tensor_parallel_size, + tp_size=tensor_parallel_size, + rank=tensor_parallel_rank, + ), + strategy=AllReduceStrategy.MNNVL), + dtype=dtype, + ) # Since all the modules here are provided by TRT-LLM, # so it has to be fullgraph compatible diff --git a/tests/unittest/_torch/multi_gpu/test_user_buffers.py b/tests/unittest/_torch/multi_gpu/test_user_buffers.py index 32b0af5ef8c..1207252c134 100644 --- a/tests/unittest/_torch/multi_gpu/test_user_buffers.py +++ b/tests/unittest/_torch/multi_gpu/test_user_buffers.py @@ -17,6 +17,7 @@ from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, AllReduceStrategy, userbuffers_allreduce_finalize) +from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm.mapping import Mapping @@ -128,7 +129,8 @@ def run_single_rank_ar_rms_norm(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(mapping, strategy=AllReduceStrategy.UB) + ar = AllReduce(model_config=ModelConfig( + mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM, @@ -220,7 +222,8 @@ def run_single_rank_ar_rms_norm_fp8(tensor_parallel_size, a, b, c, gamma, tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(mapping, strategy=AllReduceStrategy.UB) + ar = AllReduce(model_config=ModelConfig( + mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8, @@ -605,7 +608,8 @@ def run_single_rank_ar_rms_norm_fp4(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(mapping, strategy=AllReduceStrategy.UB) + ar = AllReduce(model_config=ModelConfig( + mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4, @@ -692,9 +696,9 @@ def __init__(self, tp_size, rank, hidden_size, dtype, eps, norm0_gamma, tp_size=tp_size, rank=rank, ) - self.ar_0 = AllReduce(mapping).cuda() - self.ar_1 = AllReduce(mapping).cuda() - self.ar_2 = AllReduce(mapping).cuda() + self.ar_0 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + self.ar_1 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + self.ar_2 = 
AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() self.norm0 = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() self.norm1 = RMSNorm(hidden_size=hidden_size, eps=eps, From bd1b058d085e8cf16c58ef615f3ce59be36a6f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hui=20Gao=C3=A2=C2=80?= Date: Mon, 9 Jun 2025 04:50:36 -0700 Subject: [PATCH 6/9] Fix docs test cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hui Gao Signed-off-by: Hui Gao†--- tests/unittest/api_stability/references_committed/llm.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index f2c90635fbe..a321b13e64d 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -105,6 +105,9 @@ methods: kv_cache_config: annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null + allreduce_strategy: + annotation: Optional[str] + default: AUTO return_annotation: None generate: parameters: From d93a8760c03a92d29ab188460ee25b8f64fe5af3 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 04:36:05 +0000 Subject: [PATCH 7/9] Address comments Signed-off-by: Hui Gao --- .../advanced/lowprecision-pcie-allreduce.md | 14 ++-- .../_torch/auto_deploy/distributed/trtllm.py | 2 +- tensorrt_llm/_torch/distributed/ops.py | 64 +++++++++---------- tensorrt_llm/_torch/model_config.py | 7 +- .../_torch/models/modeling_deepseekv3.py | 8 ++- tensorrt_llm/_torch/models/modeling_llama.py | 8 ++- .../_torch/models/modeling_qwen3_moe.py | 6 +- tensorrt_llm/_torch/modules/fused_moe.py | 6 +- tensorrt_llm/_torch/modules/linear.py | 5 +- .../_torch/multi_gpu/test_allreduce.py | 2 +- .../multi_gpu/test_lowprecision_allreduce.py | 6 +- .../_torch/multi_gpu/test_mnnvl_allreduce.py | 5 +- .../_torch/multi_gpu/test_user_buffers.py | 16 ++--- .../references_committed/llm.yaml | 2 +- 14 files changed, 72 insertions(+), 79 deletions(-) diff --git a/docs/source/advanced/lowprecision-pcie-allreduce.md b/docs/source/advanced/lowprecision-pcie-allreduce.md index 57ca754c4e1..b7ab5070370 100644 --- a/docs/source/advanced/lowprecision-pcie-allreduce.md +++ b/docs/source/advanced/lowprecision-pcie-allreduce.md @@ -41,12 +41,12 @@ The Low-Precision-AllReduce algorithm can be enabled in two ways: ``` AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.LOWPRECISION); ``` -2. **Environment variable control** with AUTO strategy: + +2. Enable by LlmArgs ``` -// In your code -AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.AUTO); -// Set environment variable before running -export FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY=1 +Set allreduce_strategy field in LlmArgs. +Candidates of strategies are "AUTO", "NCCL", "UB", "MINLATENCY", "ONESHOT", "TWOSHOT", "LOWPRECISION" and "MNNVL". +If no strategy is set, AUTO will be set. ``` ## Performance and Accuracy Considerations @@ -58,8 +58,4 @@ Low-Precision-AllReduce reduces communication volume by using FP8 data format fo Users should evaluate the precision impact on their specific models and workloads. -## Environment Variables - -- `FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY`: When set to `1`, forces the use of low-precision algorithm with AUTO strategy. If the algorithm determines it cannot provide performance benefits, it will automatically fall back to other strategies. 
- **Note**: When compiling TensorRT-LLM without enabling the `ENABLE_FP8` option, setting Low Precision allreduce will not take effect. diff --git a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py index e0ac0db1b8e..dd9313df0f1 100644 --- a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py +++ b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py @@ -17,7 +17,7 @@ def trtllm_allreduce(tensor, op, all_reduce_params=None): rank, world_size = get_rank_world_size() assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op." p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank) - torch_op = AllReduce(p_config) + torch_op = AllReduce(mapping=p_config) return torch_op(tensor, all_reduce_params=all_reduce_params) @torch.library.custom_op( diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 7e18458a0f6..1c8d8023fa4 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -15,8 +15,6 @@ from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper -from ..model_config import ModelConfig - _thread_local = threading.local() @@ -311,16 +309,16 @@ def __init__(self, mapping: Mapping, dtype: torch.dtype): self.mapping = mapping self.dtype = dtype assert ( - dtype in MNNVLAllReduce.get_supported_dtype() + dtype in MNNVLAllReduce.get_supported_dtypes() and (not mapping.has_cp()) - ), "MNNVL all reduce only support dtype {MNNVLAllReduce.get_supported_dtype()} and without cp." + ), "MNNVL all reduce only supports dtype {MNNVLAllReduce.get_supported_dtypes()} and without cp." self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace( self.mapping, dtype) @staticmethod - def get_supported_dtype(): - return [torch.bfloat16, torch.float32] + def get_supported_dtypes(): + return (torch.bfloat16, torch.float32) def forward( self, @@ -377,38 +375,38 @@ def forward( class AllReduce(nn.Module): def __init__(self, - dtype: Optional[torch.dtype] = None, - model_config: ModelConfig = ModelConfig()): + mapping: Mapping, + strategy: AllReduceStrategy = AllReduceStrategy.AUTO, + dtype: Optional[torch.dtype] = None): super().__init__() """ AllReduce is a module that performs an all-reduce operation on a tensor. Args: - model_config (ModelConfig): mapping and strategy in it are used. - mapping (Mapping): The parallel mapping config. - strategy (AllReduceStrategy): - The following all-reduce strategies are supported: + mapping (Mapping): The parallel mapping config. + strategy (AllReduceStrategy): + The following all-reduce strategies are supported: - - UB: AllReduce uses user-buffer based all-reduce kernel. + - UB: AllReduce uses user-buffer based all-reduce kernel. - - NCCL: Use NCCL allreduce. + - NCCL: Use NCCL allreduce. - - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. + - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. - - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. + - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. - - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. - Should only be used on topologies with PCIe switches and without NVLink. - This strategy may result in some precision loss but can improve performance - on specific hardware configurations. 
+ - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. + Should only be used on topologies with PCIe switches and without NVLink. + This strategy may result in some precision loss but can improve performance + on specific hardware configurations. - All strategies support the following operations: - - NONE (AllReduce only) - - RESIDUAL_RMS_NORM - - RESIDUAL_RMS_NORM_QUANT_FP8 - - RESIDUAL_RMS_NORM_QUANT_NVFP4 - - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 - - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 + All strategies support the following operations: + - NONE (AllReduce only) + - RESIDUAL_RMS_NORM + - RESIDUAL_RMS_NORM_QUANT_FP8 + - RESIDUAL_RMS_NORM_QUANT_NVFP4 + - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 + - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 Note: NCCL, UB, and LOWPRECISION strategies only support consequent kernel calls instead of fused operations. @@ -417,18 +415,14 @@ def __init__(self, For the reference implementation for each pattern, please refer to the following unit test: https://github.com/NVIDIA/TensorRT-LLM/blob/main/tests/unittest/_torch/multi_gpu/test_allreduce.py - The LOWPRECISION strategy can be selected either by directly specifying it in the constructor - or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY when using - the AUTO strategy. + The LOWPRECISION strategy can be selected either by directly specifying it in the constructor. """ - self.mapping = model_config.mapping + self.mapping = mapping self.workspace = None - self.strategy = model_config.allreduce_backend + self.strategy = strategy self.mnnvl_allreduce = None - self.force_low_precision_env = os.environ.get( - "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY") if self.mapping.tp_size > 1: # When Strategy is UB, it is guaranteed that the workspace is not used. 
if self.strategy != AllReduceStrategy.UB: @@ -438,7 +432,7 @@ def __init__(self, # Initialize MNNVL AllReduce if needed if self.strategy == AllReduceStrategy.MNNVL and ( - dtype and dtype in MNNVLAllReduce.get_supported_dtype() + dtype and dtype in MNNVLAllReduce.get_supported_dtypes() ) and (not self.mapping.has_cp()): self.mnnvl_allreduce = MNNVLAllReduce(self.mapping, dtype) if dtype else None diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index 0b0e8a9210e..05471144f5f 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -109,12 +109,12 @@ def __post_init__(self): self.is_generation = self.is_generation_model( self.pretrained_config.architectures) - def map_ar_strategy(strategy: str = "AUTO"): + def get_all_reduce_strategy(strategy: str = "AUTO"): maps = { "AUTO": AllReduceStrategy.AUTO, "NCCL": AllReduceStrategy.NCCL, "UB": AllReduceStrategy.UB, - "MIN_LATENCY": AllReduceStrategy.MIN_LATENCY, + "MINLATENCY": AllReduceStrategy.MIN_LATENCY, "ONESHOT": AllReduceStrategy.ONESHOT, "TWOSHOT": AllReduceStrategy.TWOSHOT, "LOWPRECISION": AllReduceStrategy.LOWPRECISION, @@ -124,7 +124,8 @@ def map_ar_strategy(strategy: str = "AUTO"): return maps[key] if key in maps else AllReduceStrategy.AUTO if isinstance(self.allreduce_backend, str): - self.allreduce_backend = map_ar_strategy(self.allreduce_backend) + self.allreduce_backend = get_all_reduce_strategy( + self.allreduce_backend) @property def fuse_pos_embd(self): diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 67973dc90ba..65fc249f48b 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -399,7 +399,8 @@ def __init__(self, overridden_tp_size=shared_tp_size, reduce_output=False) - self.allreduce = AllReduce(model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared] self.event_dict = { key: torch.cuda.Event() @@ -628,8 +629,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], eps=config.rms_norm_eps, dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(dtype=config.torch_dtype, - model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend, + dtype=config.torch_dtype) self.moe_allreduce = MoEAllReduce(self.mapping) self.next_layer_layernorm: RMSNorm = None diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index a852560af10..4d2b677d46e 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -282,7 +282,10 @@ def __init__( quant_config=None) self.mapping = model_config.mapping - self.all_reduce = AllReduce(model_config=model_config) + self.all_reduce = AllReduce( + mapping=model_config.mapping, + strategy=model_config.allreduce_backend, + ) self.moe_event = [torch.cuda.Event(), torch.cuda.Event()] self.aux_stream = aux_stream @@ -414,7 +417,8 @@ def __init__( dtype=config.torch_dtype) self.mapping = model_config.mapping - self.all_reduce = AllReduce(model_config=model_config) + self.all_reduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None diff --git 
a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index 6a1e13b1467..90ded9ad9c8 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -89,7 +89,8 @@ def __init__( self.top_k = config.num_experts_per_tok self.enable_attention_dp = model_config.mapping.enable_attention_dp self.mapping = model_config.mapping - self.allreduce = AllReduce(model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.enable_alltoall = Qwen3MoE.should_enable_alltoall( model_config, self.top_k) if self.enable_alltoall: @@ -202,7 +203,8 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig], dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.next_layer_layernorm: RMSNorm = None self.fusion_config = EagerFusionConfig() diff --git a/tensorrt_llm/_torch/modules/fused_moe.py b/tensorrt_llm/_torch/modules/fused_moe.py index 7df752814e2..ba21b3eb738 100755 --- a/tensorrt_llm/_torch/modules/fused_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe.py @@ -355,7 +355,8 @@ def __init__( self.mapping = model_config.mapping self.parallel_size = self.mapping.tp_size - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) self.intermediate_size_per_partition = intermediate_size // self.tp_size @@ -933,7 +934,8 @@ def __init__( self.mapping = model_config.mapping self.parallel_size = self.mapping.tp_size - self.all_reduce = AllReduce(model_config=model_config) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index bd554b51089..48665b7b41e 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -17,7 +17,6 @@ from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig -from ..model_config import ModelConfig from ..utils import Fp4QuantizedTensor @@ -695,8 +694,8 @@ def __init__( self.in_features = local_in_features self.out_features = local_out_features - self.all_reduce = AllReduce(model_config=ModelConfig( - mapping=self.mapping)) if reduce_output else None + self.all_reduce = AllReduce( + mapping=self.mapping) if reduce_output else None self._weights_created = False self.reduce_output = reduce_output self.use_custom_cublas_mm = use_custom_cublas_mm diff --git a/tests/unittest/_torch/multi_gpu/test_allreduce.py b/tests/unittest/_torch/multi_gpu/test_allreduce.py index 820fbb0b684..fec42bc3384 100644 --- a/tests/unittest/_torch/multi_gpu/test_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_allreduce.py @@ -124,7 +124,7 @@ def e2m1_and_ufp8sf_scale_to_float_v2(e2m1_tensor, ).cuda() norm = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() - allreduce = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + allreduce = AllReduce(mapping=mapping).cuda() scale = torch.tensor(1.0, dtype=torch.float32).cuda() linear.load_weights([dict(weight=weights[0])]) diff --git a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py 
index 3d8933f4115..5245c454be4 100644 --- a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py @@ -22,7 +22,6 @@ from mpi4py.futures import MPIPoolExecutor from tensorrt_llm._torch.distributed import AllReduceStrategy -from tensorrt_llm._torch.model_config import ModelConfig cloudpickle.register_pickle_by_value(sys.modules[__name__]) MPI.pickle.__init__( @@ -88,9 +87,8 @@ def __init__(self, rank=self.rank, ) - self.allreduce = AllReduce(model_config=ModelConfig( - mapping=self.mapping, - allreduce_backend=self.strategy), ).cuda() + self.allreduce = AllReduce(mapping=self.mapping, + allreduce_backend=self.strategy).cuda() self.input_tensors = [] for i in range(self.world_size): diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 17e8c2636fd..595ff09d12e 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -26,7 +26,6 @@ import tensorrt_llm from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams) -from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping @@ -100,12 +99,12 @@ def row_linear_residual_norm_fusion_forward( MPI.COMM_WORLD.barrier() allreduce = AllReduce( - model_config=ModelConfig(mapping=Mapping( + mapping=Mapping( world_size=tensor_parallel_size, tp_size=tensor_parallel_size, rank=tensor_parallel_rank, ), - strategy=AllReduceStrategy.MNNVL), + strategy=AllReduceStrategy.MNNVL, dtype=dtype, ) diff --git a/tests/unittest/_torch/multi_gpu/test_user_buffers.py b/tests/unittest/_torch/multi_gpu/test_user_buffers.py index 1207252c134..66934a7ccc4 100644 --- a/tests/unittest/_torch/multi_gpu/test_user_buffers.py +++ b/tests/unittest/_torch/multi_gpu/test_user_buffers.py @@ -17,7 +17,6 @@ from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, AllReduceStrategy, userbuffers_allreduce_finalize) -from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm.mapping import Mapping @@ -129,8 +128,7 @@ def run_single_rank_ar_rms_norm(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(model_config=ModelConfig( - mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) + ar = AllReduce(mapping=mapping, strategy=AllReduceStrategy.UB) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM, @@ -222,8 +220,7 @@ def run_single_rank_ar_rms_norm_fp8(tensor_parallel_size, a, b, c, gamma, tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(model_config=ModelConfig( - mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) + ar = AllReduce(mapping=mapping, strategy=AllReduceStrategy.UB) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8, @@ -608,8 +605,7 @@ def run_single_rank_ar_rms_norm_fp4(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(model_config=ModelConfig( - mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) + ar = AllReduce(mapping=mapping, strategy=AllReduceStrategy.UB) ar_params = AllReduceParams( 
strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4, @@ -696,9 +692,9 @@ def __init__(self, tp_size, rank, hidden_size, dtype, eps, norm0_gamma, tp_size=tp_size, rank=rank, ) - self.ar_0 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() - self.ar_1 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() - self.ar_2 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + self.ar_0 = AllReduce(mapping=mapping).cuda() + self.ar_1 = AllReduce(mapping=mapping).cuda() + self.ar_2 = AllReduce(mapping=mapping).cuda() self.norm0 = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() self.norm1 = RMSNorm(hidden_size=hidden_size, eps=eps, diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index a321b13e64d..cbb0f5681e1 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -106,7 +106,7 @@ methods: annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null allreduce_strategy: - annotation: Optional[str] + annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']] default: AUTO return_annotation: None generate: From a7fab8b1ce9e8bc7802ca6e051961e8c76c06b3b Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Tue, 10 Jun 2025 01:10:12 +0000 Subject: [PATCH 8/9] Address comments to remove code setting strategies to Linear whem no mappingg Signed-off-by: Hui Gao --- .../out_of_tree_example/modeling_opt.py | 34 +++++++++--------- .../_torch/auto_deploy/distributed/trtllm.py | 4 +-- tensorrt_llm/_torch/model_config.py | 8 ++--- .../_torch/models/modeling_deepseekv3.py | 4 +-- tensorrt_llm/_torch/models/modeling_llama.py | 6 ++-- .../_torch/models/modeling_nemotron_nas.py | 2 +- .../_torch/models/modeling_qwen3_moe.py | 4 +-- tensorrt_llm/_torch/modules/attention.py | 14 ++++---- tensorrt_llm/_torch/modules/fused_moe.py | 4 +-- .../modules/fused_moe/fused_moe_vanilla.py | 2 +- .../_torch/modules/fused_moe/interface.py | 2 +- tensorrt_llm/_torch/modules/gated_mlp.py | 4 +-- tensorrt_llm/_torch/modules/linear.py | 7 ++-- .../_torch/modules/mamba/mamba2_mixer.py | 36 +++++++++---------- tensorrt_llm/_torch/modules/mlp.py | 6 ++-- .../multi_gpu/test_lowprecision_allreduce.py | 2 +- 16 files changed, 71 insertions(+), 68 deletions(-) diff --git a/examples/pytorch/out_of_tree_example/modeling_opt.py b/examples/pytorch/out_of_tree_example/modeling_opt.py index 11c8b8d6746..320a431bc74 100644 --- a/examples/pytorch/out_of_tree_example/modeling_opt.py +++ b/examples/pytorch/out_of_tree_example/modeling_opt.py @@ -64,24 +64,22 @@ def __init__( config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine, dtype=config.torch_dtype) - self.fc1 = Linear( - config.hidden_size, - config.ffn_dim, - bias=config.enable_bias, - dtype=config.torch_dtype, - mapping=model_config.mapping, - tensor_parallel_mode=TensorParallelMode.COLUMN, - quant_config=model_config.get_quant_config(), - ) - self.fc2 = Linear( - config.ffn_dim, - config.hidden_size, - bias=config.enable_bias, - dtype=config.torch_dtype, - mapping=model_config.mapping, - tensor_parallel_mode=TensorParallelMode.ROW, - quant_config=model_config.get_quant_config(), - ) + self.fc1 = Linear(config.hidden_size, + config.ffn_dim, + bias=config.enable_bias, + dtype=config.torch_dtype, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + 
quant_config=model_config.get_quant_config(), + allreduce_strategy=model_config.allreduce_strategy) + self.fc2 = Linear(config.ffn_dim, + config.hidden_size, + bias=config.enable_bias, + dtype=config.torch_dtype, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.ROW, + quant_config=model_config.get_quant_config(), + allreduce_strategy=model_config.allreduce_strategy) self.final_layer_norm = LayerNorm( config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine, diff --git a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py index dd9313df0f1..e42da002f6d 100644 --- a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py +++ b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py @@ -6,7 +6,7 @@ try: from ....mapping import Mapping from ...distributed import AllReduce, allgather - from ...modules.linear import AllReduceFusionOp, AllReduceParams + from ...modules.linear import AllReduceFusionOp, AllReduceParams, AllReduceStrategy def trtllm_allgather(tensor, dim, sizes=None): rank, world_size = get_rank_world_size() @@ -17,7 +17,7 @@ def trtllm_allreduce(tensor, op, all_reduce_params=None): rank, world_size = get_rank_world_size() assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op." p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank) - torch_op = AllReduce(mapping=p_config) + torch_op = AllReduce(mapping=p_config, strategy=AllReduceStrategy.AUTO) return torch_op(tensor, all_reduce_params=all_reduce_params) @torch.library.custom_op( diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index 05471144f5f..f5a3d5f4199 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -79,7 +79,7 @@ class ModelConfig(Generic[TConfig]): attn_backend: str = 'TRTLLM' moe_backend: str = 'CUTLASS' # options can be CUTLASS, TRTLLM - allreduce_backend: AllReduceStrategy = AllReduceStrategy.AUTO + allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO # If true, enable min-latency mode. Currently only used for Llama4. 
enable_min_latency: bool = False @@ -123,9 +123,9 @@ def get_all_reduce_strategy(strategy: str = "AUTO"): key = strategy.upper() return maps[key] if key in maps else AllReduceStrategy.AUTO - if isinstance(self.allreduce_backend, str): - self.allreduce_backend = get_all_reduce_strategy( - self.allreduce_backend) + if isinstance(self.allreduce_strategy, str): + self.allreduce_strategy = get_all_reduce_strategy( + self.allreduce_strategy) @property def fuse_pos_embd(self): diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 65fc249f48b..f5d3417f88f 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -400,7 +400,7 @@ def __init__(self, reduce_output=False) self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared] self.event_dict = { key: torch.cuda.Event() @@ -630,7 +630,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], dtype=config.torch_dtype) self.layer_idx = layer_idx self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend, + strategy=model_config.allreduce_strategy, dtype=config.torch_dtype) self.moe_allreduce = MoEAllReduce(self.mapping) self.next_layer_layernorm: RMSNorm = None diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 4d2b677d46e..d6ffeac2ca1 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -284,7 +284,7 @@ def __init__( self.mapping = model_config.mapping self.all_reduce = AllReduce( mapping=model_config.mapping, - strategy=model_config.allreduce_backend, + strategy=model_config.allreduce_strategy, ) self.moe_event = [torch.cuda.Event(), torch.cuda.Event()] self.aux_stream = aux_stream @@ -418,7 +418,7 @@ def __init__( self.mapping = model_config.mapping self.all_reduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None @@ -629,7 +629,7 @@ def __init__( quant_config=model_config.get_quant_config(), skip_create_weights_in_init=model_config. 
skip_create_weights_in_init, - ) + allreduce_strategy=model_config.allreduce_strategy) class Eagle3LlamaDecoderLayer(DecoderLayer): diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_nas.py b/tensorrt_llm/_torch/models/modeling_nemotron_nas.py index ef562979543..333f52532aa 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_nas.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_nas.py @@ -44,7 +44,7 @@ def _create_linear_from_configs(model_config: ModelConfig[PretrainedConfig], gather_output=True, quant_config=model_config.get_quant_config(), skip_create_weights_in_init=model_config.skip_create_weights_in_init, - ) + allreduce_strategy=model_config.allreduce_strategy) class NemotronNASAttention(Attention): diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index 90ded9ad9c8..5e6f67a8d42 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -90,7 +90,7 @@ def __init__( self.enable_attention_dp = model_config.mapping.enable_attention_dp self.mapping = model_config.mapping self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.enable_alltoall = Qwen3MoE.should_enable_alltoall( model_config, self.top_k) if self.enable_alltoall: @@ -204,7 +204,7 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig], self.layer_idx = layer_idx self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.next_layer_layernorm: RMSNorm = None self.fusion_config = EagerFusionConfig() diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index cc9031bc288..94574d3f9d7 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -126,7 +126,7 @@ def __init__( weight_mode=WeightMode.FUSED_QKV_LINEAR), quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE], [self.hidden_size]) @@ -140,7 +140,7 @@ def __init__( quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, lora=self.o_lora, - ) + allreduce_strategy=config.allreduce_strategy) self.quant_config = config.get_quant_config() self.attn_backend = config.attn_backend @@ -481,7 +481,8 @@ def __init__( mapping=mapping, tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=quant_config, - skip_create_weights_in_init=config.skip_create_weights_in_init) + skip_create_weights_in_init=config.skip_create_weights_in_init, + allreduce_strategy=config.allreduce_strategy) else: self.fused_a = Linear( hidden_size, @@ -501,7 +502,7 @@ def __init__( tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=quant_config, skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) self.q_b_proj = self.q_proj self.kv_a_layernorm = RMSNorm(hidden_size=kv_lora_rank, @@ -517,7 +518,8 @@ def __init__( mapping=mapping, tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=quant_config, - skip_create_weights_in_init=config.skip_create_weights_in_init) + skip_create_weights_in_init=config.skip_create_weights_in_init, + allreduce_strategy=config.allreduce_strategy) # This parameter will view into 
self.kv_b_proj.weight after loading weights. # For dummy weight initialization, this parameter is initialized with empty tensor. # Used in forward_generation only @@ -538,7 +540,7 @@ def __init__( tensor_parallel_mode=TensorParallelMode.ROW, quant_config=quant_config, skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) def yarn_get_mscale(scale=1, mscale=1): if scale <= 1: diff --git a/tensorrt_llm/_torch/modules/fused_moe.py b/tensorrt_llm/_torch/modules/fused_moe.py index ba21b3eb738..334919050ec 100755 --- a/tensorrt_llm/_torch/modules/fused_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe.py @@ -356,7 +356,7 @@ def __init__( self.parallel_size = self.mapping.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.intermediate_size_per_partition = intermediate_size // self.tp_size @@ -935,7 +935,7 @@ def __init__( self.parallel_size = self.mapping.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py index f6a0e9323f7..f87647ce511 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py @@ -70,7 +70,7 @@ def __init__( self.parallel_size = self.mapping.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py index 3cc73d15dd8..d305a3b763e 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/interface.py +++ b/tensorrt_llm/_torch/modules/fused_moe/interface.py @@ -79,7 +79,7 @@ def __init__( self.intermediate_size_per_partition = intermediate_size // self.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) @abstractmethod def create_weights(self): diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index a727cc93ab9..7fab30e1eee 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -73,7 +73,7 @@ def __init__(self, quant_config=config.get_quant_config(), reduce_output=False, skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H], [self.hidden_size]) @@ -89,7 +89,7 @@ def __init__(self, reduce_output=reduce_output, skip_create_weights_in_init=config.skip_create_weights_in_init, lora=self.down_lora, - ) + allreduce_strategy=config.allreduce_strategy) # These two modules are mutually exclusive - either splitted_gate_up_lora or fused_gate_up_lora will be used, # but never both at the same time. 
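Note: patch 8/9 renames the config field to `allreduce_strategy` and forwards it through every `Linear` and `AllReduce` call site, and the linear.py diff that follows adds the matching `allreduce_strategy` keyword to `Linear` itself. Below is a minimal per-rank sketch of the resulting call pattern; it is illustrative only, the sizes, bias flag, and chosen strategy are placeholders, and the positional argument order simply follows the modeling_opt.py hunk above.

    import torch

    from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode
    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm.mapping import Mapping

    # One rank of a 4-way tensor-parallel group (placeholder values).
    mapping = Mapping(world_size=4, tp_size=4, rank=0)

    # Row-parallel projection; its output all-reduce now uses the configured
    # strategy instead of always defaulting to AUTO.
    down_proj = Linear(11008,  # global in_features (placeholder)
                       4096,   # global out_features (placeholder)
                       bias=False,
                       dtype=torch.bfloat16,
                       mapping=mapping,
                       tensor_parallel_mode=TensorParallelMode.ROW,
                       allreduce_strategy=AllReduceStrategy.NCCL)
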
splitted_gate_up_lora handles gate and up separately while fused_gate_up_lora diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index 48665b7b41e..b97f2ea489b 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -13,7 +13,8 @@ import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils from tensorrt_llm._torch.peft.lora.layer import LoraLayer -from tensorrt_llm.functional import AllReduceFusionOp, AllReduceParams +from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, + AllReduceStrategy) from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig @@ -658,6 +659,7 @@ def __init__( skip_create_weights_in_init: bool = False, use_custom_cublas_mm: bool = False, lora: Optional[LoraLayer] = None, + allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO, ): from ..distributed import AllReduce @@ -695,7 +697,8 @@ def __init__( self.out_features = local_out_features self.all_reduce = AllReduce( - mapping=self.mapping) if reduce_output else None + mapping=self.mapping, + strategy=allreduce_strategy) if reduce_output else None self._weights_created = False self.reduce_output = reduce_output self.use_custom_cublas_mm = use_custom_cublas_mm diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py index 2b9019be6eb..55a21dae991 100644 --- a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py +++ b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py @@ -88,15 +88,14 @@ def __init__( self.is_paged_state = False # in_proj - self.in_proj = Linear( - d_model, - d_in_proj, - bias=bias, - dtype=dtype, - mapping=self.mapping, - tensor_parallel_mode=TensorParallelMode.COLUMN, - quant_config=config.get_quant_config(), - ) + self.in_proj = Linear(d_model, + d_in_proj, + bias=bias, + dtype=dtype, + mapping=self.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + quant_config=config.get_quant_config(), + allreduce_strategy=config.allreduce_strategy) # conv1d, reuse Linear to store weights since it has support for TP > 1 already self.conv1d = Linear( @@ -108,7 +107,7 @@ def __init__( tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) # A self.A = nn.Parameter( @@ -138,15 +137,14 @@ def __init__( ) # out_proj - self.out_proj = Linear( - d_inner, - d_model, - bias=bias, - dtype=dtype, - mapping=self.mapping, - tensor_parallel_mode=TensorParallelMode.ROW, - quant_config=config.get_quant_config(), - ) + self.out_proj = Linear(d_inner, + d_model, + bias=bias, + dtype=dtype, + mapping=self.mapping, + tensor_parallel_mode=TensorParallelMode.ROW, + quant_config=config.get_quant_config(), + allreduce_strategy=config.allreduce_strategy) def forward( self, diff --git a/tensorrt_llm/_torch/modules/mlp.py b/tensorrt_llm/_torch/modules/mlp.py index 8d026e1fa2f..b38da2177bd 100644 --- a/tensorrt_llm/_torch/modules/mlp.py +++ b/tensorrt_llm/_torch/modules/mlp.py @@ -43,7 +43,8 @@ def __init__(self, weight_mode=WeightMode.VANILLA), quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - lora=self.up_lora) + lora=self.up_lora, + allreduce_strategy=config.allreduce_strategy) self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H], [self.hidden_size]) @@ -56,7 +57,8 @@ def __init__(self, tensor_parallel_mode=TensorParallelMode.ROW, 
quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - lora=self.down_lora) + lora=self.down_lora, + allreduce_strategy=config.allreduce_strategy) def forward( self, diff --git a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py index 5245c454be4..3aa6871fbe4 100644 --- a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py @@ -88,7 +88,7 @@ def __init__(self, ) self.allreduce = AllReduce(mapping=self.mapping, - allreduce_backend=self.strategy).cuda() + strategy=self.strategy).cuda() self.input_tensors = [] for i in range(self.world_size): From e4426ac9012e5b091229c5b7cc49d1d6a6b27a34 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 22:53:11 +0000 Subject: [PATCH 9/9] Fix format Signed-off-by: Hui Gao --- tensorrt_llm/_torch/distributed/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 1c8d8023fa4..7c188ec38d0 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -11,7 +11,6 @@ from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, AllReduceStrategy, MoEAllReduceParams) -from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper
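
Taken together, after this series the all-reduce strategy is an explicit, per-model choice: it is set as the `allreduce_strategy` string in LlmArgs, mapped to an `AllReduceStrategy` enum in `ModelConfig.__post_init__` (unknown strings fall back to AUTO), and passed into each `AllReduce` and `Linear` constructor; the FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY environment-variable path is removed. A minimal per-rank usage sketch follows. It is illustrative only: the tensor shapes, eps, and fusion op are placeholder assumptions, and the `AllReduceParams` keyword names simply mirror the attributes read in ops.py. MNNVL is only actually used when the dtype is bfloat16 or float32 and no context parallelism is configured; otherwise the module falls back to the regular kernels.

    import torch

    from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp,
                                                 AllReduceParams)
    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm.mapping import Mapping

    # One rank of an 8-way tensor-parallel group (placeholder values).
    mapping = Mapping(world_size=8, tp_size=8, rank=0)

    allreduce = AllReduce(mapping=mapping,
                          strategy=AllReduceStrategy.MNNVL,
                          dtype=torch.bfloat16)

    hidden = torch.randn(16, 4096, dtype=torch.bfloat16, device="cuda")
    residual = torch.randn_like(hidden)
    gamma = torch.ones(4096, dtype=torch.bfloat16, device="cuda")

    # Fused all-reduce + residual add + RMSNorm; for this fusion op the result
    # is expected to be (normed_output, updated_residual).
    params = AllReduceParams(fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
                             residual=residual,
                             norm_weight=gamma,
                             eps=1e-6)
    output, new_residual = allreduce(hidden, all_reduce_params=params)

    # At the LLM API level the same knob is the string field shown in llm.yaml,
    # e.g. LLM(model=..., allreduce_strategy="TWOSHOT").

Keeping the strategy on ModelConfig rather than in an environment variable makes the choice visible per model instance, which is what the updated unit tests above rely on when they construct AllReduce with an explicit strategy.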