From aaeb67b9d074a05d7384c15a04acfaa6738df12d Mon Sep 17 00:00:00 2001
From: Hui Gao
Date: Mon, 9 Jun 2025 04:48:11 -0700
Subject: [PATCH 1/9] Use backend instead of macro to control enabling MNNVL
 all-reduce
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Hui Gao
---
 tensorrt_llm/_torch/distributed/ops.py        | 121 ++--
 tensorrt_llm/_torch/model_config.py           |   1 +
 .../_torch/models/modeling_deepseekv3.py      |   4 +-
 .../defs/output/gpu_monitoring.csv            | 525 ++++++++++++++++++
 .../defs/output/perf_script_test_results.csv  |   3 +
 .../defs/output/session_properties.csv        |   2 +
 .../qa/trt_llm_release_perf_test.txt          |   1 +
 .../_torch/multi_gpu/test_mnnvl_allreduce.py  |  19 +-
 8 files changed, 624 insertions(+), 52 deletions(-)
 create mode 100644 tests/integration/defs/output/gpu_monitoring.csv
 create mode 100644 tests/integration/defs/output/perf_script_test_results.csv
 create mode 100644 tests/integration/defs/output/session_properties.csv
 create mode 100644 tests/integration/test_lists/qa/trt_llm_release_perf_test.txt

diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py
index 44ab7b1c8dd..085dff61171 100644
--- a/tensorrt_llm/_torch/distributed/ops.py
+++ b/tensorrt_llm/_torch/distributed/ops.py
@@ -307,14 +307,15 @@ def __init__(self, mapping: Mapping, dtype: torch.dtype):
         super().__init__()
         self.mapping = mapping
         self.dtype = dtype
-        self.enable_mnnvl = (os.environ.get("TRTLLM_MNNVL_AR_ENABLED",
-                                            "0") == "1"
-                             and dtype in [torch.bfloat16, torch.float32]
-                             and (not mapping.has_cp()))
+        assert (dtype in MNNVLAllReduce.get_supported_dtype() and not mapping.has_cp()
+                ), "MNNVLAllReduce requires a supported dtype and no context parallelism"

-        if self.enable_mnnvl:
-            self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace(
-                self.mapping, dtype)
+        self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace(
+            self.mapping, dtype)
+
+    @staticmethod
+    def get_supported_dtype():
+        return [torch.bfloat16, torch.float32]

     def forward(
         self,
@@ -330,7 +331,7 @@ def forward(
         Returns:
             Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s)
         """
-        if not self.enable_mnnvl or input.numel() > self.max_num_elements_mnnvl:
+        if input.numel() > self.max_num_elements_mnnvl:
             return None

         fusion_op = all_reduce_params.fusion_op
@@ -368,12 +369,63 @@ def forward(

         return None


+class TLLMAllReduce(nn.Module):
+    """AllReduce implementation backed by the TRT-LLM all-reduce kernels.
+
+    This class owns the workspace allocation and strategy handling for
+    torch.ops.trtllm.allreduce and serves as the fallback path when the MNNVL
+    all-reduce is unavailable or not applicable.
+    """
+
+    def __init__(self,
+                 mapping: Mapping,
+                 strategy: AllReduceStrategy = AllReduceStrategy.AUTO):
+        super().__init__()
+        self.mapping = mapping
+        self.strategy = strategy
+        self.workspace = None
+
+        self.force_low_precision_env = os.environ.get(
+            "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY")
+        # When Strategy is UB, it is guaranteed that the workspace is not used.
+        if self.strategy != AllReduceStrategy.UB:
+            if self.strategy == AllReduceStrategy.LOWPRECISION or self.force_low_precision_env is not None:
+                allocate_low_presicion_allreduce_workspace(self.mapping)
+            self.workspace = get_allreduce_workspace(self.mapping)
+
+    def forward(
+        self,
+        input: torch.Tensor,
+        all_reduce_params: AllReduceParams,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+        """Forward pass for the TRT-LLM all-reduce path.
+
+        Args:
+            input (torch.Tensor): Input tensor to be reduced
+            all_reduce_params (Optional[AllReduceParams]): Parameters for fused operations
+
+        Returns:
+            Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s)
+        """
+        output = torch.ops.trtllm.allreduce(
+            input=input,
+            residual=all_reduce_params.residual,
+            norm_weight=all_reduce_params.norm_weight,
+            scale=all_reduce_params.scale,
+            bias=all_reduce_params.bias,
+            workspace=self.workspace,
+            group=self.mapping.tp_group,
+            strategy=self.strategy,
+            op=all_reduce_params.fusion_op,
+            eps=all_reduce_params.eps,
+        )
+        return output
+
+
 class AllReduce(nn.Module):

     def __init__(self,
                  mapping: Mapping,
                  strategy: AllReduceStrategy = AllReduceStrategy.AUTO,
-                 dtype: Optional[torch.dtype] = None):
+                 dtype: Optional[torch.dtype] = None,
+                 ar_backend: str = "TRTLLM"):
         super().__init__()
         """
         AllReduce is a module that performs an all-reduce operation on a tensor.
@@ -415,23 +467,23 @@ def __init__(self,
             or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY
             when using the AUTO strategy.
         """
+        self.skip_ar = mapping.tp_size == 1
+        self.mnnvl_allreduce = None
+        self._tllm_allreduce = None
+        self._create_allreduce(mapping, ar_backend, strategy, dtype)

-        self.mapping = mapping
-        self.workspace = None
-        self.strategy = strategy
+    def _create_allreduce(self, mapping, backend, strategy, dtype):
+        if mapping.tp_size == 1:
+            return

-        self.force_low_precision_env = os.environ.get(
-            "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY")
-        if self.mapping.tp_size > 1:
-            # When Strategy is UB, it is guaranteed that the workspace is not used.
-            if self.strategy != AllReduceStrategy.UB:
-                if self.strategy == AllReduceStrategy.LOWPRECISION or self.force_low_precision_env is not None:
-                    allocate_low_presicion_allreduce_workspace(self.mapping)
-                self.workspace = get_allreduce_workspace(self.mapping)
+        enable_mnnvl = (backend == "MNNVL"
+                        and (dtype
+                             and dtype in MNNVLAllReduce.get_supported_dtype())
+                        and (not mapping.has_cp()) and mapping.tp_size > 1)
+        if enable_mnnvl:
+            self.mnnvl_allreduce = MNNVLAllReduce(mapping, dtype)

-        # Initialize MNNVL AllReduce if needed
-        self.mnnvl_allreduce = MNNVLAllReduce(mapping,
-                                              dtype) if dtype else None
+        self._tllm_allreduce = TLLMAllReduce(mapping, strategy)

     def forward(
         self,
@@ -460,37 +512,26 @@ def forward(
             RESIDUAL_RMS_NORM_QUANT_FP8: [norm_quant, residual]
             RESIDUAL_RMS_NORM_OUT_QUANT_FP8: [norm, norm_quant, residual]
             RESIDUAL_RMS_NORM_QUANT_NVFP4: [norm_quant_fp4, scale_factor, residual]
             RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual]
         '''
-        if self.mapping.tp_size == 1 or (all_reduce_params is not None
-                                         and all_reduce_params.enable_allreduce
-                                         == False):
+        if self.skip_ar or (all_reduce_params is not None
+                            and all_reduce_params.enable_allreduce == False):
             return input

         if all_reduce_params is None:
             all_reduce_params = AllReduceParams()

-        # Try MNNVL AllReduce first if available
         if self.mnnvl_allreduce:
             mnnvl_output = self.mnnvl_allreduce(
                 input, all_reduce_params=all_reduce_params)
             if mnnvl_output is not None:
                 return mnnvl_output

-        # Fall back to regular AllReduce if MNNVL is not available or not applicable
-        output = torch.ops.trtllm.allreduce(
+        # MNNVL supports only a subset of AllReduceFusionOp; otherwise fall back here.
+        output = self._tllm_allreduce(
             input=input,
-            residual=all_reduce_params.residual,
-            norm_weight=all_reduce_params.norm_weight,
-            scale=all_reduce_params.scale,
-            bias=all_reduce_params.bias,
-            workspace=self.workspace,
-            group=self.mapping.tp_group,
-            strategy=self.strategy,
-            op=all_reduce_params.fusion_op,
-            eps=all_reduce_params.eps,
+            all_reduce_params=all_reduce_params,
         )
-
         return output if len(output) > 1 else output[0]

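Note: the ops.py change above replaces the TRTLLM_MNNVL_AR_ENABLED environment variable with an explicit ar_backend argument on AllReduce. A minimal usage sketch of the new call pattern follows; the Mapping values below are illustrative assumptions, not part of this patch.

    import torch
    from tensorrt_llm.mapping import Mapping
    from tensorrt_llm._torch.distributed.ops import AllReduce

    # Hypothetical tensor-parallel-2 mapping without context parallelism.
    mapping = Mapping(world_size=2, tp_size=2, rank=0)

    # Before this patch, MNNVL all-reduce was opted into via
    #   os.environ["TRTLLM_MNNVL_AR_ENABLED"] = "1"
    # After this patch, it is requested through the backend argument. AllReduce
    # falls back to the TRT-LLM kernels when MNNVL cannot be used (unsupported
    # dtype, context parallelism, tp_size == 1, or an input larger than the
    # MNNVL workspace).
    allreduce = AllReduce(mapping, dtype=torch.bfloat16, ar_backend="MNNVL")
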
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index c2f817c25a2..ba3d359a499 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -77,6 +77,7 @@ class ModelConfig(Generic[TConfig]):

     attn_backend: str = 'TRTLLM'
     moe_backend: str = 'CUTLASS'  # options can be CUTLASS, TRTLLM
+    ar_backend: str = 'TRTLLM'  # options can be MNNVL, TRTLLM

     # If true, enable min-latency mode. Currently only used for Llama4.
     enable_min_latency: bool = False

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index ff22d3717ce..21918ed655c 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -628,7 +628,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
                                         eps=config.rms_norm_eps,
                                         dtype=config.torch_dtype)
         self.layer_idx = layer_idx
-        self.allreduce = AllReduce(self.mapping, dtype=config.torch_dtype)
+        self.allreduce = AllReduce(self.mapping,
+                                   dtype=config.torch_dtype,
+                                   ar_backend=model_config.ar_backend)
         self.moe_allreduce = MoEAllReduce(self.mapping)
         self.next_layer_layernorm: RMSNorm = None

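Note: ModelConfig (above) now carries the all-reduce backend choice, and modeling code such as modeling_deepseekv3.py forwards it to AllReduce unchanged. A hedged sketch of opting a model config into MNNVL, assuming ModelConfig's remaining dataclass fields keep their defaults:

    from tensorrt_llm._torch.model_config import ModelConfig

    # ar_backend defaults to 'TRTLLM'; set it to 'MNNVL' to request the
    # multi-node NVLink all-reduce path in layers that honor the option.
    model_config = ModelConfig(ar_backend="MNNVL")
    assert model_config.ar_backend == "MNNVL"
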
diff --git a/tests/integration/defs/output/gpu_monitoring.csv b/tests/integration/defs/output/gpu_monitoring.csv
new file mode 100644
index 00000000000..f0a9de5818a
--- /dev/null
+++ b/tests/integration/defs/output/gpu_monitoring.csv
@@ -0,0 +1,525 @@
+gpu_id,timestamp,gpu_clock__MHz,memory_clock__MHz,graphics_clock__MHz,gpu_utilization__pct,memory_utilization__pct,encoder_utilization__pct,decoder_utilization__pct,gpu_temperature__C,memory_temperature__C,fan_speed__pct,perf_state,power_draw__W,process_num
+0,2025-05-22 07:19:59.254958,345,1593,345,0,0,0,0,33,,,0,49.336,0
+0,2025-05-22 07:20:00.255244,345,1593,345,0,0,0,0,33,,,0,49.341,0
[... 521 further per-second monitoring rows for GPU 0 (2025-05-22 07:20:01 through 08:00:35) omitted ...]
+0,2025-05-22 08:00:36.302155,375,1593,375,0,0,0,0,39,,,0,80.476,0
diff --git a/tests/integration/defs/output/perf_script_test_results.csv b/tests/integration/defs/output/perf_script_test_results.csv
new file mode 100644
index 00000000000..4c256eadabe
--- /dev/null
+++ b/tests/integration/defs/output/perf_script_test_results.csv
@@ -0,0 +1,3 @@
+network_name,network_hash,sm_clk,mem_clk,gpu_idx,perf_case_name,test_name,original_test_name,raw_result,perf_metric,total_time__sec,start_timestamp,end_timestamp,state,command,threshold,absolute_threshold,metric_type
+"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",1755,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:20:22] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:20:22] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:20:25] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:20:25] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:20:25] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:20:25] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:20:25] [TRT-LLM] [I] Specified dtype 'auto'; inferred dtype 'bfloat16'.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] 
[I] Set nccl_plugin to None.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:20:34] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:21:56] [TRT-LLM] [I] Quantization done. Total time used: 82.46 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:16] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/quantized-checkpoint \nTotal time used 20.49 s.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:22:17] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:22:24] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 6364, GPU 636 (MiB)\n[05/22/2025-07:22:25] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1005, GPU +6, now: CPU 7168, GPU 642 (MiB)\n[05/22/2025-07:22:25] [TRT-LLM] [I] Set nccl_plugin to 
None.\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time of constructing network from module object 8.95593547821045 seconds\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:22:26] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:22:26] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:22:30] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:22:30] [TRT] [I] Global timing cache in use. Profiling results in this builder pass will be stored.\n[05/22/2025-07:22:30] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:22:40] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:40] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:22:43] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:43] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:43] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.11304ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:22:43] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:22:46] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:46] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:47] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:47] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:47] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.4582ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:22:47] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:22:51] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:51] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:51] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:51] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:51] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.58895ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:22:51] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:22:55] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:55] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:56] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:56] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:56] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.70618ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:22:56] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:22:59] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:59] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:00] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:00] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:00] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.7238ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:23:00] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:23:05] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:23:05] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:06] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:06] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.04059ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:23:06] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:23:06] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:23:06] [TRT] [I] Engine generation completed in 36.5777 seconds.\n[05/22/2025-07:23:06] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:23:09] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:42\n[05/22/2025-07:23:09] [TRT] [I] Serialized 4959 bytes of code generator cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 1912332 bytes of compilation cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:23:09] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:23:09] [TRT-LLM] [I] Build phase peak memory: 32810.35 MB, children: 11886.42 MB\n[05/22/2025-07:23:10] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:23:29] [TRT-LLM] [I] Engine serialized. Total time: 00:00:18\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-07:23:48] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:48] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:23:50] [TRT-LLM] [W] Found worker process 94149 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-07:23:50] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:23:50] [TRT-LLM] [W] Logger level already set from environment. 
Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8791 MiB\n[TensorRT-LLM][INFO] Engine load time 3243 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.46 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28322\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.32 GiB for max tokens in paged KV cache (906304).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set layernorm_quantization_plugin 
to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",36.5777,254.122006,2025-05-22 
07:19:59,2025-05-22 07:24:13,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME +"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",375,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:56:21] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:56:21] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:56:23] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:56:23] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:56:23] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. 
Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:56:23] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:56:23] [TRT-LLM] [I] Specified dtype 'auto'; inferred dtype 'bfloat16'.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:56:31] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:57:53] [TRT-LLM] [I] Quantization done. 
Total time used: 82.30 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/quantized-checkpoint \nTotal time used 20.10 s.\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:58:15] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:58:15] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:58:28] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 6151, GPU 636 (MiB)\n[05/22/2025-07:58:29] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1220, GPU +6, now: CPU 7170, GPU 642 (MiB)\n[05/22/2025-07:58:29] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time of constructing network from module object 14.438017129898071 seconds\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:58:29] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:58:29] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:58:33] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:58:33] [TRT] [I] Global timing cache in use. 
Profiling results in this builder pass will be stored.\n[05/22/2025-07:58:33] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:58:43] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:58:43] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:58:45] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:58:45] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:58:45] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 5.97675ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:58:45] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:59:00] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:00] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:01] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:01] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:01] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.52666ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:59:01] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:59:04] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:04] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:05] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:05] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:05] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.61347ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:59:05] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:59:08] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:08] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:09] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:09] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:09] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.88478ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:59:09] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:59:12] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:12] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:13] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:13] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:13] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.79197ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:59:13] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:59:18] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:18] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:18] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:18] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:18] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.09646ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:59:18] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:59:19] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:59:19] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:59:19] [TRT] [I] Engine generation completed in 46.117 seconds.\n[05/22/2025-07:59:19] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:59:21] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:51\n[05/22/2025-07:59:21] [TRT] [I] Serialized 5010 bytes of code generator cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 1654870 bytes of compilation cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:59:21] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:59:21] [TRT-LLM] [I] Build phase peak memory: 32983.74 MB, children: 11888.58 MB\n[05/22/2025-07:59:23] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:59:39] [TRT-LLM] [I] Engine serialized. 
Total time: 00:00:16\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-08:00:05] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:05] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-08:00:07] [TRT-LLM] [W] Found worker process 98701 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-08:00:07] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:07] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8786 MiB\n[TensorRT-LLM][INFO] Engine load time 3261 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.47 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28327\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.33 GiB for max tokens in paged KV cache (906464).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_plugin to 
None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set layernorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: 
/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",46.117,268.112764,2025-05-22 07:56:08,2025-05-22 08:00:36,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME diff --git a/tests/integration/defs/output/session_properties.csv b/tests/integration/defs/output/session_properties.csv new file mode 100644 index 00000000000..b86ff5e75bf --- /dev/null +++ b/tests/integration/defs/output/session_properties.csv @@ -0,0 +1,2 @@ +username,start_timestamp,hostname,ip,nvidia_driver_version,nvidia_device_count,os_properties,cpu_properties,gpu_properties,trt_change_id,trt_branch,commit_timestamp,cuda_version,cublas_version,cudnn_version,end_timestamp +,2025-05-22 07:19:47,ipp2-1606.nvidia.com,10.176.4.8,575.57.05,1,"{'os_name': 'posix', 'platform': 'Linux', 'platform_version': '#144-Ubuntu SMP Fri Feb 7 20:47:38 UTC 2025'}","{'cpu_count': 32, 'cpu_freq': {'current': 1500.167875, 'min': 1500.0, 'max': 3000.0}}","{'device_product_name': 'H100 PCIe', 'pci_device_id': 590418142}",,,,,,,2025-05-22 07:55:34 diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt b/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt new file mode 100644 index 00000000000..57e076ffb02 --- /dev/null +++ b/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt @@ -0,0 +1 @@ +perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8] diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index a2902ede1a8..e26946b1fb0 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import pickle import sys import traceback @@ -97,16 +96,14 @@ def row_linear_residual_norm_fusion_forward( reference_output = tuple(t.cuda() for t in reference_output) MPI.COMM_WORLD.barrier() - os.environ["TRTLLM_MNNVL_AR_ENABLED"] = "1" - - allreduce = AllReduce( - mapping=Mapping( - world_size=tensor_parallel_size, - tp_size=tensor_parallel_size, - rank=tensor_parallel_rank, - ), - dtype=dtype, - ) + + allreduce = AllReduce(mapping=Mapping( + world_size=tensor_parallel_size, + tp_size=tensor_parallel_size, + rank=tensor_parallel_rank, + ), + dtype=dtype, + ar_backend="MNVL") # Since all the modules here are provided by TRT-LLM, # so it has to be fullgraph compatible From 6130848acd42508757545e6ec13c9d77fc0dc2fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hui=20Gao=C3=A2=C2=80?= Date: Mon, 9 Jun 2025 04:48:45 -0700 Subject: [PATCH 2/9] Revert some change in tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hui Gao Signed-off-by: Hui Gao†--- .../defs/output/gpu_monitoring.csv | 525 ------------------ .../defs/output/perf_script_test_results.csv | 3 - .../defs/output/session_properties.csv | 2 - .../qa/trt_llm_release_perf_test.txt | 1 - 4 files changed, 531 deletions(-) delete mode 100644 tests/integration/defs/output/gpu_monitoring.csv delete mode 100644 tests/integration/defs/output/perf_script_test_results.csv delete mode 100644 tests/integration/defs/output/session_properties.csv delete mode 100644 tests/integration/test_lists/qa/trt_llm_release_perf_test.txt diff --git a/tests/integration/defs/output/gpu_monitoring.csv b/tests/integration/defs/output/gpu_monitoring.csv deleted file mode 100644 index f0a9de5818a..00000000000 --- a/tests/integration/defs/output/gpu_monitoring.csv +++ /dev/null @@ -1,525 +0,0 @@ -gpu_id,timestamp,gpu_clock__MHz,memory_clock__MHz,graphics_clock__MHz,gpu_utilization__pct,memory_utilization__pct,encoder_utilization__pct,decoder_utilization__pct,gpu_temperature__C,memory_temperature__C,fan_speed__pct,perf_state,power_draw__W,process_num -0,2025-05-22 07:19:59.254958,345,1593,345,0,0,0,0,33,,,0,49.336,0 -0,2025-05-22 07:20:00.255244,345,1593,345,0,0,0,0,33,,,0,49.341,0 -0,2025-05-22 07:20:01.255586,345,1593,345,0,0,0,0,33,,,0,49.335,0 -0,2025-05-22 07:20:02.255856,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:03.256133,345,1593,345,0,0,0,0,33,,,0,49.338,0 -0,2025-05-22 07:20:04.256400,345,1593,345,0,0,0,0,33,,,0,49.329,0 -0,2025-05-22 07:20:05.256668,345,1593,345,0,0,0,0,33,,,0,49.333,0 -0,2025-05-22 07:20:06.256911,345,1593,345,0,0,0,0,33,,,0,49.335,0 -0,2025-05-22 07:20:07.257181,345,1593,345,0,0,0,0,33,,,0,49.341,0 -0,2025-05-22 07:20:08.257467,345,1593,345,0,0,0,0,33,,,0,49.331,0 -0,2025-05-22 07:20:09.257742,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:10.258030,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:11.258311,345,1593,345,0,0,0,0,33,,,0,49.329,0 -0,2025-05-22 07:20:12.258595,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:13.258881,345,1593,345,0,0,0,0,33,,,0,49.327,0 -0,2025-05-22 07:20:14.259151,345,1593,345,0,0,0,0,33,,,0,49.336,0 -0,2025-05-22 07:20:15.259451,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:16.259675,345,1593,345,0,0,0,0,33,,,0,49.339,0 -0,2025-05-22 07:20:17.259991,345,1593,345,0,0,0,0,33,,,0,49.349,0 -0,2025-05-22 07:20:18.260332,345,1593,345,0,0,0,0,33,,,0,49.343,0 -0,2025-05-22 07:20:19.260653,345,1593,345,0,0,0,0,33,,,0,49.34,0 -0,2025-05-22 07:20:20.260928,345,1593,345,0,0,0,0,33,,,0,49.327,0 
-0,2025-05-22 07:20:21.261204,345,1593,345,0,0,0,0,33,,,0,49.325,0 -0,2025-05-22 07:20:22.261520,345,1593,345,0,0,0,0,33,,,0,49.327,0 -0,2025-05-22 07:20:23.261836,345,1593,345,0,0,0,0,33,,,0,49.329,0 -0,2025-05-22 07:20:24.262109,345,1593,345,0,0,0,0,33,,,0,49.332,0 -0,2025-05-22 07:20:25.262378,345,1593,345,0,0,0,0,33,,,0,49.327,0 -0,2025-05-22 07:20:26.262645,345,1593,345,0,0,0,0,33,,,0,49.334,0 -0,2025-05-22 07:20:27.263164,1755,1593,1755,4,0,0,0,33,,,0,56.903,1 -0,2025-05-22 07:20:28.263556,1755,1593,1755,47,1,0,0,34,,,0,83.141,1 -0,2025-05-22 07:20:29.263933,1755,1593,1755,0,0,0,0,34,,,0,82.042,1 -0,2025-05-22 07:20:30.264296,1755,1593,1755,0,0,0,0,34,,,0,81.62,1 -0,2025-05-22 07:20:31.264652,1755,1593,1755,0,0,0,0,34,,,0,81.685,1 -0,2025-05-22 07:20:32.265029,1755,1593,1755,0,0,0,0,34,,,0,81.517,1 -0,2025-05-22 07:20:33.265392,1755,1593,1755,0,0,0,0,34,,,0,81.539,1 -0,2025-05-22 07:20:34.265751,1755,1593,1755,0,0,0,0,34,,,0,81.602,1 -0,2025-05-22 07:20:35.266153,1755,1593,1755,47,18,0,0,36,,,0,149.079,1 -0,2025-05-22 07:20:36.266565,1755,1593,1755,46,18,0,0,36,,,0,160.12,1 -0,2025-05-22 07:20:37.266972,1755,1593,1755,46,18,0,0,37,,,0,159.799,1 -0,2025-05-22 07:20:38.267387,1755,1593,1755,46,18,0,0,37,,,0,161.614,1 -0,2025-05-22 07:20:39.267802,1755,1593,1755,46,18,0,0,37,,,0,161.828,1 -0,2025-05-22 07:20:40.268223,1755,1593,1755,46,18,0,0,37,,,0,163.822,1 -0,2025-05-22 07:20:41.268630,1755,1593,1755,46,19,0,0,38,,,0,165.912,1 -0,2025-05-22 07:20:42.269043,1755,1593,1755,47,19,0,0,38,,,0,164.7,1 -0,2025-05-22 07:20:43.269479,1755,1593,1755,46,18,0,0,38,,,0,164.466,1 -0,2025-05-22 07:20:44.269904,1755,1593,1755,46,18,0,0,38,,,0,166.363,1 -0,2025-05-22 07:20:45.270322,1755,1593,1755,47,19,0,0,38,,,0,167.174,1 -0,2025-05-22 07:20:46.270754,1755,1593,1755,46,19,0,0,39,,,0,166.63,1 -0,2025-05-22 07:20:47.271183,1755,1593,1755,47,19,0,0,39,,,0,166.363,1 -0,2025-05-22 07:20:48.271607,1755,1593,1755,46,18,0,0,39,,,0,163.311,1 -0,2025-05-22 07:20:49.272020,1755,1593,1755,46,18,0,0,39,,,0,160.703,1 -0,2025-05-22 07:20:50.272437,1755,1593,1755,46,18,0,0,39,,,0,160.035,1 -0,2025-05-22 07:20:51.272861,1755,1593,1755,46,18,0,0,40,,,0,160.304,1 -0,2025-05-22 07:20:52.273307,1755,1593,1755,45,17,0,0,40,,,0,162.585,1 -0,2025-05-22 07:20:53.273747,1755,1593,1755,46,18,0,0,40,,,0,163.577,1 -0,2025-05-22 07:20:54.274167,1755,1593,1755,46,18,0,0,40,,,0,165.493,1 -0,2025-05-22 07:20:55.274583,1755,1593,1755,46,18,0,0,40,,,0,166.608,1 -0,2025-05-22 07:20:56.275023,1755,1593,1755,46,18,0,0,41,,,0,167.712,1 -0,2025-05-22 07:20:57.275448,1755,1593,1755,46,19,0,0,41,,,0,164.796,1 -0,2025-05-22 07:20:58.275873,1755,1593,1755,46,18,0,0,41,,,0,161.867,1 -0,2025-05-22 07:20:59.276240,1755,1593,1755,46,18,0,0,41,,,0,168.464,1 -0,2025-05-22 07:21:00.276665,1755,1593,1755,46,18,0,0,41,,,0,168.308,1 -0,2025-05-22 07:21:01.277080,1755,1593,1755,46,19,0,0,41,,,0,167.946,1 -0,2025-05-22 07:21:02.277514,1755,1593,1755,47,19,0,0,42,,,0,170.932,1 -0,2025-05-22 07:21:03.277944,1755,1593,1755,46,18,0,0,42,,,0,170.862,1 -0,2025-05-22 07:21:04.278368,1755,1593,1755,46,18,0,0,42,,,0,169.522,1 -0,2025-05-22 07:21:05.278789,1755,1593,1755,46,18,0,0,42,,,0,165.573,1 -0,2025-05-22 07:21:06.279217,1755,1593,1755,47,19,0,0,42,,,0,165.344,1 -0,2025-05-22 07:21:07.279646,1755,1593,1755,46,19,0,0,42,,,0,167.941,1 -0,2025-05-22 07:21:08.280085,1755,1593,1755,46,18,0,0,42,,,0,166.655,1 -0,2025-05-22 07:21:09.280498,1755,1593,1755,46,18,0,0,43,,,0,165.308,1 -0,2025-05-22 07:21:10.280920,1755,1593,1755,47,19,0,0,43,,,0,168.2,1 -0,2025-05-22 
07:21:11.281342,1755,1593,1755,46,18,0,0,43,,,0,166.143,1 -0,2025-05-22 07:21:12.281782,1755,1593,1755,46,18,0,0,43,,,0,164.653,1 -0,2025-05-22 07:21:13.282199,1755,1593,1755,46,36,0,0,43,,,0,165.197,1 -0,2025-05-22 07:21:14.282624,1755,1593,1755,46,18,0,0,44,,,0,165.117,1 -0,2025-05-22 07:21:15.283055,1755,1593,1755,46,18,0,0,43,,,0,164.62,1 -0,2025-05-22 07:21:16.283479,1755,1593,1755,46,18,0,0,44,,,0,165.582,1 -0,2025-05-22 07:21:17.283906,1755,1593,1755,47,18,0,0,44,,,0,168.788,1 -0,2025-05-22 07:21:18.284331,1755,1593,1755,47,18,0,0,44,,,0,166.465,1 -0,2025-05-22 07:21:19.284757,1755,1593,1755,45,18,0,0,44,,,0,163.746,1 -0,2025-05-22 07:21:20.285181,1755,1593,1755,45,18,0,0,44,,,0,163.653,1 -0,2025-05-22 07:21:21.285625,1755,1593,1755,45,18,0,0,44,,,0,163.048,1 -0,2025-05-22 07:21:22.286048,1755,1593,1755,46,18,0,0,44,,,0,162.94,1 -0,2025-05-22 07:21:23.286485,1755,1593,1755,47,19,0,0,44,,,0,163.415,1 -0,2025-05-22 07:21:24.286905,1755,1593,1755,46,18,0,0,44,,,0,164.032,1 -0,2025-05-22 07:21:25.287338,1755,1593,1755,46,18,0,0,45,,,0,163.911,1 -0,2025-05-22 07:21:26.287772,1755,1593,1755,46,18,0,0,45,,,0,164.336,1 -0,2025-05-22 07:21:27.288204,1755,1593,1755,47,18,0,0,45,,,0,165.044,1 -0,2025-05-22 07:21:28.288625,1755,1593,1755,46,18,0,0,45,,,0,168.746,1 -0,2025-05-22 07:21:29.289053,1755,1593,1755,46,18,0,0,45,,,0,172.765,1 -0,2025-05-22 07:21:30.289496,1755,1593,1755,46,18,0,0,45,,,0,171.735,1 -0,2025-05-22 07:21:31.289927,1755,1593,1755,46,18,0,0,45,,,0,170.906,1 -0,2025-05-22 07:21:32.290358,1755,1593,1755,46,18,0,0,45,,,0,170.166,1 -0,2025-05-22 07:21:33.290777,1755,1593,1755,47,18,0,0,45,,,0,167.227,1 -0,2025-05-22 07:21:34.291194,1755,1593,1755,46,18,0,0,46,,,0,163.288,1 -0,2025-05-22 07:21:35.291620,1755,1593,1755,47,18,0,0,46,,,0,163.8,1 -0,2025-05-22 07:21:36.292050,1755,1593,1755,47,19,0,0,46,,,0,164.799,1 -0,2025-05-22 07:21:37.292474,1755,1593,1755,47,19,0,0,46,,,0,168.345,1 -0,2025-05-22 07:21:38.292900,1755,1593,1755,46,18,0,0,46,,,0,169.427,1 -0,2025-05-22 07:21:39.293340,1755,1593,1755,47,18,0,0,46,,,0,168.9,1 -0,2025-05-22 07:21:40.293802,1755,1593,1755,47,19,0,0,46,,,0,169.208,1 -0,2025-05-22 07:21:41.294219,1755,1593,1755,47,19,0,0,46,,,0,168.596,1 -0,2025-05-22 07:21:42.294645,1755,1593,1755,46,18,0,0,46,,,0,166.093,1 -0,2025-05-22 07:21:43.295066,1755,1593,1755,47,18,0,0,46,,,0,169.899,1 -0,2025-05-22 07:21:44.295498,1755,1593,1755,47,19,0,0,46,,,0,171.042,1 -0,2025-05-22 07:21:45.295924,1755,1593,1755,47,18,0,0,47,,,0,172.313,1 -0,2025-05-22 07:21:46.296353,1755,1593,1755,46,18,0,0,47,,,0,171.179,1 -0,2025-05-22 07:21:47.296778,1755,1593,1755,46,18,0,0,47,,,0,173.428,1 -0,2025-05-22 07:21:48.297203,1755,1593,1755,46,18,0,0,47,,,0,172.265,1 -0,2025-05-22 07:21:49.297592,1755,1593,1755,46,18,0,0,47,,,0,169.976,1 -0,2025-05-22 07:21:50.298010,1755,1593,1755,46,18,0,0,47,,,0,167.299,1 -0,2025-05-22 07:21:51.298436,1755,1593,1755,46,18,0,0,47,,,0,169.135,1 -0,2025-05-22 07:21:52.298858,1755,1593,1755,46,18,0,0,47,,,0,168.709,1 -0,2025-05-22 07:21:53.299286,1755,1593,1755,47,18,0,0,47,,,0,172.096,1 -0,2025-05-22 07:21:54.299709,1755,1593,1755,47,18,0,0,47,,,0,169.99,1 -0,2025-05-22 07:21:55.300131,1755,1593,1755,46,18,0,0,47,,,0,170.417,1 -0,2025-05-22 07:21:56.300561,1755,1593,1755,44,17,0,0,47,,,0,168.859,1 -0,2025-05-22 07:21:57.300950,1755,1593,1755,21,2,0,0,46,,,0,110.054,1 -0,2025-05-22 07:21:58.301328,1755,1593,1755,22,2,0,0,46,,,0,95.543,1 -0,2025-05-22 07:21:59.301714,1755,1593,1755,0,0,0,0,45,,,0,90.746,1 -0,2025-05-22 
[... several hundred removed per-second GPU monitoring rows from gpu_monitoring.csv ("-0,<timestamp>,<clocks>,<utilization>,<temperature>,<power>,..." samples recorded on 2025-05-22 between 07:22 and 08:00) elided for brevity ...]
diff --git a/tests/integration/defs/output/perf_script_test_results.csv b/tests/integration/defs/output/perf_script_test_results.csv
deleted file mode 100644
index 4c256eadabe..00000000000
--- a/tests/integration/defs/output/perf_script_test_results.csv
+++ 
/dev/null @@ -1,3 +0,0 @@ -network_name,network_hash,sm_clk,mem_clk,gpu_idx,perf_case_name,test_name,original_test_name,raw_result,perf_metric,total_time__sec,start_timestamp,end_timestamp,state,command,threshold,absolute_threshold,metric_type -"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",1755,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:20:22] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:20:22] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:20:25] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:20:25] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:20:25] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:20:25] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:20:25] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:20:25] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:20:25] [TRT-LLM] [I] Specified dtype 'auto'; inferred 
dtype 'bfloat16'.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:20:25] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:20:25] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:20:28] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:20:34] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:21:56] [TRT-LLM] [I] Quantization done. Total time used: 82.46 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:21:58] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:16] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/quantized-checkpoint \nTotal time used 20.49 s.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:22:17] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:22:17] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:22:17] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:22:17] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:22:17] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:22:24] [TRT] [I] [MemUsageChange] 
Init CUDA: CPU -2, GPU +0, now: CPU 6364, GPU 636 (MiB)\n[05/22/2025-07:22:25] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1005, GPU +6, now: CPU 7168, GPU 642 (MiB)\n[05/22/2025-07:22:25] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time of constructing network from module object 8.95593547821045 seconds\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:22:26] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:22:26] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:22:26] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:22:30] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:22:30] [TRT] [I] Global timing cache in use. Profiling results in this builder pass will be stored.\n[05/22/2025-07:22:30] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:22:40] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:40] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:22:43] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:43] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:43] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:43] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.11304ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:22:43] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:22:46] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:46] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:47] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:47] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:47] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:47] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.4582ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:22:47] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:22:51] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:51] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:51] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:51] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:51] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:22:51] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.58895ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:22:51] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:22:55] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:55] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:22:56] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:22:56] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:22:56] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:22:56] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.70618ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:22:56] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:22:59] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:22:59] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:00] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:00] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:00] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:23:00] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.7238ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:23:00] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:23:05] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:23:05] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:23:06] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:23:06] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:23:06] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.04059ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:23:06] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:23:06] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:23:06] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:23:06] [TRT] [I] Engine generation completed in 36.5777 seconds.\n[05/22/2025-07:23:06] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:23:09] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:42\n[05/22/2025-07:23:09] [TRT] [I] Serialized 4959 bytes of code generator cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 1912332 bytes of compilation cache.\n[05/22/2025-07:23:09] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:23:09] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:23:09] [TRT-LLM] [I] Build phase peak memory: 32810.35 MB, children: 11886.42 MB\n[05/22/2025-07:23:10] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmp251svz2n-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:23:29] [TRT-LLM] [I] Engine serialized. Total time: 00:00:18\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:23:31] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-07:23:48] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:48] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:23:50] [TRT-LLM] [W] Found worker process 94149 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-07:23:50] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:23:50] [TRT-LLM] [W] Logger level already set from environment. 
Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8791 MiB\n[TensorRT-LLM][INFO] Engine load time 3243 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.46 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28322\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.32 GiB for max tokens in paged KV cache (906304).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:23:56] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set layernorm_quantization_plugin 
to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-07:23:56] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-07:24:09] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",36.5777,254.122006,2025-05-22 
07:19:59,2025-05-22 07:24:13,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-19-47/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME -"llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]",375,1593,0,"H100/perf/test_perf.py::test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","test_perf_metric_build_time[llama_v3.1_8b-bench-bfloat16-maxbs:512-maxnt:2048-input_output_len:128,128-quant:fp8]","H100/perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8]","\n[05/22/2025-07:56:21] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-07:56:21] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-07:56:23] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: verbose\n[05/22/2025-07:56:23] [TRT-LLM] [I] Found dataset.\n[05/22/2025-07:56:23] [TRT-LLM] [I] \n===========================================================\n= DATASET DETAILS\n===========================================================\nDataset Path: None\nNumber of Sequences: 512\n\n-- Percentiles statistics ---------------------------------\n\n Input Output Seq. 
Length\n-----------------------------------------------------------\nMIN: 128.0000 128.0000 256.0000\nMAX: 128.0000 128.0000 256.0000\nAVG: 128.0000 128.0000 256.0000\nP50: 128.0000 128.0000 256.0000\nP90: 128.0000 128.0000 256.0000\nP95: 128.0000 128.0000 256.0000\nP99: 128.0000 128.0000 256.0000\n===========================================================\n\n[05/22/2025-07:56:23] [TRT-LLM] [I] Max batch size and max num tokens are not provided, use tuning heuristics or pre-defined setting from trtllm-bench.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated engine size: 7.48 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total available memory for KV cache: 72.17 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated total KV cache memory: 68.56 GB\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max number of requests in KV cache memory: 4387.86\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max batch size (after fine-tune): 4096\n[05/22/2025-07:56:23] [TRT-LLM] [I] Estimated max num tokens (after fine-tune): 8192\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_input_len (annotation=int required=False default=1024 description='The maximum input length.') with build_config.max_input_len (1024).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_seq_len (annotation=Union[int, NoneType] required=False default=None description='The maximum sequence length.') with build_config.max_seq_len (256).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Overriding LlmArgsBase.max_beam_width (annotation=int required=False default=1 description='The maximum beam width.') with build_config.max_beam_width (1).\n[05/22/2025-07:56:23] [TRT-LLM] [W] Using default gpus_per_node: 1\n[05/22/2025-07:56:23] [TRT-LLM] [I] Specified dtype 'auto'; inferred dtype 'bfloat16'.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:56:23] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:56:23] [TRT-LLM] [I] Initializing model from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Initializing tokenizer from /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\n[05/22/2025-07:56:25] [TRT-LLM] [I] Loading calibration dataset\n[05/22/2025-07:56:31] [TRT-LLM] [I] Starting quantization...\nRegistered for KV Cache quantization\nInserted 771 quantizers\n[05/22/2025-07:57:53] [TRT-LLM] [I] Quantization done. 
Total time used: 82.30 s.\ncurrent rank: 0, tp rank: 0, pp rank: 0\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:57:55] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [I] Quantized model exported to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/quantized-checkpoint \nTotal time used 20.10 s.\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-07:58:14] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-07:58:15] [TRT-LLM] [W] Overriding paged_state to False\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-07:58:15] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-07:58:15] [TRT-LLM] [W] max_input_len is 1024 is larger than max_seq_len 256, clipping it to max_seq_len\n[05/22/2025-07:58:15] [TRT-LLM] [W] padding removal and fMHA are both enabled, max_input_len is not required and will be ignored\n[05/22/2025-07:58:28] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 6151, GPU 636 (MiB)\n[05/22/2025-07:58:29] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1220, GPU +6, now: CPU 7170, GPU 642 (MiB)\n[05/22/2025-07:58:29] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time of constructing network from module object 14.438017129898071 seconds\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total optimization profiles added: 6\n[05/22/2025-07:58:29] [TRT-LLM] [I] Total time to initialize the weights in network Unnamed Network 0: 00:00:00\n[05/22/2025-07:58:29] [TRT-LLM] [I] Build TensorRT engine Unnamed Network 0\n[05/22/2025-07:58:29] [TRT] [W] Unused Input: position_ids\n[05/22/2025-07:58:33] [TRT] [W] [RemoveDeadLayers] Input Tensor position_ids is unused or used only at compile-time, but is not being removed.\n[05/22/2025-07:58:33] [TRT] [I] Global timing cache in use. 
Profiling results in this builder pass will be stored.\n[05/22/2025-07:58:33] [TRT] [I] Compiler backend is used during engine build.\n[05/22/2025-07:58:43] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:58:43] [TRT] [I] Detected 17 inputs and 1 output network tensors.\nXQA Macros: USE_INPUT_KV=1 INPUT_FP16=0 CACHE_ELEM_ENUM=2 HEAD_GRP_SIZE=4 __FORCE_INCLUDE_CUDA_FP16_HPP_FROM_FP16_H__=1 ROPE_STYLE=1 LOW_PREC_OUTPUT=1 DTYPE=__nv_bfloat16 GENERATE_CUBIN=1 HEAD_ELEMS=128 NDEBUG=1 BEAM_WIDTH=1 TOKENS_PER_PAGE=32 M_TILESIZE=4 __FORCE_INCLUDE_CUDA_BF16_HPP_FROM_BF16_H__=1 USE_CUSTOM_BARRIER=1 SLIDING_WINDOW=1 \n[05/22/2025-07:58:45] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:58:45] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:58:45] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:58:45] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 5.97675ms to assign 21 blocks to 175 nodes requiring 140780032 bytes.\n[05/22/2025-07:58:45] [TRT] [I] Total Activation Memory: 140777984 bytes\n[05/22/2025-07:59:00] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:00] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:01] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:01] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:01] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:01] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.52666ms to assign 21 blocks to 175 nodes requiring 143139328 bytes.\n[05/22/2025-07:59:01] [TRT] [I] Total Activation Memory: 143139328 bytes\n[05/22/2025-07:59:04] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:04] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:05] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:05] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:05] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:05] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.61347ms to assign 21 blocks to 175 nodes requiring 147857920 bytes.\n[05/22/2025-07:59:05] [TRT] [I] Total Activation Memory: 147857920 bytes\n[05/22/2025-07:59:08] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:08] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:09] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:09] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:09] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Started assigning block shifts. 
This will take 175 steps to complete.\n[05/22/2025-07:59:09] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.88478ms to assign 21 blocks to 175 nodes requiring 157295104 bytes.\n[05/22/2025-07:59:09] [TRT] [I] Total Activation Memory: 157295104 bytes\n[05/22/2025-07:59:12] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:12] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:13] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:13] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:13] [TRT] [I] Max Scratch Memory: 138412032 bytes\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:13] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 6.79197ms to assign 21 blocks to 175 nodes requiring 176169472 bytes.\n[05/22/2025-07:59:13] [TRT] [I] Total Activation Memory: 176169472 bytes\n[05/22/2025-07:59:18] [TRT] [I] [GraphReduction] The approximate region cut reduction algorithm is called.\n[05/22/2025-07:59:18] [TRT] [I] Detected 17 inputs and 1 output network tensors.\n[05/22/2025-07:59:18] [TRT] [I] Total Host Persistent Memory: 67152 bytes\n[05/22/2025-07:59:18] [TRT] [I] Total Device Persistent Memory: 0 bytes\n[05/22/2025-07:59:18] [TRT] [I] Max Scratch Memory: 754974720 bytes\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 175 steps to complete.\n[05/22/2025-07:59:18] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 7.09646ms to assign 21 blocks to 175 nodes requiring 1056973312 bytes.\n[05/22/2025-07:59:18] [TRT] [I] Total Activation Memory: 1056973312 bytes\n[05/22/2025-07:59:19] [TRT] [I] Total Weights Memory: 9148379652 bytes\n[05/22/2025-07:59:19] [TRT] [I] Compiler backend is used during engine execution.\n[05/22/2025-07:59:19] [TRT] [I] Engine generation completed in 46.117 seconds.\n[05/22/2025-07:59:19] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 8725 MiB\n[05/22/2025-07:59:21] [TRT-LLM] [I] Total time of building Unnamed Network 0: 00:00:51\n[05/22/2025-07:59:21] [TRT] [I] Serialized 5010 bytes of code generator cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 1654870 bytes of compilation cache.\n[05/22/2025-07:59:21] [TRT] [I] Serialized 9 timing cache entries\n[05/22/2025-07:59:21] [TRT-LLM] [I] Timing cache serialized to model.cache\n[05/22/2025-07:59:21] [TRT-LLM] [I] Build phase peak memory: 32983.74 MB, children: 11888.58 MB\n[05/22/2025-07:59:23] [TRT-LLM] [I] Serializing engine to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/tmplbgae_lk-llm-workspace/tmp.engine/rank0.engine...\n[05/22/2025-07:59:39] [TRT-LLM] [I] Engine serialized. 
Total time: 00:00:16\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_request_error_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_result_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_stats_queue\n[05/22/2025-07:59:40] [TRT-LLM] [I] Generating a new HMAC key for server proxy_kv_cache_events_queue\n[05/22/2025-08:00:05] [TRT-LLM] [I] Starting TensorRT-LLM init.\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:05] [TRT-LLM] [I] TensorRT-LLM inited.\n[TensorRT-LLM] TensorRT-LLM version: 0.21.0rc0\n[05/22/2025-08:00:07] [TRT-LLM] [W] Found worker process 98701 was bound to {2, 18}, this may harmperformance.\n[05/22/2025-08:00:07] [TRT-LLM] [W] Will clear the cpu affinity\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:07] [TRT-LLM] [W] Logger level already set from environment. Discard new verbosity: info\n[TensorRT-LLM][INFO] Engine version 0.21.0rc0 found in the config file, assuming engine(s) built by new builder API.\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0\n[TensorRT-LLM][INFO] Rank 0 is using GPU 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 4096\n[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1\n[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 256\n[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0\n[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: (256) * 32\n[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0\n[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 0\n[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 8192\n[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 255 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled\n[TensorRT-LLM][INFO] TRTGptModel If model type is encoder, maxInputLen would be reset in trtEncoderModel to maxInputLen: min(maxSequenceLen, maxNumTokens).\n[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT\n[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None\n[TensorRT-LLM][INFO] Loaded engine size: 8786 MiB\n[TensorRT-LLM][INFO] Engine load time 3261 ms\n[TensorRT-LLM][INFO] Inspecting the engine to identify potential runtime issues...\n[TensorRT-LLM][INFO] The profiling verbosity of the engine does not allow this analysis to proceed. Re-build the engine with 'detailed' profiling verbosity to get more diagnostics.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1008.01 MiB for execution context memory.\n[TensorRT-LLM][INFO] gatherContextLogits: 0\n[TensorRT-LLM][INFO] gatherGenerationLogits: 0\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 1. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 2. 
Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 3. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 4. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 8724 (MiB)\n[TensorRT-LLM][INFO] Switching optimization profile from: 0 to 5. Please ensure there are no enqueued operations pending in this context prior to switching profiles\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 1022.30 MB GPU memory for runtime buffers.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 5.10 GB GPU memory for decoder.\n[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.18 GiB, available: 61.47 GiB, extraCostMemory: 0.00 GiB\n[TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 28327\n[TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true\n[TensorRT-LLM][INFO] before Create KVCacheManager cacheTransPreAllocaSize:0\n[TensorRT-LLM][INFO] Max KV cache pages per sequence: 8 [window size=256]\n[TensorRT-LLM][INFO] Number of tokens per block: 32.\n[TensorRT-LLM][INFO] [MemUsageChange] Allocated 55.33 GiB for max tokens in paged KV cache (906464).\n[TensorRT-LLM][INFO] Set logger level to INFO\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.fc_after_embed = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_input_layernorm_in_first_layer = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.use_last_layernorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.layer_idx_offset = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.has_partial_lora_mask = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.producer = {'name': 'modelopt', 'version': '0.29.0'}\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.share_embedding_table = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rotary_pct = 1.0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rank = 0\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.decoder = llama\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.rmsnorm = True\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.lm_head_bias = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.tie_word_embeddings = False\n[05/22/2025-08:00:12] [TRT-LLM] [W] Implicitly setting LLaMAConfig.model_type = llama\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dtype to bfloat16.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gpt_attention_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_plugin to 
None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set explicitly_disable_gemm_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fp8_rowwise_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set qserve_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set identity_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set nccl_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set lora_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set dora_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_groupwise_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set weight_only_quant_matmul_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_plugins to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set smooth_quant_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set layernorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set rmsnorm_quantization_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_per_token_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set quantize_tensor_plugin to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set moe_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set mamba_conv1d_plugin to auto.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set low_latency_gemm_swiglu_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set gemm_allreduce_plugin to None.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set bert_context_fmha_fp32_acc to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_kv_cache to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set remove_input_padding to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set norm_quant_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set reduce_fusion to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set user_buffer to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set tokens_per_block to 32.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_paged_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fp8_context_fmha to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set fuse_fp4_quant to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set multiple_profiles to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set paged_state to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set streamingllm to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set manage_weights to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set use_fused_mlp to True.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Set pp_reduce_scatter to False.\n[05/22/2025-08:00:12] [TRT-LLM] [I] Save model to /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n[TensorRT-LLM][INFO] Refreshed the MPI local session\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n===========================================================\n= ENGINE BUILD INFO\n===========================================================\nModel Name: meta-llama/Llama-3.1-8B\nModel Path: /scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B\nWorkspace Directory: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8\nEngine Directory: 
/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n\n===========================================================\n= ENGINE CONFIGURATION DETAILS\n===========================================================\nMax Sequence Length: 256\nMax Batch Size: 4096\nMax Num Tokens: 8192\nQuantization: FP8\nKV Cache Dtype: FP8\n===========================================================\n\n[05/22/2025-08:00:31] [TRT-LLM] [I] \n\n===========================================================\nENGINE SAVED: /home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/meta-llama/Llama-3.1-8B/tp_1_pp_1\n===========================================================\n\n",46.117,268.112764,2025-05-22 07:56:08,2025-05-22 08:00:36,valid, trtllm-bench --log_level=verbose --workspace=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8 --model=meta-llama/Llama-3.1-8B --model_path=/scratch.trt_llm_data/llm-models/llama-3.1-model/Meta-Llama-3.1-8B build --dataset=/home/scratch.huig_gpu/TensorRT-LLM/tests/integration/defs/llm-test-workspace/ws-2025-05-22-07-56-01/perf_engines/llama_v3.1_8b-bench-bfloat16-input_output_len_128_128-quant_fp8/synthetic_data.json --tp_size=1 --pp_size=1 --max_seq_len=256 --quantization=FP8,0.1,30,BUILD_TIME diff --git a/tests/integration/defs/output/session_properties.csv b/tests/integration/defs/output/session_properties.csv deleted file mode 100644 index b86ff5e75bf..00000000000 --- a/tests/integration/defs/output/session_properties.csv +++ /dev/null @@ -1,2 +0,0 @@ -username,start_timestamp,hostname,ip,nvidia_driver_version,nvidia_device_count,os_properties,cpu_properties,gpu_properties,trt_change_id,trt_branch,commit_timestamp,cuda_version,cublas_version,cudnn_version,end_timestamp -,2025-05-22 07:19:47,ipp2-1606.nvidia.com,10.176.4.8,575.57.05,1,"{'os_name': 'posix', 'platform': 'Linux', 'platform_version': '#144-Ubuntu SMP Fri Feb 7 20:47:38 UTC 2025'}","{'cpu_count': 32, 'cpu_freq': {'current': 1500.167875, 'min': 1500.0, 'max': 3000.0}}","{'device_product_name': 'H100 PCIe', 'pci_device_id': 590418142}",,,,,,,2025-05-22 07:55:34 diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt b/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt deleted file mode 100644 index 57e076ffb02..00000000000 --- a/tests/integration/test_lists/qa/trt_llm_release_perf_test.txt +++ /dev/null @@ -1 +0,0 @@ -perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8] From 225c1b0f9ea347f29fb7065d7471b6248c429eaf Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Mon, 26 May 2025 01:13:20 +0000 Subject: [PATCH 3/9] Fix MNNVL name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hui Gao Signed-off-by: Hui Gao†--- tensorrt_llm/_torch/distributed/ops.py | 24 +++++++++++++++++------- tensorrt_llm/_torch/model_config.py | 2 +- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 085dff61171..5155e423d4f 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -307,8 
+307,10 @@ def __init__(self, mapping: Mapping, dtype: torch.dtype): super().__init__() self.mapping = mapping self.dtype = dtype - assert (dtype in MNNVLAllReduce.get_supported_dtype() - and (not mapping.has_cp())), "" + assert ( + dtype in MNNVLAllReduce.get_supported_dtype() + and (not mapping.has_cp()) + ), f"MNNVL all reduce only supports dtype {MNNVLAllReduce.get_supported_dtype()} and requires no cp." self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace( self.mapping, dtype) @@ -331,6 +333,9 @@ def forward( Returns: Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) """ + if self.mapping == 1: + return input + if input.numel() > self.max_num_elements_mnnvl: return None @@ -376,7 +381,9 @@ class TLLMAllReduce(nn.Module): for certain operations when using NVLink for multi-node communication. """ - def __init__(self, mapping: Mapping, strategy: AllReduceStrategy = AllReduceStrategy.AUTO): + def __init__(self, + mapping: Mapping, + strategy: AllReduceStrategy = AllReduceStrategy.AUTO): super().__init__() self.mapping = mapping self.strategy = strategy @@ -404,6 +411,9 @@ def forward( Returns: Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) """ + if self.mapping == 1: + return input + output = torch.ops.trtllm.allreduce( input=input, residual=all_reduce_params.residual, @@ -467,7 +477,7 @@ def __init__(self, or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY when using the AUTO strategy. """ - self.skip_ar = self.mapping.tp_size == 1 + self.skip_ar = mapping.tp_size == 1 self._mnvl_allreduce = None self._tllm_allreduce = None self._create_allreduce(mapping, ar_backend, strategy, dtype) @@ -521,13 +531,13 @@ def forward( if all_reduce_params is None: all_reduce_params = AllReduceParams() - if self.mnnvl_allreduce: - mnnvl_output = self.mnnvl_allreduce( + if self._mnvl_allreduce: + mnnvl_output = self._mnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output - # MNVL only support part of AllReduceFusionOp provided in params. + # MNNVL only support part of AllReduceFusionOp provided in params. output = self._tllm_allreduce( input=input, all_reduce_params=all_reduce_params, diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index ba3d359a499..e5b5ce048e8 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -77,7 +77,7 @@ class ModelConfig(Generic[TConfig]): attn_backend: str = 'TRTLLM' moe_backend: str = 'CUTLASS' # options can be CUTLASS, TRTLLM - ar_backend: str = 'TRTLLM' # options can be MNVL, TRTLLM + ar_backend: str = 'TRTLLM' # options can be MNNVL, TRTLLM # If true, enable min-latency mode. Currently only used for Llama4.
enable_min_latency: bool = False From d5b372cb53d320c642d83c90f03a5635e5064762 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 04:33:43 +0000 Subject: [PATCH 4/9] Address comments Signed-off-by: Hui Gao --- tensorrt_llm/_torch/distributed/ops.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 5155e423d4f..43b15a4d9f8 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -10,7 +10,12 @@ from tensorrt_llm._utils import mpi_barrier from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, +<<<<<<< HEAD AllReduceStrategy, MoEAllReduceParams) +======= + AllReduceStrategy) +from tensorrt_llm.logger import logger +>>>>>>> 7f3955b17 (Address comments) from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper @@ -478,7 +483,7 @@ def __init__(self, the AUTO strategy. """ self.skip_ar = mapping.tp_size == 1 - self._mnvl_allreduce = None + self._mnnvl_allreduce = None self._tllm_allreduce = None self._create_allreduce(mapping, ar_backend, strategy, dtype) @@ -491,7 +496,7 @@ def _create_allreduce(self, mapping, backend, strategy, dtype): and dtype in MNNVLAllReduce.get_supported_dtype()) and (not mapping.has_cp()) and mapping.tp_size > 1) if enable_mnnvl: - self._mnvl_allreduce = MNNVLAllReduce(mapping, dtype) + self._mnnvl_allreduce = MNNVLAllReduce(mapping, dtype) self._tllm_allreduce = TLLMAllReduce(mapping, strategy) @@ -522,7 +527,7 @@ def forward( RESIDUAL_RMS_NORM_QUANT_FP8: [norm_quant, residual] RESIDUAL_RMS_NORM_OUT_QUANT_FP8: [norm, norm_quant, residual] RESIDUAL_RMS_NORM_QUANT_NVFP4: [norm_quant_fp4, scale_factor, residual] - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual]P + RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual] ''' if self.skip_ar or (all_reduce_params is not None and all_reduce_params.enable_allreduce == False): @@ -531,11 +536,12 @@ def forward( if all_reduce_params is None: all_reduce_params = AllReduceParams() - if self._mnvl_allreduce: - mnnvl_output = self._mnvl_allreduce( + if self._mnnvl_allreduce: + mnnvl_output = self._mnnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output + logger.info(f"Fallback to tllm_allreduce.") # MNNVL only support part of AllReduceFusionOp provided in params. 
output = self._tllm_allreduce( From bd1183d4c7b76ecdfafd1eb2228e6f5e33277da2 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 04:35:17 +0000 Subject: [PATCH 5/9] Add strategy support in extra llm api config Signed-off-by: Hui Gao --- cpp/tensorrt_llm/thop/allreduceOp.cpp | 9 +- tensorrt_llm/_torch/distributed/ops.py | 165 +- tensorrt_llm/_torch/model_config.py | 21 +- .../_torch/models/modeling_deepseekv3.py | 7 +- tensorrt_llm/_torch/models/modeling_llama.py | 4 +- .../_torch/models/modeling_qwen3_moe.py | 4 +- tensorrt_llm/_torch/modules/fused_moe.py | 2513 +++++++++++++++++ .../modules/fused_moe/fused_moe_vanilla.py | 3 +- .../_torch/modules/fused_moe/interface.py | 3 +- tensorrt_llm/_torch/modules/linear.py | 4 +- tensorrt_llm/_torch/pyexecutor/config.py | 1 + tensorrt_llm/functional.py | 1 + .../_torch/multi_gpu/test_allreduce.py | 2 +- .../multi_gpu/test_lowprecision_allreduce.py | 6 +- .../_torch/multi_gpu/test_mnnvl_allreduce.py | 18 +- .../_torch/multi_gpu/test_user_buffers.py | 16 +- 16 files changed, 2636 insertions(+), 141 deletions(-) create mode 100755 tensorrt_llm/_torch/modules/fused_moe.py diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp index 25af1222aa6..d86a841fab9 100644 --- a/cpp/tensorrt_llm/thop/allreduceOp.cpp +++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp @@ -621,14 +621,12 @@ class AllreduceOp AllReduceStrategyType getRuntimeStrategy(size_t seq_len, size_t size) { - static char* force_nccl_all_reduce_strategy_char = std::getenv("FORCE_NCCL_ALL_REDUCE_STRATEGY"); - bool force_nccl_all_reduce_strategy = (force_nccl_all_reduce_strategy_char != nullptr); AllReduceStrategyType runtime_strategy; if (mStrategy == AllReduceStrategyType::UB) { runtime_strategy = AllReduceStrategyType::UB; } - else if (force_nccl_all_reduce_strategy || mStrategy == AllReduceStrategyType::NCCL) + else if (mStrategy == AllReduceStrategyType::NCCL) { runtime_strategy = AllReduceStrategyType::NCCL; } @@ -936,10 +934,7 @@ class AllreduceOp bool isUsingLowPrecision(size_t message_size) const noexcept { - static char* force_low_precision_allreduce_strategy_char - = std::getenv("FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY"); - bool force_low_precision = (force_low_precision_allreduce_strategy_char != nullptr) - || (mStrategy == AllReduceStrategyType::LOWPRECISION); + bool force_low_precision = mStrategy == AllReduceStrategyType::LOWPRECISION; #ifdef ENABLE_FP8 // Use LowPrecision if PCIe and p2p support and message size is larger than 2MB diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 43b15a4d9f8..7e18458a0f6 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -10,15 +10,13 @@ from tensorrt_llm._utils import mpi_barrier from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, -<<<<<<< HEAD AllReduceStrategy, MoEAllReduceParams) -======= - AllReduceStrategy) from tensorrt_llm.logger import logger ->>>>>>> 7f3955b17 (Address comments) from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper +from ..model_config import ModelConfig + _thread_local = threading.local() @@ -338,9 +336,6 @@ def forward( Returns: Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) """ - if self.mapping == 1: - return input - if input.numel() > self.max_num_elements_mnnvl: return None @@ -379,97 +374,41 @@ def forward( return None -class 
TLLMAllReduce(nn.Module): - """A specialized AllReduce implementation for Multi-Node NVLink communication. - - This class handles the MNNVL-specific allreduce operations, which can be more efficient - for certain operations when using NVLink for multi-node communication. - """ - - def __init__(self, - mapping: Mapping, - strategy: AllReduceStrategy = AllReduceStrategy.AUTO): - super().__init__() - self.mapping = mapping - self.strategy = strategy - self.workspace = None - - self.force_low_precision_env = os.environ.get( - "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY") - # When Strategy is UB, it is guaranteed that the workspace is not used. - if self.strategy != AllReduceStrategy.UB: - if self.strategy == AllReduceStrategy.LOWPRECISION or self.force_low_precision_env is not None: - allocate_low_presicion_allreduce_workspace(self.mapping) - self.workspace = get_allreduce_workspace(self.mapping) - - def forward( - self, - input: torch.Tensor, - all_reduce_params: AllReduceParams, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: - """Forward pass for MNNVL AllReduce. - - Args: - input (torch.Tensor): Input tensor to be reduced - all_reduce_params (Optional[AllReduceParams]): Parameters for fused operations - - Returns: - Union[torch.Tensor, Tuple[torch.Tensor, ...]]: Reduced tensor(s) - """ - if self.mapping == 1: - return input - - output = torch.ops.trtllm.allreduce( - input=input, - residual=all_reduce_params.residual, - norm_weight=all_reduce_params.norm_weight, - scale=all_reduce_params.scale, - bias=all_reduce_params.bias, - workspace=self.workspace, - group=self.mapping.tp_group, - strategy=self.strategy, - op=all_reduce_params.fusion_op, - eps=all_reduce_params.eps, - ) - return output - - class AllReduce(nn.Module): def __init__(self, - mapping: Mapping, - strategy: AllReduceStrategy = AllReduceStrategy.AUTO, dtype: Optional[torch.dtype] = None, - ar_backend: str = "TRTLLM"): + model_config: ModelConfig = ModelConfig()): super().__init__() """ AllReduce is a module that performs an all-reduce operation on a tensor. Args: - mapping (Mapping): The parallel mapping config. - strategy (AllReduceStrategy): - The following all-reduce strategies are supported: + model_config (ModelConfig): mapping and strategy in it are used. + mapping (Mapping): The parallel mapping config. + strategy (AllReduceStrategy): + The following all-reduce strategies are supported: - - UB: AllReduce uses user-buffer based all-reduce kernel. + - UB: AllReduce uses user-buffer based all-reduce kernel. - - NCCL: Use NCCL allreduce. + - NCCL: Use NCCL allreduce. - - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. + - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. - - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. + - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. - - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. - Should only be used on topologies with PCIe switches and without NVLink. - This strategy may result in some precision loss but can improve performance - on specific hardware configurations. + - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. + Should only be used on topologies with PCIe switches and without NVLink. + This strategy may result in some precision loss but can improve performance + on specific hardware configurations. 
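For illustration, a minimal sketch of how the strategy is now meant to be chosen through the model config rather than passed to AllReduce directly. The string-to-enum mapping is added to ModelConfig.__post_init__ further down in this patch; the snippet assumes a default-constructible ModelConfig (as used elsewhere in this series) and is not part of the patch itself:

    from tensorrt_llm._torch.model_config import ModelConfig
    from tensorrt_llm.functional import AllReduceStrategy

    # Sketch only: "allreduce_backend" accepts a strategy name such as "AUTO",
    # "NCCL", "UB", "MIN_LATENCY", "ONESHOT", "TWOSHOT", "LOWPRECISION" or
    # "MNNVL"; __post_init__ normalizes it to an AllReduceStrategy member and
    # falls back to AUTO for unrecognized names.
    cfg = ModelConfig(allreduce_backend="MNNVL")
    assert cfg.allreduce_backend == AllReduceStrategy.MNNVL

    # Modules then read the strategy from the config, e.g.
    #     AllReduce(dtype=torch.bfloat16, model_config=cfg)
    # instead of receiving a separate `strategy` / `ar_backend` argument.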
- All strategies support the following operations: - - NONE (AllReduce only) - - RESIDUAL_RMS_NORM - - RESIDUAL_RMS_NORM_QUANT_FP8 - - RESIDUAL_RMS_NORM_QUANT_NVFP4 - - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 - - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 + All strategies support the following operations: + - NONE (AllReduce only) + - RESIDUAL_RMS_NORM + - RESIDUAL_RMS_NORM_QUANT_FP8 + - RESIDUAL_RMS_NORM_QUANT_NVFP4 + - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 + - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 Note: NCCL, UB, and LOWPRECISION strategies only support consequent kernel calls instead of fused operations. @@ -482,23 +421,27 @@ def __init__(self, or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY when using the AUTO strategy. """ - self.skip_ar = mapping.tp_size == 1 - self._mnnvl_allreduce = None - self._tllm_allreduce = None - self._create_allreduce(mapping, ar_backend, strategy, dtype) - def _create_allreduce(self, mapping, backend, strategy, dtype): - if mapping.tp_size == 1: - return - - enable_mnnvl = (backend == "MNNVL" - and (dtype - and dtype in MNNVLAllReduce.get_supported_dtype()) - and (not mapping.has_cp()) and mapping.tp_size > 1) - if enable_mnnvl: - self._mnnvl_allreduce = MNNVLAllReduce(mapping, dtype) + self.mapping = model_config.mapping + self.workspace = None + self.strategy = model_config.allreduce_backend + self.mnnvl_allreduce = None - self._tllm_allreduce = TLLMAllReduce(mapping, strategy) + self.force_low_precision_env = os.environ.get( + "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY") + if self.mapping.tp_size > 1: + # When Strategy is UB, it is guaranteed that the workspace is not used. + if self.strategy != AllReduceStrategy.UB: + if self.strategy == AllReduceStrategy.LOWPRECISION: + allocate_low_presicion_allreduce_workspace(self.mapping) + self.workspace = get_allreduce_workspace(self.mapping) + + # Initialize MNNVL AllReduce if needed + if self.strategy == AllReduceStrategy.MNNVL and ( + dtype and dtype in MNNVLAllReduce.get_supported_dtype() + ) and (not self.mapping.has_cp()): + self.mnnvl_allreduce = MNNVLAllReduce(self.mapping, + dtype) if dtype else None def forward( self, @@ -529,25 +472,35 @@ def forward( RESIDUAL_RMS_NORM_QUANT_NVFP4: [norm_quant_fp4, scale_factor, residual] RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: [norm, norm_quant_fp4, scale_factor, residual] ''' - if self.skip_ar or (all_reduce_params is not None - and all_reduce_params.enable_allreduce == False): + if self.mapping.tp_size == 1 or (all_reduce_params is not None + and all_reduce_params.enable_allreduce + == False): return input if all_reduce_params is None: all_reduce_params = AllReduceParams() - if self._mnnvl_allreduce: - mnnvl_output = self._mnnvl_allreduce( + # Try MNNVL AllReduce first if available + if self.mnnvl_allreduce: + mnnvl_output = self.mnnvl_allreduce( input, all_reduce_params=all_reduce_params) if mnnvl_output is not None: return mnnvl_output - logger.info(f"Fallback to tllm_allreduce.") - # MNNVL only support part of AllReduceFusionOp provided in params. 
- output = self._tllm_allreduce( + # Fall back to regular AllReduce if MNNVL is not available or not applicable + output = torch.ops.trtllm.allreduce( input=input, - all_reduce_params=all_reduce_params, + residual=all_reduce_params.residual, + norm_weight=all_reduce_params.norm_weight, + scale=all_reduce_params.scale, + bias=all_reduce_params.bias, + workspace=self.workspace, + group=self.mapping.tp_group, + strategy=self.strategy, + op=all_reduce_params.fusion_op, + eps=all_reduce_params.eps, ) + return output if len(output) > 1 else output[0] diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index e5b5ce048e8..0b0e8a9210e 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -8,6 +8,8 @@ from tensorrt_llm import logger from tensorrt_llm._utils import torch_dtype_to_binding +from tensorrt_llm.functional import AllReduceStrategy +from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization.mode import QuantAlgo @@ -77,7 +79,7 @@ class ModelConfig(Generic[TConfig]): attn_backend: str = 'TRTLLM' moe_backend: str = 'CUTLASS' # options can be CUTLASS, TRTLLM - ar_backend: str = 'TRTLLM' # options can be MNNVL, TRTLLM + allreduce_backend: AllReduceStrategy = AllReduceStrategy.AUTO # If true, enable min-latency mode. Currently only used for Llama4. enable_min_latency: bool = False @@ -107,6 +109,23 @@ def __post_init__(self): self.is_generation = self.is_generation_model( self.pretrained_config.architectures) + def map_ar_strategy(strategy: str = "AUTO"): + maps = { + "AUTO": AllReduceStrategy.AUTO, + "NCCL": AllReduceStrategy.NCCL, + "UB": AllReduceStrategy.UB, + "MIN_LATENCY": AllReduceStrategy.MIN_LATENCY, + "ONESHOT": AllReduceStrategy.ONESHOT, + "TWOSHOT": AllReduceStrategy.TWOSHOT, + "LOWPRECISION": AllReduceStrategy.LOWPRECISION, + "MNNVL": AllReduceStrategy.MNNVL + } + key = strategy.upper() + return maps[key] if key in maps else AllReduceStrategy.AUTO + + if isinstance(self.allreduce_backend, str): + self.allreduce_backend = map_ar_strategy(self.allreduce_backend) + @property def fuse_pos_embd(self): if self.attn_backend == 'TRTLLM': diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 21918ed655c..67973dc90ba 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -399,7 +399,7 @@ def __init__(self, overridden_tp_size=shared_tp_size, reduce_output=False) - self.allreduce = AllReduce(self.mapping) + self.allreduce = AllReduce(model_config=model_config) self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared] self.event_dict = { key: torch.cuda.Event() @@ -628,9 +628,8 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], eps=config.rms_norm_eps, dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(self.mapping, - dtype=config.torch_dtype, - ar_backend=model_config.ar_backend) + self.allreduce = AllReduce(dtype=config.torch_dtype, + model_config=model_config) self.moe_allreduce = MoEAllReduce(self.mapping) self.next_layer_layernorm: RMSNorm = None diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 600808c6b61..a852560af10 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -282,7 +282,7 @@ def __init__( 
quant_config=None) self.mapping = model_config.mapping - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(model_config=model_config) self.moe_event = [torch.cuda.Event(), torch.cuda.Event()] self.aux_stream = aux_stream @@ -414,7 +414,7 @@ def __init__( dtype=config.torch_dtype) self.mapping = model_config.mapping - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(model_config=model_config) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index f15a21df31d..6a1e13b1467 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -89,7 +89,7 @@ def __init__( self.top_k = config.num_experts_per_tok self.enable_attention_dp = model_config.mapping.enable_attention_dp self.mapping = model_config.mapping - self.allreduce = AllReduce(self.mapping) + self.allreduce = AllReduce(model_config=model_config) self.enable_alltoall = Qwen3MoE.should_enable_alltoall( model_config, self.top_k) if self.enable_alltoall: @@ -202,7 +202,7 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig], dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(self.mapping) + self.allreduce = AllReduce(model_config=model_config) self.next_layer_layernorm: RMSNorm = None self.fusion_config = EagerFusionConfig() diff --git a/tensorrt_llm/_torch/modules/fused_moe.py b/tensorrt_llm/_torch/modules/fused_moe.py new file mode 100755 index 00000000000..7df752814e2 --- /dev/null +++ b/tensorrt_llm/_torch/modules/fused_moe.py @@ -0,0 +1,2513 @@ +import copy +import math +import os +import threading +from enum import Enum, IntEnum +from typing import Dict, List, NamedTuple, Optional, Union + +import torch +from torch import nn + +from tensorrt_llm._mnnvl_utils import MnnvlMoe, MoEAlltoallInfo +from tensorrt_llm._utils import get_sm_version, logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.quantization.utils import fp4_utils +from tensorrt_llm.quantization.utils.fp4_utils import ( + get_reorder_rows_for_gated_act_gemm_row_indices, + get_shuffle_matrix_a_row_indices, get_shuffle_matrix_sf_a_row_indices, + shuffle_matrix_a, shuffle_matrix_sf_a) + +from ...quantization.utils.fp4_utils import float4_sf_dtype +from ..distributed import allgather, reducescatter +from ..expert_statistic import ExpertStatistic +from ..model_config import ModelConfig, MoeLoadBalancerConfig +from ..utils import (EventType, Fp4QuantizedTensor, disable_fp4_allgather, + reswizzle_sf, swizzle_sf, unswizzle_sf) +from .gated_mlp import GatedMLP +from .linear import TensorParallelMode, load_weight_shard +from .moe_load_balancer import MoeLoadBalancer + +# The declarations aligns with moe_kernels.h +# pack inputs into int64, e.g. 4 x bf16 input values +FUSED_MOE_NVFP4_INPUT_DTYPE = torch.int64 +# pack weights into int64, e.g. 16 x nvfp4 weight values +FUSED_MOE_NVFP4_WEIGHT_DTYPE = torch.int64 +# pack weight block scales into int32, e.g. 
4 x fp8 weight values +FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE = torch.int32 + + +# The type of method in top-K routing, for use in torch custom op +# Please keep this in sync with the counterpart defined in cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +class RoutingMethodType(IntEnum): + # Default: Softmax -> TopK + Default = 0, + # Renormalize: TopK -> Softmax + Renormalize = 1, + # DeepSeekV3: Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts from the Top4 groups + DeepSeekV3 = 2, + # Llama4: Top1 -> Sigmoid + Llama4 = 3, + # Qwen3: Softmax -> TopK -> Renormalize + Qwen3 = 4, + # Unspecified + Unspecified = 5. + + +class BaseMoeRoutingMethod(nn.Module): + + def apply(self, _router_logits) -> (torch.Tensor, torch.Tensor): + """ + Applies the routing method to the router logits. + Router logits are usually the output of the router Linear layer, but can be any type for more complex routing methods. + Returns (token_selected_experts: torch.Tensor, token_final_scales: torch.Tensor): + token_selected_experts: shape (num_tokens, experts_per_token). + It is a list of selected expert indices for each token + token_final_scales: shape (num_tokens, experts_per_token). May be None + It contains a final scaling/weighting factor applied to the output of each selected expert before summing the results + """ + raise NotImplementedError("Subclasses must implement this method") + + def get_experts_per_token(self): + return self.top_k + + @property + def experts_per_token(self): + return self.get_experts_per_token() + + @property + def routing_method_type(self): + return RoutingMethodType.Unspecified + + +class DefaultMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + topk_values, topk_indices = torch.topk(torch.nn.functional.softmax( + router_logits.float(), dim=-1), + k=self.top_k, + dim=-1) + return topk_indices.to(torch.int32), topk_values + + @property + def routing_method_type(self): + return RoutingMethodType.Default + + +class DeepSeekV3MoeRoutingMethod(BaseMoeRoutingMethod): + + # Intentionally leave apply() unimplemented. + # See comments in DeepseekV3Gate on why routing is done by DeepseekV3Gate. + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + @property + def routing_method_type(self): + return RoutingMethodType.DeepSeekV3 + + +class RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__( + self, + top_k: int, + ): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + topk_values, topk_indices = torch.topk(router_logits, + k=self.top_k, + dim=-1) + return topk_indices.to(torch.int32), torch.nn.functional.softmax( + topk_values.float(), dim=-1) + + @property + def routing_method_type(self): + return RoutingMethodType.Renormalize + + +class Llama4RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + topk_values, topk_indices = torch.topk(router_logits, + k=self.top_k, + dim=-1) + return topk_indices.to(torch.int32), torch.sigmoid(topk_values.float()) + + @property + def routing_method_type(self): + return RoutingMethodType.Llama4 + + +# TODO: re-enable this once the custom op is working. 
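As a side-by-side illustration of the two orderings implemented above (softmax-then-topk in DefaultMoeRoutingMethod versus topk-then-softmax in RenormalizeMoeRoutingMethod), here is a small, self-contained sketch using plain torch ops; it is a toy example and not part of the patch:

    import torch

    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])

    # Default routing: softmax over all experts first, then take the top-k.
    probs = torch.nn.functional.softmax(logits, dim=-1)
    default_scales, default_experts = torch.topk(probs, k=2, dim=-1)

    # Renormalize routing: take the top-k raw logits first, then softmax over
    # just the selected experts.
    topk_logits, renorm_experts = torch.topk(logits, k=2, dim=-1)
    renorm_scales = torch.nn.functional.softmax(topk_logits, dim=-1)

    # Both pick the same experts here, but the scales differ: default_scales
    # leaves probability mass on the unselected experts (sums to < 1), while
    # renorm_scales always sums to 1 across the selected experts.
    print(default_experts, default_scales.sum(dim=-1))
    print(renorm_experts, renorm_scales.sum(dim=-1))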
+# class Llama4RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): + +# def __init__(self, top_k: int, num_experts_total: int, ep_size: int, +# ep_rank: int): +# super().__init__() +# self.top_k = top_k +# self.num_experts_total = num_experts_total +# self.num_experts_per_node = self.num_experts_total // ep_size +# self.start_expert = self.num_experts_per_node * ep_rank +# self.end_expert = self.start_expert + self.num_experts_per_node + +# def apply(self, +# router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): +# unpermuted_scales, indices = torch.ops.trtllm.fused_topk_softmax( +# router_logits, self.top_k, self.num_experts_total, +# self.start_expert, self.end_expert) +# return indices, unpermuted_scales + + +# TODO Test this for Phi models +class SparseMixerMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int, eps: float): + super().__init__() + self.top_k = top_k + self.eps = eps + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + router_logits = router_logits.float() + topk_values = torch.empty(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.float32) + topk_indices = torch.empty(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.int32) + for i in range(self.top_k): + if i > 0: + max_elem = torch.argmax(router_logits, dim=-1) + # Mask out the previously selected indices to negative infinity + router_logits.scatter_(-1, max_elem.unsqueeze(-1), + -float('inf')) + # Get the max value of the remaining indices + max_values, max_indices = torch.max(router_logits, + dim=-1, + keepdim=True) + assert torch.all(max_values != -float('inf')) + + topk_indices[:, i] = max_indices.squeeze(-1) + + # Mask out any values that fail the condition '(max - value) / std::max(abs(value), max) > 2 * epsilon' + mask = ( + (max_values - router_logits) / + torch.max(torch.abs(router_logits), max_values)) > 2 * self.eps + masked_logits = torch.where(mask, -float('inf'), router_logits) + softmax_masked_logits = torch.nn.functional.softmax(masked_logits, + dim=-1) + selected_values = torch.gather(softmax_masked_logits, -1, + max_indices) + topk_values[:, i] = selected_values.squeeze(-1) + + return topk_indices.to(torch.int32), topk_values + + +class StaticMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, + routing_tensor: torch.Tensor, + routing_scales: Optional[torch.Tensor] = None): + super().__init__() + assert routing_tensor.dtype == torch.int32 + if routing_scales is not None: + assert routing_tensor.shape[0] == routing_scales.shape[0] + assert routing_tensor.shape[1] == routing_scales.shape[1] + assert routing_scales.dtype == torch.float32 + self.routing_tensor = routing_tensor + self.routing_scales = routing_scales + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + return self.routing_tensor, self.routing_scales + + def get_experts_per_token(self): + return self.routing_tensor.shape[1] + + +class LoadBalancedMoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + balanced_values = torch.ones(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.float32) + balanced_indices = torch.empty(router_logits.shape[0], + self.top_k, + device=router_logits.device, + dtype=torch.int32) + + # Fill the balanced_indices with each expert in round-robin fashion + final_size = 
router_logits.shape[0] * self.top_k + repeat_count = math.ceil(final_size / router_logits.shape[1]) + indices = torch.arange(router_logits.shape[1], + device=router_logits.device, + dtype=torch.int32) + indices = indices.repeat(repeat_count) + indices = indices[:final_size] + balanced_indices = indices.view(router_logits.shape[0], + self.top_k).contiguous() + + return balanced_indices, balanced_values + + +class Qwen3MoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__(self, top_k: int): + super().__init__() + self.top_k = top_k + + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + + routing_weights = torch.nn.functional.softmax(router_logits, + dim=1, + dtype=torch.float) + topk_values, topk_indices = torch.topk(routing_weights, + k=self.top_k, + dim=-1) + topk_values /= topk_values.sum(dim=-1, keepdim=True) + return topk_indices.to(torch.int32), topk_values + + @property + def routing_method_type(self) -> RoutingMethodType: + return RoutingMethodType.Qwen3 + + +class MoEWeightLoadingMode(Enum): + VANILLA = 0 + FUSED_GATE_UP_PROJ = 1 + + +class VanillaMoE(nn.ModuleList): + + def __init__( + self, + *, + routing_method: BaseMoeRoutingMethod, + num_experts: int, + hidden_size: int, + intermediate_size: int, + dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + model_config: ModelConfig = ModelConfig(), + aux_stream: Optional[torch.cuda.Stream] = None, + weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode. + VANILLA, + apply_router_weight_on_input: bool = False, + enable_alltoall: bool = False, + pack_weights: bool = False, + ): + from ..distributed import AllReduce + + super().__init__() + self.routing_method = routing_method + self.num_experts = num_experts + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.weight_loading_mode = weight_loading_mode + self.pack_weights = pack_weights + + self.dtype = dtype + self.reduce_results = reduce_results + self.model_config = model_config + # could be modified later + self.quant_config = model_config.quant_config + + self.cluster_rank = model_config.mapping.moe_cluster_rank + self.cluster_size = model_config.mapping.moe_cluster_size + self.smart_router = True if self.cluster_size > 1 else False + assert not self.smart_router, ( + "Smart router is not supported in vanilla MoE, " + "please set moe_cluster_size to 1.") + + self.rank = model_config.mapping.rank + + self.tp_rank = model_config.mapping.moe_tp_rank + self.tp_size = model_config.mapping.moe_tp_size + + self.ep_size = model_config.mapping.moe_ep_size + self.ep_rank = model_config.mapping.moe_ep_rank + self.moe_backend = model_config.moe_backend + self.use_dp = model_config.mapping.enable_attention_dp + + # All ranks participate in allreduce regardless of EP/TP combination + self.mapping = model_config.mapping + self.parallel_size = self.mapping.tp_size + + self.all_reduce = AllReduce(self.mapping) + + self.intermediate_size_per_partition = intermediate_size // self.tp_size + + self.expert_size_per_partition = num_experts // self.ep_size + self.expert_start = self.ep_rank * self.expert_size_per_partition + self.expert_end = min( + self.expert_start + self.expert_size_per_partition, + self.num_experts) + self.expert_size_per_partition = self.expert_end - self.expert_start + + max_num_tokens = model_config.max_num_tokens + # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled + if self.use_dp: + max_num_tokens *= model_config.mapping.world_size + 
self.moe_max_num_tokens = model_config.moe_max_num_tokens if model_config.moe_max_num_tokens is not None else max_num_tokens + + self.enable_alltoall = False + + self._weights_created = False + if not model_config.skip_create_weights_in_init: + self.create_weights() + + # If True, the router weight will be multiplied on the input rather than at the end of FC2 + self.apply_router_weight_on_input = apply_router_weight_on_input + + def create_experts(self, module_list: nn.ModuleList = None): + if module_list is None: + module_list = self + model_config = copy.copy(self.model_config) + model_config.mapping = Mapping( + world_size=self.mapping.moe_tp_size, + tp_size=self.mapping.moe_tp_size, + rank=self.mapping.moe_tp_rank, + ) + model_config.quant_config = self.quant_config + model_config.skip_create_weights_in_init = False + for expert_idx in range(self.num_experts): + if self.expert_start <= expert_idx < self.expert_end: + module_list[expert_idx] = GatedMLP( + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + bias=False, + dtype=self.dtype, + config=model_config, + reduce_output=False, + ) + else: + # use identity as placeholder for unused experts + module_list[expert_idx] = nn.Identity() + + def create_weights(self): + if self._weights_created: + return + self._weights_created = True + + if not self.pack_weights: + self.create_experts() + return + + self.has_any_quant = False + self.has_fp8_qdq = False + self.has_fp8_block_scales = False + self.has_nvfp4 = False + gate_up_proj_shape = ( + self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size, + ) + down_proj_shape = ( + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition, + ) + if self.quant_config and self.quant_config.layer_quant_mode.has_any_quant( + exclude_kv_cache=True): + self.has_any_quant = True + qc = self.quant_config + if qc.layer_quant_mode.has_fp8_qdq(): + self.has_fp8_qdq = True + + self.gate_up_proj_weight = nn.Parameter( + torch.empty( + gate_up_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + self.gate_up_proj_weight_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + self.down_proj_weight = nn.Parameter( + torch.empty( + down_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + self.down_proj_weight_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + elif qc.layer_quant_mode.has_fp8_block_scales(): + self.has_fp8_block_scales = True + + self.gate_up_proj_weight = nn.Parameter( + torch.empty( + gate_up_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + gate_up_proj_scale_shape = ( + self.expert_size_per_partition, + math.ceil(self.intermediate_size_per_partition * 2 / 128), + 
math.ceil(self.hidden_size / 128), + ) + self.gate_up_proj_weight_scale = nn.Parameter( + torch.empty( + gate_up_proj_scale_shape, + dtype=torch.float32, + ), + requires_grad=False, + ) + # Not really used for Gemm now. + # Only used to quantize output of FP8 attention. + self.gate_up_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + self.down_proj_weight = nn.Parameter( + torch.empty( + down_proj_shape, + dtype=torch.float8_e4m3fn, + ), + requires_grad=False, + ) + down_proj_scale_shape = ( + self.expert_size_per_partition, + math.ceil(self.hidden_size / 128), + math.ceil(self.intermediate_size_per_partition / 128), + ) + self.down_proj_weight_scale = nn.Parameter( + torch.empty( + down_proj_scale_shape, + dtype=torch.float32, + ), + requires_grad=False, + ) + # Not really used for Gemm now. + # Only used to quantize output of FP8 attention. + self.down_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + elif qc.layer_quant_mode.has_nvfp4(): + self.has_nvfp4 = True + self.scaling_vector_size = 16 + + assert self.hidden_size % self.scaling_vector_size == 0, f"hidden_size {self.hidden_size} must be divisible by scaling_vector_size {self.scaling_vector_size}" + + # Quantized weights + self.gate_up_proj_weight = nn.Parameter( + torch.empty( + [ + self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // 2, + ], + dtype=fp4_utils.float4_e2m1x2, + ), + requires_grad=False, + ) + + # FP8 per-block scaling factors. dtype must be aligned with SF_DTYPE + # Padding is required. See computeSFSize in quantization.h + nrows = fp4_utils.pad_up( + self.intermediate_size_per_partition * 2, 128) + ncols = fp4_utils.pad_up( + self.hidden_size // self.scaling_vector_size, 4) + self.gate_up_proj_weight_scale = nn.Parameter( + torch.empty( + [self.expert_size_per_partition, nrows * ncols], + dtype=fp4_utils.float4_sf_dtype, + ), + requires_grad=False, + ) + + # FP32 per-tensor global scaling factor = 448*6/amax_input + self.gate_up_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.gate_up_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + # (amax_input*amax_weight) / (448*6*448*6) + self.gate_up_proj_alpha = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + assert self.intermediate_size_per_partition % self.scaling_vector_size == 0, f"intermediate_size_per_partition {self.intermediate_size_per_partition} must be divisible by scaling_vector_size {self.scaling_vector_size}" + + # Quantized weights + self.down_proj_weight = nn.Parameter( + torch.empty( + [ + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // 2, + ], + dtype=fp4_utils.float4_e2m1x2, + ), + requires_grad=False, + ) + + # FP8 per-block scaling factors. dtype must be aligned with SF_DTYPE + # Padding is required. 
See computeSFSize in quantization.h + nrows = fp4_utils.pad_up(self.hidden_size, 128) + ncols = fp4_utils.pad_up( + self.intermediate_size_per_partition // + self.scaling_vector_size, 4) + self.down_proj_weight_scale = nn.Parameter( + torch.empty( + [self.expert_size_per_partition, nrows * ncols], + dtype=fp4_utils.float4_sf_dtype, + ), + requires_grad=False, + ) + + # FP32 per-tensor global scaling factor = 448*6/amax_input + self.down_proj_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + self.down_proj_inv_input_scale = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + + # (amax_input*amax_weight) / (448*6*448*6) + self.down_proj_alpha = nn.Parameter( + torch.empty( + self.expert_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + else: + raise ValueError(f'unsupported quant mode: {qc.quant_mode}') + else: + self.gate_up_proj_weight = nn.Parameter( + torch.empty(gate_up_proj_shape, dtype=self.dtype), + requires_grad=False, + ) + self.down_proj_weight = nn.Parameter( + torch.empty(down_proj_shape, dtype=self.dtype), + requires_grad=False, + ) + + def pack_params(self, experts, module_name: str, weight_name: str): + weights = [] + for expert_idx in range(self.expert_start, self.expert_end): + weights.append( + getattr(getattr(experts[expert_idx], module_name), weight_name)) + packed_weight = torch._utils._flatten_dense_tensors(weights) + weights_data = torch._utils._unflatten_dense_tensors( + packed_weight, weights) + for weight, data in zip(weights, weights_data): + weight.data = data + packed_weight = packed_weight.view(len(weights), *weights_data[0].shape) + getattr(self, f"{module_name}_{weight_name}").data = packed_weight + + def load_weights(self, weights: List[Dict]): + from ..models.modeling_utils import filter_weights + + assert self._weights_created + assert len(weights) == 1 + weights = weights[0] + + if self.pack_weights: + experts = nn.ModuleList([None] * self.num_experts) + self.create_experts(experts) + experts.to("cuda") + else: + experts = self + + for expert_idx in range(self.expert_start, self.expert_end): + experts[expert_idx].gate_up_proj.load_weights([ + filter_weights(f"{expert_idx}.w1", weights), + filter_weights(f"{expert_idx}.w3", weights), + ]) + experts[expert_idx].down_proj.load_weights([ + filter_weights(f"{expert_idx}.w2", weights), + ]) + + if self.pack_weights: + for module_name in ["gate_up_proj", "down_proj"]: + for weight_name, _ in getattr(experts[self.expert_start], + module_name).named_parameters(): + self.pack_params(experts, module_name, weight_name) + + def reducescatter_or_allreduce( + self, + inputs, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ): + outputs = inputs + if self.parallel_size > 1 and not self.enable_alltoall: + if self.use_dp: + outputs = reducescatter( + inputs, + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + elif self.reduce_results: + outputs = self.all_reduce(inputs) + return outputs + + def run_experts( + self, + input: torch.Tensor, + expanded_inputs: torch.Tensor, + expanded_scales: torch.Tensor, + sorted_experts: torch.Tensor, + batch_indices: torch.Tensor, + ) -> torch.Tensor: + final_hidden_states = torch.zeros( + input.shape, + dtype=input.dtype, + device=input.device, + ) + for expert_idx in range(self.expert_start, self.expert_end): + expert_mask = 
sorted_experts == expert_idx + if not torch.any(expert_mask): + continue + expanded_input = expanded_inputs[expert_mask] + batch_idx = batch_indices[expert_mask] + expanded_scale = expanded_scales[expert_mask] + + output = self[expert_idx](expanded_input) + final_hidden_states[batch_idx] += output * expanded_scale + return final_hidden_states + + def forward( + self, + x: torch.Tensor, + router_logits: torch.Tensor, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + **kwargs, + ) -> torch.Tensor: + assert x.shape[-1] == self.hidden_size + x = x.view(-1, self.hidden_size) + + token_selected_experts, token_final_scales = self.routing_method.apply( + router_logits) + + if self.use_dp and self.parallel_size > 1: + x, token_selected_experts, token_final_scales = allgather( + [x, token_selected_experts, token_final_scales], + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + + expert_masks = ((token_selected_experts >= self.expert_start) + & (token_selected_experts < self.expert_end)) + local_selected_experts = token_selected_experts[expert_masks] + sort_indices = torch.argsort(local_selected_experts) + sorted_experts = local_selected_experts[sort_indices] + + batch_indices, nth_experts = torch.where(expert_masks) + batch_indices = batch_indices[sort_indices] + nth_experts = nth_experts[sort_indices] + expanded_inputs = x[batch_indices] + expanded_scales = token_final_scales[batch_indices, nth_experts, None] + + final_hidden_states = self.run_experts( + x, + expanded_inputs, + expanded_scales, + sorted_experts, + batch_indices, + ) + + final_hidden_states = self.reducescatter_or_allreduce( + final_hidden_states, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=use_dp_padding, + ) + return final_hidden_states + + +class FusedMoE(nn.Module): + """ + Fused Mixture of Experts (MoE) Layer with performance tuning. + + Args: + num_experts (int): Number of experts in the MoE layer. + top_k (int): Number of top experts to select for each input token. + hidden_size (int): Size of the hidden state. + intermediate_size (int): Size of the intermediate state. + aux_stream (Optional[torch.cuda.Stream]): Auxiliary CUDA stream to overlap chunks. + dtype (Optional[torch.dtype]): Data type for the weights. + reduce_results (bool): Whether to reduce the results across devices. + model_config (ModelConfig): Configuration object for the model. + enable_alltoall (bool): whether to enable alltoall instead of allgather/reducescatter + + MoE torch custom op: + cutlass Backend + In min-latency mode: + Quant: + fp8 block scales (SM90 Hopper only): + FusedMoE Op: dynamic quant + gemm1 + swiglu + gemm2 (return tensor list). + fp8 qdq, nvfp4: + FusedMoE Op: gemm1 + swiglu + gemm2 (return tensor list). + + In max-throughput mode: + Quant: + fp8 block scales (SM90 Hopper only): + FusedMoE Op: dynamic quant + scatter + gemm1 + swiglu + gemm2 + finalizeMoeRoute (return one tensor) + p8 qdq, nvfp4: + FusedMoE Op: scatter + gemm1 + swiglu + gemm2 + finalizeMoeRoute (return one tensor) + + trtllm_gen backend: + Only support min-latency mode now (SM100 Blackwell only). + Quant: fp8 block scales quant and nvfp4 quant + FusedMoE Op: routing(topK, etc.) + scatter + gemm1 + swiglu + gemm2 + finalize MoeRoute + + FusedMoE module: + cutlass Backend (moe_backend="CUTLASS"): + min-latency mode: + routing(topK, etc.) + FusedMoE Op + equals to: routing(topK, etc.) 
[+ dynamic quant fp8 qdq | optional dynamic quant nvfp4] + gemm1 + swiglu + gemm2 + + max-throughput mode: + routing(topK, etc.) [+ dynamic quant for fp8 qdq and nvfp4 ] [+ fp4_allgather] + FusedMoe Op[no allreduce] + reducescatter, with AttentionDP on + equals to: dynamic quant + routing(topK, etc.) [+ fp4_allgather] + scatter + gemm1 + swiglu + gemm2 + finalizeMoeRoute [no allreduce] + reducescatter + + trtllm_gen backend (moe_backend="TRTLLM"): + min-latency mode (cutlass_min_latency_mode flag of forward has no effect when trtllm_gen is used): + dynamic quant + FusedMoe Op + equals to: dynamic quant + routing(topK, etc.) + scatter + gemm1 + swiglu + gemm2 + finalize MoeRoute + + In min-latency mode, setting `reduce_results=False` disables the AllReduce in the FusedMoE module, so any necessary AllReduce operations must be added explicitly in the model definition. + AttentionDP should be turned off for min-latency mode. + + When we have redundant experts, we have more weight slots than `num_experts`; in that case, we separate the concepts of expert and slot. + Expert is the concept from the model's perspective, while slot is the concept from the model engine's perspective. + There should be at least `num_experts` slots in the model engine. More than that is OK; in that case, some experts may have multiple replicas. + """ + + def __init__( + self, + *, + routing_method: BaseMoeRoutingMethod, + num_experts: int, + hidden_size: int, + intermediate_size: int, + dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + model_config: ModelConfig = ModelConfig(), + aux_stream: Optional[torch.cuda.Stream] = None, + weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode. + VANILLA, + apply_router_weight_on_input: bool = False, + enable_alltoall: bool = False, + moe_load_balancer: Optional[MoeLoadBalancer] = None, + layer_idx: Optional[int] = None, + ): + from ..distributed import AllReduce + + super().__init__() + self.routing_method = routing_method + self.num_experts = num_experts + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.weight_loading_mode = weight_loading_mode + + self.dtype = dtype + self.reduce_results = reduce_results + # could be modified later + self.quant_config = model_config.quant_config + + self.cluster_rank = model_config.mapping.moe_cluster_rank + self.cluster_size = model_config.mapping.moe_cluster_size + self.smart_router = True if self.cluster_size > 1 else False + + self.rank = model_config.mapping.rank + + self.tp_rank = model_config.mapping.moe_tp_rank + self.tp_size = model_config.mapping.moe_tp_size + + self.ep_size = model_config.mapping.moe_ep_size + self.ep_rank = model_config.mapping.moe_ep_rank + self.moe_backend = model_config.moe_backend + self.use_dp = model_config.mapping.enable_attention_dp + + # All ranks participate in allreduce regardless of EP/TP combination + self.mapping = model_config.mapping + self.parallel_size = self.mapping.tp_size + + self.all_reduce = AllReduce(model_config=model_config) + + self.intermediate_size_per_partition = intermediate_size // self.tp_size + + self.layer_idx = layer_idx + moe_load_balancer_config = model_config.moe_load_balancer + if moe_load_balancer_config is None: + assert moe_load_balancer is None + # A dummy MoeLoadBalancerConfig to generate default initial_global_assignments + moe_load_balancer_config = MoeLoadBalancerConfig() + moe_load_balancer_config.setup(num_experts=num_experts, + ep_rank=self.ep_rank, + ep_size=self.ep_size) + else: + assert moe_load_balancer
is not None + + self.num_slots = moe_load_balancer_config.num_slots + if self.smart_router: + assert self.num_slots == self.num_experts, "Smart router should not have redundant slots" + + self.initial_global_assignments = moe_load_balancer_config.get_layer_initial_global_assignments( + layer_idx) + self.expert_size_per_partition = moe_load_balancer_config.num_local_slots + self.slot_start = moe_load_balancer_config.slot_start + self.slot_end = moe_load_balancer_config.slot_end + self.initial_local_expert_ids = self.initial_global_assignments[ + self.slot_start:self.slot_end] + assert len( + self.initial_local_expert_ids) == self.expert_size_per_partition + + self.balancer_layer = None + if moe_load_balancer is not None: + self.balancer_layer = moe_load_balancer.add_layer( + expert_count=num_experts, + top_k=routing_method.experts_per_token, + slot_count_per_rank=self.expert_size_per_partition, + ) + self.balancer_layer.set_initial_weight_assignments( + self.initial_global_assignments) + logger.info( + f"MoE load balancer enabled. num_experts = {num_experts}, num_slots = {self.num_slots}, ep_size = {self.ep_size}" + ) + logger.info( + f"initial_global_assignments (layer {layer_idx}) = {self.initial_global_assignments}" + ) + + max_num_tokens = model_config.max_num_tokens + # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled + if self.use_dp: + max_num_tokens *= model_config.mapping.world_size + self.moe_max_num_tokens = model_config.moe_max_num_tokens if model_config.moe_max_num_tokens is not None else max_num_tokens + # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied + if self.moe_max_num_tokens < max_num_tokens: + self.aux_stream = aux_stream if aux_stream is not None else torch.cuda.Stream( + ) + self.event_dict = { + key: torch.cuda.Event() + for key in [EventType.Main, EventType.MoeChunkingOverlap] + } + else: + self.aux_stream = None + self.event_dict = None + + # The profiler converges on the same best tactic when the number of tokens is large enough. + # To avoid long profiling time, the max number of tokens used in the profiling is capped to + # around 16k tokens per expert, which is well into the compute bound domain. 
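+ # Illustration with hypothetical sizes: for 256 slots and top-8 routing, the cap below is 16384 * 256 // 8 = 524288, so tune_max_num_tokens = min(moe_max_num_tokens, 524288).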
+ self.tune_max_num_tokens = min( + self.moe_max_num_tokens, + 16384 * self.num_slots // routing_method.get_experts_per_token(), + ) + self.has_been_profiled = False + self.has_been_profiled_min_latency = False + + self.enable_alltoall = enable_alltoall + self.use_postquant_alltoall = False + if self.enable_alltoall: + assert self.use_dp and self.parallel_size > 1,\ + "alltoall should only be enabled with attention dp and parallel_size > 1" + qm = self.quant_config.quant_mode + self.use_postquant_alltoall = (os.environ.get( + "TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") + == "1") and qm.has_nvfp4() + self.alltoall_workspace = MnnvlMoe.get_moe_workspaces( + model_config.mapping) if enable_alltoall else None + + self._weights_created = False + if not model_config.skip_create_weights_in_init: + self.create_weights() + + # If True, the router weight will be multiplied on the input rather than at the end of FC2 + self.apply_router_weight_on_input = apply_router_weight_on_input + self._check_configs() + + @property + def has_any_quant(self): + return self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True) + + def _check_configs(self): + if self.enable_alltoall: + assert self.use_dp and self.parallel_size > 1,\ + "alltoall should only be enabled with attention dp and parallel_size > 1" + + if self.is_trtllm(): + # trtllm_gen backend only supports min-latency mode now + assert not self.apply_router_weight_on_input, "TRTLLM backend does not support applying router weight on input yet." + assert not self.reduce_results + assert self.quant_config and ( + self.quant_config.quant_mode.has_nvfp4() + | self.quant_config.quant_mode.has_fp8_block_scales() + ), "The TRTLLM backend of FusedMoE only supports fp8_block_scaling and nvfp4 dtypes." + else: + if self.apply_router_weight_on_input: + assert self.routing_method.top_k == 1, "Current workaround only supports top-1 routing" + if self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True): + if not (self.quant_config.quant_mode.has_nvfp4() + | self.quant_config.quant_mode.has_fp8_block_scales() + | self.quant_config.quant_mode.has_fp8_qdq() + | self.quant_config.quant_mode.
+ is_int4_weight_only_per_group()): + raise ValueError( + f"unsupported quantization mode: {self.quant_config.quant_mode}" + ) + + def setup_quant_scales(self): + self.quant_scales = None + if not self.has_any_quant: + return + if self.has_fp8_qdq: + self.quant_scales = FusedMoEQuantScalesFP8( + fc1_dequant=self.fc31_dequant, + fc2_quant=self.fc2_quant, + fc2_dequant=self.fc2_dequant, + fc1_input_dequant=self.fc31_input_dequant, + ) + elif self.has_fp8_block_scales: + self.quant_scales = FusedMoEQuantScalesFP8BlockScales( + fc_weight_scales=self.w3_w1_weight_scaling_factor, + proj_weight_scales=self.w2_weight_scaling_factor, + ) + elif self.has_nvfp4: + self.quant_scales = FusedMoEQuantScalesNVFP4( + fc1_act_global=self.fc31_input_scale, + fc1_weight_block=self.w3_w1_weight_scale, + fc1_global=self.fc31_alpha, + fc2_act_global=self.fc2_input_scale, + fc2_weight_block=self.w2_weight_scale, + fc2_global=self.fc2_alpha, + ) + elif self.has_w4afp8: + self.quant_scales = FusedMoEQuantScalesW4A8( + scale_1_interleaved=self.fc31_weight_scale, + scale_2_interleaved=self.fc2_weight_scale, + pre_quant_scale_1=self.fc31_act_scale, + pre_quant_scale_2=self.fc2_act_scale, + zero_1=torch.Tensor(), + zero_2=torch.Tensor(), + alpha_1=self.fc31_alpha, + alpha_2=self.fc2_alpha, + ) + + def is_trtllm(self): + return self.moe_backend == "TRTLLM" and self.has_any_quant + + def is_cutlass(self): + return not self.is_trtllm() + + def get_quant_scales(self, slot_start, slot_end): + assert self.smart_router + + if self.has_fp8_block_scales: + return FusedMoEQuantScalesFP8BlockScales( + fc_weight_scales=self.w3_w1_weight_scaling_factor.narrow( + 0, slot_start, slot_end - slot_start), + proj_weight_scales=self.w2_weight_scaling_factor.narrow( + 0, slot_start, slot_end - slot_start), + ) + elif self.has_nvfp4: + return FusedMoEQuantScalesNVFP4( + fc1_act_global=self.fc31_input_scale, + fc1_weight_block=self.w3_w1_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + fc1_global=self.fc31_alpha.narrow(0, slot_start, + slot_end - slot_start), + fc2_act_global=self.fc2_input_scale, + fc2_weight_block=self.w2_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + fc2_global=self.fc2_alpha.narrow(0, slot_start, + slot_end - slot_start), + ) + elif self.has_w4afp8: + return FusedMoEQuantScalesW4A8( + scale_1_interleaved=self.fc31_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + scale_2_interleaved=self.fc2_weight_scale.narrow( + 0, slot_start, slot_end - slot_start), + pre_quant_scale_1=self.fc31_act_scale.narrow( + 0, slot_start, slot_end - slot_start), + pre_quant_scale_2=self.fc2_act_scale.narrow( + 0, slot_start, slot_end - slot_start), + zero_1=torch.Tensor(), + zero_2=torch.Tensor(), + alpha_1=self.fc31_alpha.narrow(0, slot_start, + slot_end - slot_start), + alpha_2=self.fc2_alpha.narrow(0, slot_start, + slot_end - slot_start), + ) + else: + return self.quant_scales + + def create_weights(self): + if self._weights_created: + return + weight_dtype = self.dtype + w3_w1_weight_shape = (self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size) + w2_weight_shape = ( + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition, + ) + + self.quant_scales = [] + self.has_fp8_qdq = False + self.has_fp8_block_scales = False + self.has_nvfp4 = False + self.has_w4afp8 = False + if self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True): + qc = self.quant_config + if 
qc.quant_mode.has_fp8_qdq(): + self.has_fp8_qdq = True + weight_dtype = torch.float8_e4m3fn + + fc31_dequant = nn.Parameter(torch.empty( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_dequant", fc31_dequant) + + fc2_dequant = nn.Parameter(torch.empty( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_dequant", fc2_dequant) + + fc2_quant = nn.Parameter(torch.tensor(1., dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_quant", fc2_quant) + + fc31_input_dequant = nn.Parameter(torch.tensor( + 1., dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_input_dequant", + fc31_input_dequant) + elif qc.quant_mode.has_fp8_block_scales(): + self.has_fp8_block_scales = True + weight_dtype = torch.float8_e4m3fn + cell_div = lambda x, y: (x + y - 1) // y + w3_w1_weight_scaling_factor = nn.Parameter(torch.empty( + (self.expert_size_per_partition, + cell_div(self.intermediate_size_per_partition, 128) * 2, + cell_div(w3_w1_weight_shape[2], 128)), + dtype=torch.float32), + requires_grad=False) + self.register_parameter("w3_w1_weight_scaling_factor", + w3_w1_weight_scaling_factor) + + w2_weight_scaling_factor = nn.Parameter(torch.empty( + (self.expert_size_per_partition, + cell_div(w2_weight_shape[1], + 128), cell_div(w2_weight_shape[2], 128)), + dtype=torch.float32), + requires_grad=False) + self.register_parameter("w2_weight_scaling_factor", + w2_weight_scaling_factor) + elif qc.quant_mode.is_int4_weight_only_per_group(): + self.has_w4afp8 = True + self.sm_version = get_sm_version() + if self.sm_version == 89: + self.interleave = [1, 1] + elif self.sm_version == 90: + self.interleave = [] + for k_shape in [ + self.hidden_size, + self.intermediate_size_per_partition + ]: + if k_shape % 512 == 0: + self.interleave.append(4) + elif k_shape % 256 == 0: + self.interleave.append(2) + elif k_shape % 128 == 0: + self.interleave.append(1) + else: + raise NotImplementedError( + f"K shape is required to be multiple of 128, received {k_shape}." 
+ ) + else: + raise NotImplementedError( + f"W4AFP8 MoE is unsupported on SM{self.sm_version}.") + weight_dtype = torch.int8 + w3_w1_weight_shape = (self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // 2) + w2_weight_shape = (self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // 2) + + fc31_act_scale = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc31_act_scale", fc31_act_scale) + + fc2_act_scale = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc2_act_scale", fc2_act_scale) + + # col parallel + fc31_weight_scale = nn.Parameter( + torch.empty(self.expert_size_per_partition, + self.hidden_size // (128 * self.interleave[0]), + self.intermediate_size_per_partition * 2 * + self.interleave[0], + dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc31_weight_scale", fc31_weight_scale) + + # row parallel + fc2_weight_scale = nn.Parameter( + torch.empty(self.expert_size_per_partition, + self.intermediate_size_per_partition // + (128 * self.interleave[1]), + self.hidden_size * self.interleave[1], + dtype=self.dtype), + requires_grad=False) + self.register_parameter("fc2_weight_scale", fc2_weight_scale) + + fc31_alpha = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_alpha", fc31_alpha) + + fc2_alpha = nn.Parameter(torch.empty( + self.expert_size_per_partition, 1, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_alpha", fc2_alpha) + elif qc.quant_mode.has_nvfp4(): + self.has_nvfp4 = True + if self.is_trtllm(): + weight_dtype = float4_sf_dtype + weight_vec_size = torch.iinfo(weight_dtype).bits // 4 + block_scales_dtype = torch.float8_e4m3fn + block_scales_vec_size = 1 + else: + weight_dtype = FUSED_MOE_NVFP4_WEIGHT_DTYPE + weight_vec_size = torch.iinfo(weight_dtype).bits // 4 + block_scales_dtype = FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE + block_scales_vec_size = torch.iinfo( + block_scales_dtype).bits // 8 + + self.scaling_vector_size = 16 + # Divide by 16 because we use int64 to pack 16 fp4 values + w3_w1_weight_shape = (self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // weight_vec_size) + w2_weight_shape = (self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // + weight_vec_size) + + # Divide by 4 because we use int32 to pack 4 fp8 values + # column parallel + w3_w1_weight_scale = nn.Parameter( + torch.ones(self.expert_size_per_partition, + self.intermediate_size_per_partition * 2, + self.hidden_size // self.scaling_vector_size // + block_scales_vec_size, + dtype=block_scales_dtype), + requires_grad=False) + self.register_parameter("w3_w1_weight_scale", + w3_w1_weight_scale) + + # row parallel + w2_weight_scale = nn.Parameter(torch.ones( + self.expert_size_per_partition, + self.hidden_size, + self.intermediate_size_per_partition // + self.scaling_vector_size // block_scales_vec_size, + dtype=block_scales_dtype), + requires_grad=False) + self.register_parameter("w2_weight_scale", w2_weight_scale) + + fc31_input_scale = nn.Parameter(torch.tensor( + 1., dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_input_scale", fc31_input_scale) + + fc2_input_scale = nn.Parameter(torch.tensor( + 1., 
dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_input_scale", fc2_input_scale) + + fc31_alpha = nn.Parameter(torch.ones( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_alpha", fc31_alpha) + + fc2_alpha = nn.Parameter(torch.ones( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc2_alpha", fc2_alpha) + + if self.is_trtllm(): + fc31_scale_c = nn.Parameter(torch.ones( + self.expert_size_per_partition, dtype=torch.float32), + requires_grad=False) + self.register_parameter("fc31_scale_c", fc31_scale_c) + + else: + # TODO: support other quant mode + raise ValueError( + f"unsupported quantization mode: {qc.quant_mode}") + self.setup_quant_scales() + + # Fused gate_up_proj (column parallel) + w3_w1_weight = nn.Parameter(torch.empty(w3_w1_weight_shape, + dtype=weight_dtype), + requires_grad=False) + self.register_parameter("w3_w1_weight", w3_w1_weight) + + # down_proj (row parallel) + w2_weight = nn.Parameter(torch.empty(w2_weight_shape, + dtype=weight_dtype), + requires_grad=False) + self.register_parameter("w2_weight", w2_weight) + self._weights_created = True + + def reducescatter_or_allreduce( + self, + inputs, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ): + outputs = inputs + if self.parallel_size > 1 and not self.enable_alltoall: + if self.use_dp: + outputs = reducescatter( + inputs, + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + elif self.reduce_results: + outputs = self.all_reduce(inputs) + return outputs + + def forward_chunk( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + cutlass_min_latency_mode: bool = False, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ) -> torch.Tensor: + if isinstance(x, Fp4QuantizedTensor): + assert output_dtype is not None + output_dtype = output_dtype + else: + output_dtype = x.dtype + + use_fp8_block_scaling = False + use_w4a8_group_scaling = False + weight_dtype = self.w3_w1_weight.dtype + + token_selected_experts, token_final_scales = self.routing_method.apply( + router_logits) + if self.balancer_layer is None: + token_selected_slots = token_selected_experts + else: + # If attention DP is enabled, token_selected_experts is a local rank tensor, + # so we need to offset the round robin position by ep_rank + token_selected_slots = self.balancer_layer.route( + token_selected_experts, offset_by_ep_rank=self.use_dp) + + # If load balancer is disabled, the statistics are collected from expert IDs. + # If load balancer is enabled, the statistics are collected from expert slot IDs. 
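+ # token_selected_slots and token_final_scales are both [num_tokens, experts_per_token]; slot IDs coincide with expert IDs unless the load balancer remaps them (hypothetically, two replicas of expert 7 would occupy two distinct slots).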
+ ExpertStatistic.set_layer(self.layer_idx) + ExpertStatistic.maybe_add_info(self.num_slots, token_selected_slots) + + assert token_selected_slots.shape[ + 1] == self.routing_method.experts_per_token + assert token_selected_slots.shape == token_final_scales.shape + assert token_selected_slots.shape[0] == router_logits.shape[0] + assert token_final_scales.dtype == torch.float32 + assert token_selected_slots.dtype == torch.int32 + + if self.apply_router_weight_on_input: + assert self.routing_method.top_k == 1, "Current workaround only supports top-1 routing" + assert x.dtype != torch.float8_e4m3fn, "Current workaround for apply_router_weight_on_input does not support fp8 input" + x = x * token_final_scales.to(x.dtype) + # TODO: remove this once we have correct fusedmoe kernel ready + token_final_scales = None + + token_count = x.shape[0] + + alltoall_info = None + + if self.enable_alltoall: + x, token_selected_slots, token_final_scales, alltoall_info = \ + self.alltoall_prepare_maybe_dispatch(all_rank_num_tokens, + x, + token_selected_slots, + token_final_scales) + + x_sf = None + if self.has_any_quant: + if self.has_fp8_qdq: + x, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor( + x, self.fc31_input_dequant) + elif self.has_nvfp4: + if not disable_fp4_allgather() or self.use_postquant_alltoall: + if isinstance(x, Fp4QuantizedTensor): + x, x_sf = x.fp4_tensor, x.scaling_factor + x_row = x.shape[0] + # note: we use uint8 to store 2 fp4 values + x_col = x.shape[1] * 2 + else: + x_row = x.shape[0] + x_col = x.shape[1] + x, x_sf = torch.ops.trtllm.fp4_quantize( + x, self.fc31_input_scale, self.scaling_vector_size, + False) + + elif self.has_fp8_block_scales: + use_fp8_block_scaling = True + elif self.has_w4afp8: + use_w4a8_group_scaling = True + weight_dtype = torch.quint4x2 + else: + raise ValueError( + f"unsupported quantization mode: {self.quant_config.quant_mode}" + ) + + if self.use_dp and self.parallel_size > 1 and not disable_fp4_allgather( + ) and not self.enable_alltoall: + x, x_sf, token_selected_slots, token_final_scales = allgather( + [x, x_sf, token_selected_slots, token_final_scales], + self.mapping, + dim=0, + sizes=None if use_dp_padding else all_rank_num_tokens) + # Fp4 gemm has extra scaling factor + if x_sf is not None: + x_sf = reswizzle_sf(x_sf, x_row, x_col, + self.scaling_vector_size) + + if self.smart_router and not cutlass_min_latency_mode: + ep_size = self.cluster_size + ep_rank = self.cluster_rank + expert_start = ep_rank * self.num_experts // ep_size + expert_end = min(self.num_experts, + (ep_rank + 1) * self.num_experts // ep_size) + w3_w1_weight = self.w3_w1_weight.narrow(0, expert_start, + expert_end - expert_start) + w2_weight = self.w2_weight.narrow(0, expert_start, + expert_end - expert_start) + cluster_size = self.ep_size + cluster_rank = self.ep_rank + quant_scales = self.get_quant_scales(expert_start, expert_end) + else: + ep_size = self.ep_size + ep_rank = self.ep_rank + w3_w1_weight = self.w3_w1_weight + w2_weight = self.w2_weight + cluster_size = self.cluster_size + cluster_rank = self.cluster_rank + quant_scales = self.quant_scales + + if self.use_postquant_alltoall: + x, x_sf = self.alltoall_postquant_dispatch(x, x_sf, x_row, x_col, + alltoall_info) + + final_hidden_states = torch.ops.trtllm.fused_moe( + x, + token_selected_slots, + token_final_scales, + w3_w1_weight.view(weight_dtype), + w2_weight.view(weight_dtype), + output_dtype, + quant_scales=quant_scales, + input_sf=x_sf, + tp_size=self.tp_size, + tp_rank=self.tp_rank, + ep_size=ep_size, 
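+ # Note: under smart routing, the ep_* and cluster_* values were swapped above, so experts are sliced by cluster rank and the original EP group acts as the cluster here.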
+ ep_rank=ep_rank, + cluster_size=cluster_size, + cluster_rank=cluster_rank, + use_fp8_block_scaling=use_fp8_block_scaling, + use_w4a8_group_scaling=use_w4a8_group_scaling, + min_latency_mode=cutlass_min_latency_mode, + tune_max_num_tokens=self.tune_max_num_tokens, + ) + + if cutlass_min_latency_mode: + assert not self.reduce_results + return final_hidden_states + else: + # Custom op requires all inputs are in the same type. + # Only in cutlass_min_latency_mode, the output is a list of tensors. + # Otherwise, the output should be unpacked as a single tensor. + final_hidden_states = final_hidden_states[0] + + if not self.enable_alltoall: + return final_hidden_states + else: + return self.alltoall_combine(final_hidden_states, alltoall_info, + token_count) + + def forward( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + cutlass_min_latency_mode: bool = False, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ) -> torch.Tensor: + """ + cutlass_min_latency_mode has no effect when trtllm_gen backend is enabled. + """ + if self.is_cutlass(): + return self.forward_cutlass(x, router_logits, + cutlass_min_latency_mode, output_dtype, + all_rank_num_tokens, use_dp_padding) + elif self.is_trtllm(): + return self.forward_trtllmgen(x, router_logits) + else: + raise NotImplementedError( + f"FusedMoE only supports CUTLASS or TRTLLM backends, not {self.moe_backend}" + ) + + def forward_cutlass( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + cutlass_min_latency_mode: bool = False, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + ) -> torch.Tensor: + assert self.is_cutlass() + + if self.use_dp: + assert all_rank_num_tokens is not None + assert use_dp_padding is not None + num_rows = sum(all_rank_num_tokens) + else: + num_rows = x.shape[0] + + # in case of num_rows is larger than max_chunk_size, we need to split the input into multiple chunks + num_chunks = (num_rows + self.moe_max_num_tokens - + 1) // self.moe_max_num_tokens + + if cutlass_min_latency_mode: + assert num_chunks == 1 and ( + not self.reduce_results + ), "cutlass_min_latency_mode must be used with a single chunk and reduce_results must be False" + + if use_dp_padding: + all_rank_num_tokens_padded = [max(all_rank_num_tokens) + ] * len(all_rank_num_tokens) + else: + all_rank_num_tokens_padded = all_rank_num_tokens + if num_chunks == 1: + outputs = self.forward_chunk( + x, + router_logits, + cutlass_min_latency_mode, + output_dtype, + all_rank_num_tokens=all_rank_num_tokens_padded, + use_dp_padding=use_dp_padding) + outputs = self.reducescatter_or_allreduce( + outputs, + all_rank_num_tokens=all_rank_num_tokens_padded, + use_dp_padding=use_dp_padding) + else: + + def split_chunk(split_token_num: int, split_num_chunks: int): + val_div = split_token_num // split_num_chunks + val_mod = split_token_num % split_num_chunks + split_chunk_size_list = [val_div + 1] * val_mod + [val_div] * ( + split_num_chunks - val_mod) + return split_chunk_size_list + + if self.use_dp: + all_rank_chunk_size_list = [ + split_chunk(val, num_chunks) + for val in all_rank_num_tokens_padded + ] + all_rank_num_tokens_list = [[ + val[idx_chunk] for val in all_rank_chunk_size_list + ] for idx_chunk in range(num_chunks)] + chunk_size_list = all_rank_chunk_size_list[self.rank] + if self.enable_alltoall: + 
all_rank_num_tokens_list = [[ + 1 if val == 0 else val for val in val_list + ] for val_list in all_rank_num_tokens_list] + else: + all_rank_num_tokens_list = [None] * num_chunks + chunk_size_list = split_chunk(x.shape[0], num_chunks) + + x_list = x.split(chunk_size_list) + router_logits_list = router_logits.split(chunk_size_list) + + if not self.enable_alltoall: + self.event_dict[EventType.Main].record() + with torch.cuda.stream(self.aux_stream): + self.event_dict[EventType.Main].wait() + + outputs_list = [] + # Postpone reduce-scatter/all-reduce to the next iteration to achieve better overlap + for idx_chunk, (x, router_logits) in enumerate( + zip(x_list, router_logits_list)): + if not self.enable_alltoall: + if idx_chunk % 2 == 0: + with torch.cuda.stream(self.aux_stream): + outputs = self.forward_chunk( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk] if self.use_dp else None, + use_dp_padding=use_dp_padding) + if idx_chunk > 0: + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk - 1], + use_dp_padding=use_dp_padding) + else: + outputs = self.forward_chunk( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk] if self.use_dp else None, + use_dp_padding=use_dp_padding) + with torch.cuda.stream(self.aux_stream): + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[ + idx_chunk - 1], + use_dp_padding=use_dp_padding) + else: + outputs = self.forward_chunk( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens_list[idx_chunk] + if self.use_dp else None) + + outputs_list.append(outputs) + if not self.enable_alltoall: + if num_chunks % 2 == 0: + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[-1], + use_dp_padding=use_dp_padding) + else: + with torch.cuda.stream(self.aux_stream): + outputs_list[-1] = self.reducescatter_or_allreduce( + outputs_list[-1], + all_rank_num_tokens=all_rank_num_tokens_list[-1], + use_dp_padding=use_dp_padding) + with torch.cuda.stream(self.aux_stream): + self.event_dict[EventType.MoeChunkingOverlap].record() + self.event_dict[EventType.MoeChunkingOverlap].wait() + outputs = torch.cat(outputs_list) + if self.use_dp: + rank = self.mapping.tp_rank + outputs = outputs[:all_rank_num_tokens[rank]] + return outputs + + def forward_trtllmgen(self, x: torch.Tensor, + router_logits: torch.Tensor) -> torch.Tensor: + assert self.is_trtllm() + assert x.dtype == torch.bfloat16 + + # DeepSeekV3 style routing + if isinstance(self.routing_method, DeepSeekV3MoeRoutingMethod): + top_k = self.routing_method.routing_impl.top_k + routing_bias = self.routing_method.e_score_correction_bias + n_group = self.routing_method.routing_impl.n_group + topk_group = self.routing_method.routing_impl.topk_group + routed_scaling_factor = self.routing_method.routing_impl.routed_scaling_factor + else: + top_k = self.routing_method.top_k + routing_bias = None + n_group = None + topk_group = None + routed_scaling_factor = None + + # TODO: since routing kernel is integrated into moe_runner for fp8, + # here we just route the I/Os for moe_runner + if self.quant_config and self.quant_config.quant_mode.has_fp8_block_scales( + ): + x_val, x_scale = torch.ops.trtllm.fp8_quantize_1x128(x) + + final_hidden_states = torch.ops.trtllm.fp8_block_scale_moe_runner( + router_logits, + routing_bias, + x_val, + x_scale, + 
self.w3_w1_weight, + self.w3_w1_weight_scaling_factor, + self.w2_weight, + self.w2_weight_scaling_factor, + self.num_slots, + top_k, + n_group, + topk_group, + self.intermediate_size_per_partition, + self. + slot_start, # local_expert_start; use ep_rank if stride!=1 + self.expert_size_per_partition, # local_expert_size + routed_scaling_factor, + self.routing_method.routing_method_type, + ) + elif self.quant_config and self.quant_config.quant_mode.has_nvfp4(): + scale_factor_use_ue8m0 = False + is_scale_factor_swizzled = False # use linear layout here + hidden_states_fp4, hidden_states_scale_linear_fp4 = torch.ops.trtllm.fp4_quantize( + x, self.fc31_input_scale, 16, scale_factor_use_ue8m0, + is_scale_factor_swizzled) + + final_hidden_states = torch.ops.trtllm.fp4_block_scale_moe_runner( + router_logits, + routing_bias, + hidden_states_fp4, + hidden_states_scale_linear_fp4.view(torch.float8_e4m3fn), + self.w3_w1_weight, + self.w3_w1_weight_scale.view(torch.float8_e4m3fn), + self.w2_weight, + self.w2_weight_scale.view(torch.float8_e4m3fn), + self.fc31_scale_c.data, + self.fc31_alpha.data, + self.fc2_alpha.data, + self.num_slots, + top_k, + n_group, + topk_group, + self.intermediate_size_per_partition, + self. + slot_start, # local_expert_start; use ep_rank if stride!=1 + self.expert_size_per_partition, # local_expert_size + routed_scaling_factor, + self.routing_method.routing_method_type, + ) + else: + raise NotImplementedError( + "The TRTLLM backend of FusedMoE only supports fp8_block_scaling and nvfp4 dtypes." + ) + + if self.reduce_results and self.parallel_size > 1: + final_hidden_states = self.all_reduce(final_hidden_states) + + return final_hidden_states + + def alltoall_prepare_maybe_dispatch(self, all_rank_num_tokens: list, + x: torch.Tensor, + token_selected_slots: torch.Tensor, + token_final_scales: torch.Tensor): + top_k = self.routing_method.experts_per_token + expert_count = self.num_experts + # gather router info + max_num_token = max(all_rank_num_tokens) + token_selected_slots = torch.nn.functional.pad( + token_selected_slots, + (0, 0, 0, max_num_token - token_selected_slots.shape[0]), + 'constant', self.num_experts) + token_final_scales = torch.nn.functional.pad( + token_final_scales, + (0, 0, 0, max_num_token - token_final_scales.shape[0])) + gathered_token_selected_slots, gathered_token_final_scales = allgather( + [token_selected_slots, token_final_scales], self.mapping, dim=0) + gathered_token_selected_slots = torch.flatten( + gathered_token_selected_slots.contiguous(), start_dim=0, end_dim=-2) + gathered_token_final_scales = torch.flatten( + gathered_token_final_scales.contiguous(), start_dim=0, end_dim=-2) + gathered_target_rank_ids = MnnvlMoe.compute_target_rank_id( + gathered_token_selected_slots, self.num_experts, self.ep_size) + alltoall_info, token_selected_slots, token_final_scales = MnnvlMoe.mnnvl_moe_alltoallv_prepare( + gathered_target_rank_ids, None, gathered_token_selected_slots, + gathered_token_final_scales, max_num_token, expert_count, top_k, + self.ep_rank, self.ep_size) + + if not self.use_postquant_alltoall: + assert not isinstance( + x, Fp4QuantizedTensor + ), "pre-quant alltoall doesn't support fp4 tensor" + x = MnnvlMoe.mnnvl_moe_alltoallv(x, alltoall_info, + self.alltoall_workspace, + self.ep_rank, self.ep_size) + + return x, token_selected_slots, token_final_scales, alltoall_info + + def alltoall_postquant_dispatch(self, x: torch.Tensor, x_sf: torch.Tensor, + x_row: int, x_col: int, + alltoall_info: MoEAlltoallInfo): + x = 
MnnvlMoe.mnnvl_moe_alltoallv(x, alltoall_info, + self.alltoall_workspace, self.ep_rank, + self.ep_size) + + if x_sf is not None: + if self.has_nvfp4: + x_sf = unswizzle_sf(x_sf, x_row, x_col, + self.scaling_vector_size) + + x_sf = MnnvlMoe.mnnvl_moe_alltoallv(x_sf, alltoall_info, + self.alltoall_workspace, + self.ep_rank, self.ep_size) + + if self.has_nvfp4: + x_sf = swizzle_sf(x_sf, x.shape[0], x.shape[1] * 2, + self.scaling_vector_size) + + return x, x_sf + + def alltoall_combine(self, final_hidden_states: torch.Tensor, + alltoall_info: MoEAlltoallInfo, token_count: int): + top_k = self.routing_method.experts_per_token + if isinstance(final_hidden_states, list): + final_hidden_states = final_hidden_states[0] + final_hidden_states = MnnvlMoe.mnnvl_moe_alltoallv_combine( + final_hidden_states, + alltoall_info, + self.alltoall_workspace, + ep_rank=self.ep_rank, + ep_size=self.ep_size, + top_k=top_k, + token_count=token_count) + + return final_hidden_states + + def load_weights(self, weights: List[Dict]): + assert self._weights_created + assert len(weights) == 1 + weights = weights[0] + + def load_expert_w3_w1_weight(w1_weight, + w3_weight, + dst_w3_w1_weight: torch.Tensor, + is_trtllm: bool = False): + w1_weight_shard = load_weight_shard(w1_weight, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + w3_weight_shard = load_weight_shard(w3_weight, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + + if is_trtllm: + # FIXME: this depends on the kernel internals + epilogue_tile_m = 128 + + # Keep weights in device buffer + dst_w3_weight = dst_w3_w1_weight.narrow( + dim=0, start=0, length=self.intermediate_size_per_partition) + dst_w1_weight = dst_w3_w1_weight.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition) + dst_w3_weight.copy_(w3_weight_shard.view(dst_w3_weight.dtype)) + dst_w1_weight.copy_(w1_weight_shard.view(dst_w1_weight.dtype)) + + # Get permute indices and chain them together + permute0 = get_reorder_rows_for_gated_act_gemm_row_indices( + dst_w3_w1_weight) + permute1 = get_shuffle_matrix_a_row_indices( + dst_w3_w1_weight, epilogue_tile_m) + permute = permute0[permute1] + + # Shuffle the weight according to permute indices + processed_w31_weight_shard = torch.ops.trtllm.shuffle_matrix( + dst_w3_w1_weight, permute.to(dst_w3_w1_weight.device)) + # Copy the result into device buffer + dst_w3_w1_weight.copy_(processed_w31_weight_shard.view( + dst_w3_w1_weight.dtype), + non_blocking=True) + # We are done here so do not continue + return + + w31_weight_shard = torch.cat([w3_weight_shard, w1_weight_shard], + dim=0) + + if self.has_w4afp8 and self.sm_version == 89: + import tensorrt_llm.quantization.functional + preprocessor = tensorrt_llm.quantization.functional.preprocess_weights_for_mixed_gemm + packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4 + unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8 + w31_weight_shard = packer( + unpacker(w31_weight_shard.cpu()).T.contiguous()).to( + w31_weight_shard.device) + w31_weight_shard = preprocessor(w31_weight_shard, + torch.quint4x2, + torch.float8_e4m3fn, + 89).view(dst_w3_w1_weight.shape) + dst_w3_w1_weight.copy_(w31_weight_shard.view( + dst_w3_w1_weight.dtype), + non_blocking=True) + + def load_expert_w2_weight(w2_weight, + dst_w2_weight: torch.Tensor, + is_trtllm: bool = False): + w2_weight_shard = load_weight_shard(w2_weight, self.tp_size, + self.tp_rank, + TensorParallelMode.ROW) + if is_trtllm: + # FIXME: this depends on the kernel internals + 
epilogue_tile_m = 128 + + # Keep weights in device buffer + dst_w2_weight.copy_(w2_weight_shard.view(dst_w2_weight.dtype), + non_blocking=True) + # Get permuted result + processed_w2_weight = shuffle_matrix_a(dst_w2_weight, + epilogue_tile_m) + # Copy the result into device buffer + dst_w2_weight.copy_(processed_w2_weight.view( + dst_w2_weight.dtype), + non_blocking=True) + # We are done here so do not continue + return + + if self.has_w4afp8 and self.sm_version == 89: + import tensorrt_llm.quantization.functional + preprocessor = tensorrt_llm.quantization.functional.preprocess_weights_for_mixed_gemm + packer = torch.ops.trtllm.pack_int8_tensor_to_packed_int4 + unpacker = torch.ops.trtllm.unpack_int4_packed_tensor_to_int8 + w2_weight_shard = packer( + unpacker(w2_weight_shard.cpu()).T.contiguous()).to( + w2_weight_shard.device) + w2_weight_shard = preprocessor(w2_weight_shard, torch.quint4x2, + torch.float8_e4m3fn, + 89).view(dst_w2_weight.shape) + + dst_w2_weight.copy_(w2_weight_shard.view(dst_w2_weight.dtype), + non_blocking=True) + + # Use multi-threading to load expert weights in parallel. + # Even though CPython has global interpreter lock (GIL), + # it's still faster to load weights in parallel because it can utilize + # CPU memory bandwidth better. + threads = [] + + for local_slot_id, expert_id in enumerate( + self.initial_local_expert_ids): + # expert_idx is the local slot index of current rank + expert_idx = local_slot_id + + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_weight = weights[f"{expert_id}.w1.weight"] + w3_weight = weights[f"{expert_id}.w3.weight"] + w2_weight = weights[f"{expert_id}.w2.weight"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_w3_weight = weights["gate_up_proj"][expert_id].transpose( + 0, 1) + w1_weight, w3_weight = w1_w3_weight.chunk(2, dim=0) + w2_weight = weights["down_proj"][expert_id].transpose( + 0, 1).contiguous() + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + is_trtllm_nvfp4 = self.is_trtllm( + ) and self.quant_config.quant_mode.has_nvfp4() + + thread = threading.Thread(target=load_expert_w3_w1_weight, + args=(w1_weight, w3_weight, + self.w3_w1_weight.data[expert_idx], + is_trtllm_nvfp4)) + thread.start() + threads.append(thread) + + thread = threading.Thread(target=load_expert_w2_weight, + args=(w2_weight, + self.w2_weight.data[expert_idx], + is_trtllm_nvfp4)) + thread.start() + threads.append(thread) + + for thread in threads: + thread.join() + + if self.quant_config and self.quant_config.quant_mode.has_any_quant( + exclude_kv_cache=True): + if self.quant_config.quant_mode.has_fp8_qdq(): + self._load_fp8_qdq_scales(weights) + elif self.quant_config.quant_mode.has_nvfp4(): + self._load_nvfp4_scales(weights) + elif self.quant_config.quant_mode.has_fp8_block_scales(): + self._load_fp8_block_scales_scales(weights) + elif self.quant_config.quant_mode.is_int4_weight_only_per_group(): + self._load_int4_groupwise_scales(weights) + else: + raise ValueError( + f"unsupported quantization mode: {self.quant_config.quant_mode}" + ) + # Re-setup quant scales after loading weights as the tensors may have been modified. 
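+ # setup_quant_scales() below rebuilds the per-mode scale bundle (e.g. FusedMoEQuantScalesNVFP4) so it reflects the freshly loaded values.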
+ self.setup_quant_scales() + + def _load_fp8_block_scales_scales(self, weights: Dict): + all_w2_scales = [ + load_weight_shard(weights[f"{expert_id}.w2.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.ROW) + for expert_id in self.initial_local_expert_ids + ] + + w2_scales = torch.stack(all_w2_scales) + self.w2_weight_scaling_factor.data.copy_(w2_scales) + + all_w3_scales = [ + load_weight_shard(weights[f"{expert_id}.w3.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + + all_w1_scales = [ + load_weight_shard(weights[f"{expert_id}.w1.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + + w3_w1_scales = torch.cat( + [torch.stack(all_w3_scales), + torch.stack(all_w1_scales)], dim=-2) + self.w3_w1_weight_scaling_factor.data.copy_(w3_w1_scales) + + def _load_fp8_qdq_scales(self, weights: Dict): + # Step1: Load input scales. + def load_expert_fc31_input_scale_fp8_qdq( + w1_input_scale, w3_input_scale, + dst_fc31_input_scale: torch.Tensor): + dst_fc31_input_scale.copy_( + max(w1_input_scale[...].reshape([]), + w3_input_scale[...].reshape([]))) + + def load_expert_fc2_input_scale_fp8_qdq( + w2_input_scale, dst_fc2_input_scale: torch.Tensor): + dst_fc2_input_scale.copy_(w2_input_scale[...].reshape([])) + + tmp_fc31_input_scale = torch.empty(self.num_experts, + dtype=torch.float32) + tmp_fc2_input_scale = torch.empty(self.num_experts, dtype=torch.float32) + for expert_id in range(self.num_experts): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_input_scale = weights[f"{expert_id}.w1.input_scale"] + w3_input_scale = weights[f"{expert_id}.w3.input_scale"] + w2_input_scale = weights[f"{expert_id}.w2.input_scale"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_input_scale = weights[f"gate_up_proj_input_scale"] + w3_input_scale = weights[f"gate_up_proj_input_scale"] + w2_input_scale = weights[f"down_proj_input_scale"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + load_expert_fc31_input_scale_fp8_qdq( + w1_input_scale, w3_input_scale, tmp_fc31_input_scale[expert_id]) + + load_expert_fc2_input_scale_fp8_qdq(w2_input_scale, + tmp_fc2_input_scale[expert_id]) + + # max_fc31_input_scale is the maximum of all w1 input scales and w3 input scales. + # It's used to quantize fc31 input inside the MOE op + max_fc31_input_scale = tmp_fc31_input_scale.max() + # max_fc2_input_scale is the maximum of all w2 input scales. + max_fc2_input_scale = tmp_fc2_input_scale.max() + + # Step2: Load weight scales and requantize w3_w1_weight. 
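+ # Requantization sketch: each shard is dequantized with its own scale and requantized with the shared maximum, e.g. requant_w3 = (w3_fp8 * w3_scale) / max(w1_scale, w3_scale), cast back to float8_e4m3fn (see requantize_expert_w3_w1_weight_fp8_qdq below).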
+ tmp_w3_w1_weight_scale = torch.empty(self.expert_size_per_partition, + dtype=torch.float32) + tmp_w2_weight_scale = torch.empty(self.expert_size_per_partition, + dtype=torch.float32) + + def load_expert_w3_w1_weight_scale_fp8_qdq( + w1_weight_scale, w3_weight_scale, + dst_w3_w1_weight_scale: torch.Tensor): + w1_weight_scale = w1_weight_scale[...].reshape([]) + w3_weight_scale = w3_weight_scale[...].reshape([]) + dst_w3_w1_weight_scale.copy_(max(w1_weight_scale, w3_weight_scale)) + + def requantize_expert_w3_w1_weight_fp8_qdq( + w1_weight_scale, w3_weight_scale, + dst_w3_w1_weight: torch.Tensor): + w1_weight_scale = w1_weight_scale[...].reshape([]) + w3_weight_scale = w3_weight_scale[...].reshape([]) + max_w3_w1_weight_scale = max(w1_weight_scale, w3_weight_scale) + + w3_weight = dst_w3_w1_weight.narrow( + dim=0, start=0, length=self.intermediate_size_per_partition).to( + dtype=self.dtype) + w1_weight = dst_w3_w1_weight.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition).to( + dtype=self.dtype) + dequant_w3_weight = w3_weight * w3_weight_scale + dequant_w1_weight = w1_weight * w1_weight_scale + requant_w3_weight = (dequant_w3_weight / max_w3_w1_weight_scale).to( + torch.float8_e4m3fn) + requant_w1_weight = (dequant_w1_weight / max_w3_w1_weight_scale).to( + torch.float8_e4m3fn) + + dst_w3_w1_weight.narrow( + dim=0, start=0, + length=self.intermediate_size_per_partition).copy_( + requant_w3_weight) + dst_w3_w1_weight.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition).copy_( + requant_w1_weight) + + def load_expert_w2_weight_scale_fp8(w2_weight_scale, + dst_w2_weight_scale: torch.Tensor): + dst_w2_weight_scale.copy_(w2_weight_scale[...].reshape([])) + + for local_slot_id, expert_id in enumerate( + self.initial_local_expert_ids): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_weight_scale = weights[f"{expert_id}.w1.weight_scale"] + w3_weight_scale = weights[f"{expert_id}.w3.weight_scale"] + w2_weight_scale = weights[f"{expert_id}.w2.weight_scale"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_weight_scale = weights[f"gate_up_proj_weight_scale"] + w3_weight_scale = weights[f"gate_up_proj_weight_scale"] + w2_weight_scale = weights[f"down_proj_weight_scale"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + expert_idx = local_slot_id + + load_expert_w3_w1_weight_scale_fp8_qdq( + w1_weight_scale, w3_weight_scale, + tmp_w3_w1_weight_scale[expert_idx]) + + requantize_expert_w3_w1_weight_fp8_qdq( + w1_weight_scale, w3_weight_scale, + self.w3_w1_weight.data[expert_idx]) + + load_expert_w2_weight_scale_fp8(w2_weight_scale, + tmp_w2_weight_scale[expert_idx]) + + # Step3: calculate and store final loaded weights + self.fc31_dequant.data.copy_(tmp_w3_w1_weight_scale * + max_fc31_input_scale) + self.fc2_quant.data.copy_(max_fc2_input_scale.reciprocal()) + self.fc2_dequant.data.copy_(tmp_w2_weight_scale * max_fc2_input_scale) + self.fc31_input_dequant.data.copy_(max_fc31_input_scale) + + def _load_nvfp4_scales(self, weights: Dict): + # Step1: Load input scales. 
+ tmp_fc31_input_scale = torch.empty(self.num_experts, + dtype=torch.float32) + tmp_fc2_input_scale = torch.empty(self.num_experts, dtype=torch.float32) + + def load_expert_fc31_input_scale_nvfp4( + w1_input_scale, w3_input_scale, + dst_fc31_input_scale: torch.Tensor): + w1_input_scale = w1_input_scale[...].reshape([]) + w3_input_scale = w3_input_scale[...].reshape([]) + assert torch.allclose( + w1_input_scale, + w3_input_scale), "w1_input_scale != w3_input_scale" + dst_fc31_input_scale.copy_(w1_input_scale) + + def load_expert_fc2_input_scale_nvfp4( + w2_input_scale, dst_fc2_input_scale: torch.Tensor): + dst_fc2_input_scale.copy_(w2_input_scale[...].reshape([])) + + for expert_id in range(self.num_experts): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_input_scale = weights[f"{expert_id}.w1.input_scale"] + w3_input_scale = weights[f"{expert_id}.w3.input_scale"] + w2_input_scale = weights[f"{expert_id}.w2.input_scale"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_input_scale = weights["gate_up_proj_input_scale"] + w3_input_scale = weights["gate_up_proj_input_scale"] + w2_input_scale = weights["down_proj_input_scale"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + load_expert_fc31_input_scale_nvfp4(w1_input_scale, w3_input_scale, + tmp_fc31_input_scale[expert_id]) + load_expert_fc2_input_scale_nvfp4(w2_input_scale, + tmp_fc2_input_scale[expert_id]) + + # fc31_input_scale is the reciprocal of the maximum of all w1 input scales and w3 input scales. + self.fc31_input_scale.data.copy_( + tmp_fc31_input_scale.max().reciprocal()) + # fc2_input_scale is the reciprocal of the maximum of all w2 input scales. + self.fc2_input_scale.data.copy_(tmp_fc2_input_scale.max().reciprocal()) + + if self.is_trtllm(): + block_scales_dtype = torch.float8_e4m3fn + else: + block_scales_dtype = FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE + + # Step2: Load weight block scales and alphas. 
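+ # Rough flow per expert: shard the w3/w1 block-scale tensors column-wise, copy them into the fused buffer, then reorder into the kernel's layout (shuffle_matrix + nvfp4_block_scale_interleave on the TRTLLM path, plain block-scale interleave for CUTLASS).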
+ def load_expert_w3_w1_weight_scale_nvfp4( + w1_weight_scale, w3_weight_scale, + dst_w3_w1_weight_scale: torch.Tensor, is_trtllm: bool): + w1_weight_scale = load_weight_shard(w1_weight_scale, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + w3_weight_scale = load_weight_shard(w3_weight_scale, self.tp_size, + self.tp_rank, + TensorParallelMode.COLUMN) + # Keep weights in device buffer + # w3 + dst_w3_weight_scale = dst_w3_w1_weight_scale.narrow( + dim=0, start=0, length=self.intermediate_size_per_partition) + dst_w3_weight_scale.copy_( + w3_weight_scale.view(dst_w3_weight_scale.dtype)) + + # w1 + dst_w1_weight_scale = dst_w3_w1_weight_scale.narrow( + dim=0, + start=self.intermediate_size_per_partition, + length=self.intermediate_size_per_partition) + dst_w1_weight_scale.copy_( + w1_weight_scale.view(dst_w1_weight_scale.dtype)) + + orig_shape = dst_w3_w1_weight_scale.shape + + if is_trtllm: + # FIXME + epilogue_tile_m = 128 + + # Get permute indices and chain them together + permute0 = get_reorder_rows_for_gated_act_gemm_row_indices( + dst_w3_w1_weight_scale) + permute1 = get_shuffle_matrix_sf_a_row_indices( + dst_w3_w1_weight_scale.view(float4_sf_dtype), + epilogue_tile_m, 16) + permute = permute0[permute1] + + # Shuffle the weight according to permute indices + w3_w1_weight_scale = torch.ops.trtllm.shuffle_matrix( + dst_w3_w1_weight_scale.view(float4_sf_dtype), + permute.cuda()) + # Assert should only be removed during debugging + assert w3_w1_weight_scale.is_cuda, "w3_w1_weight_scale.is_cuda should be true or suffer from slow speed" + # Interleave the weight. + processed_w3_w1_weight_scale = torch.ops.tensorrt_llm.nvfp4_block_scale_interleave( + w3_w1_weight_scale.view(float4_sf_dtype).reshape( + orig_shape)) + # Copy the result into device buffer + dst_w3_w1_weight_scale.copy_( + processed_w3_w1_weight_scale.view( + block_scales_dtype).reshape(orig_shape)) + else: + dst_w3_w1_weight_scale.copy_( + torch.ops.tensorrt_llm.nvfp4_block_scale_interleave( + dst_w3_w1_weight_scale.view(float4_sf_dtype)).view( + block_scales_dtype).reshape(orig_shape)) + + def load_expert_w2_weight_scale_nvfp4(w2_weight_scale, + dst_w2_weight_scale: torch.Tensor, + is_trtllm: bool): + w2_weight_scale = load_weight_shard(w2_weight_scale, self.tp_size, + self.tp_rank, + TensorParallelMode.ROW) + # Keep weights in device buffer + dst_w2_weight_scale.copy_( + w2_weight_scale.view(dst_w2_weight_scale.dtype)) + + orig_shape = dst_w2_weight_scale.shape + if is_trtllm: + epilogue_tile_m = 128 # FIXME: read from kernel + # Assert should only be removed during debugging + assert dst_w2_weight_scale.is_cuda, "dst_w2_weight_scale.is_cuda should be true or suffer from slow speed" + # Interleave the weight and copy + dst_w2_weight_scale.copy_( + shuffle_matrix_sf_a( + dst_w2_weight_scale.view(float4_sf_dtype), + epilogue_tile_m, + 16).view(block_scales_dtype).reshape(orig_shape)) + else: + dst_w2_weight_scale.copy_( + torch.ops.tensorrt_llm.nvfp4_block_scale_interleave( + dst_w2_weight_scale.view(float4_sf_dtype)).view( + block_scales_dtype).reshape(orig_shape)) + + def load_expert_fc31_alpha_nvfp4(w1_weight_scale_2, w3_weight_scale_2, + final_fc31_input_scale: torch.Tensor, + dst_fc31_alpha: torch.Tensor): + w1_weight_scale_2 = w1_weight_scale_2[...].reshape([]) + w3_weight_scale_2 = w3_weight_scale_2[...].reshape([]) + assert torch.allclose( + w1_weight_scale_2, + w3_weight_scale_2), "w1_weight_scale_2 != w3_weight_scale_2" + + w3_w1_weight_scale_2 = 1.0 / w1_weight_scale_2 + dst_fc31_alpha.copy_( + 1.0 / 
(final_fc31_input_scale * w3_w1_weight_scale_2)) + + def load_expert_fc2_alpha_nvfp4(w2_weight_scale_2, + final_fc2_input_scale: torch.Tensor, + dst_w2_alpha: torch.Tensor): + w2_weight_scale_2 = 1.0 / w2_weight_scale_2[...].reshape([]) + dst_w2_alpha.copy_(1.0 / + (final_fc2_input_scale * w2_weight_scale_2)) + + for local_slot_id, expert_id in enumerate( + self.initial_local_expert_ids): + if self.weight_loading_mode == MoEWeightLoadingMode.VANILLA: + w1_weight_scale = weights[f"{expert_id}.w1.weight_scale"] + w3_weight_scale = weights[f"{expert_id}.w3.weight_scale"] + w2_weight_scale = weights[f"{expert_id}.w2.weight_scale"] + w1_weight_scale_2 = weights[f"{expert_id}.w1.weight_scale_2"] + w3_weight_scale_2 = weights[f"{expert_id}.w3.weight_scale_2"] + w2_weight_scale_2 = weights[f"{expert_id}.w2.weight_scale_2"] + elif self.weight_loading_mode == MoEWeightLoadingMode.FUSED_GATE_UP_PROJ: + w1_w3_weight_scale = weights["gate_up_proj_weight_scale"][ + expert_id].transpose(0, 1).contiguous() + w1_weight_scale, w3_weight_scale = w1_w3_weight_scale.chunk( + 2, dim=0) + w2_weight_scale = weights["down_proj_weight_scale"][ + expert_id].transpose(0, 1).contiguous() + w1_weight_scale_2 = weights["gate_up_proj_weight_scale_2"] + w3_weight_scale_2 = weights["gate_up_proj_weight_scale_2"] + w2_weight_scale_2 = weights["down_proj_weight_scale_2"] + else: + raise NotImplementedError( + f"Unknown weight loading mode in MoE: {self.weight_loading_mode}" + ) + + expert_idx = local_slot_id + + load_expert_w3_w1_weight_scale_nvfp4( + w1_weight_scale, w3_weight_scale, + self.w3_w1_weight_scale.data[expert_idx], self.is_trtllm()) + load_expert_w2_weight_scale_nvfp4( + w2_weight_scale, self.w2_weight_scale.data[expert_idx], + self.is_trtllm()) + + load_expert_fc31_alpha_nvfp4(w1_weight_scale_2, w3_weight_scale_2, + self.fc31_input_scale.data, + self.fc31_alpha.data[expert_idx]) + load_expert_fc2_alpha_nvfp4(w2_weight_scale_2, + self.fc2_input_scale.data, + self.fc2_alpha.data[expert_idx]) + if self.is_trtllm(): + self.fc31_scale_c.data.copy_(self.fc2_input_scale.data * + self.fc31_alpha.data, + non_blocking=True) + + def _load_int4_groupwise_scales(self, weights: Dict): + # fc31 scales + assert (len(self.interleave) == 2) + all_w3_input_scales = [ + load_weight_shard(weights[f"{expert_id}.w3.input_scale"]) + for expert_id in self.initial_local_expert_ids + ] + all_w1_input_scales = [ + load_weight_shard(weights[f"{expert_id}.w1.input_scale"]) + for expert_id in self.initial_local_expert_ids + ] + all_w3_w1_input_scales = torch.max(torch.stack(all_w3_input_scales), + torch.stack(all_w1_input_scales)) + all_w3_w1_input_scales = torch.ones_like( + all_w3_w1_input_scales) * all_w3_w1_input_scales.max() + self.fc31_act_scale.data.copy_(1 / all_w3_w1_input_scales) + self.fc31_alpha.data.copy_(all_w3_w1_input_scales.float()) + + all_w3_scales = [ + load_weight_shard(weights[f"{expert_id}.w3.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + all_w1_scales = [ + load_weight_shard(weights[f"{expert_id}.w1.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.COLUMN) + for expert_id in self.initial_local_expert_ids + ] + all_w3_w1_scales = torch.cat( + [torch.stack(all_w3_scales), + torch.stack(all_w1_scales)], dim=-2) + if self.sm_version == 89: + w3_w1_scales = all_w3_w1_scales.to(torch.float16).view(self.dtype) + else: + w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16).view(self.dtype) + w3_w1_s_shape = 
w3_w1_scales.shape + w3_w1_scales_interleaved = w3_w1_scales.reshape( + w3_w1_s_shape[0], w3_w1_s_shape[1], + (w3_w1_s_shape[2] // self.interleave[0]), self.interleave[0]) + w3_w1_scales_interleaved = w3_w1_scales_interleaved.permute(0, 2, 1, 3) + w3_w1_scales_interleaved = w3_w1_scales_interleaved.reshape( + w3_w1_s_shape[0], w3_w1_s_shape[2] // self.interleave[0], + w3_w1_s_shape[1] * self.interleave[0]) + self.fc31_weight_scale.data.copy_(w3_w1_scales_interleaved.contiguous()) + + # fc2 scales + all_w2_input_scales = [ + load_weight_shard(weights[f"{expert_id}.w2.input_scale"]) + for expert_id in self.initial_local_expert_ids + ] + all_w2_input_scales = torch.stack(all_w2_input_scales).to(self.dtype) + all_w2_input_scales = torch.ones_like( + all_w2_input_scales) * all_w2_input_scales.max() + self.fc2_act_scale.data.copy_(1 / all_w2_input_scales) + self.fc2_alpha.data.copy_(all_w2_input_scales.float()) + + all_w2_scales = [ + load_weight_shard(weights[f"{expert_id}.w2.weight_scale_inv"], + self.tp_size, self.tp_rank, + TensorParallelMode.ROW) + for expert_id in self.initial_local_expert_ids + ] + if self.sm_version == 89: + w2_scales = torch.stack(all_w2_scales).to(torch.float16).view( + self.dtype) + else: + w2_scales = torch.stack(all_w2_scales).to(torch.bfloat16).view( + self.dtype) + w2_s_shape = w2_scales.shape + w2_scales_interleaved = w2_scales.reshape( + w2_s_shape[0], w2_s_shape[1], (w2_s_shape[2] // self.interleave[1]), + self.interleave[1]) + w2_scales_interleaved = w2_scales_interleaved.permute(0, 2, 1, 3) + w2_scales_interleaved = w2_scales_interleaved.reshape( + w2_s_shape[0], w2_s_shape[2] // self.interleave[1], + w2_s_shape[1] * self.interleave[1]) + self.fc2_weight_scale.data.copy_(w2_scales_interleaved.contiguous()) + + +class FusedMoEQuantScalesFP8(NamedTuple): + fc1_dequant: torch.Tensor + fc2_quant: torch.Tensor + fc2_dequant: torch.Tensor + fc1_input_dequant: torch.Tensor + + +class FusedMoEQuantScalesNVFP4(NamedTuple): + fc1_act_global: torch.Tensor + fc1_weight_block: torch.Tensor + # fc1_global_scale = 1.0 / (fc1_weight_global_scale * fc1_act_global_scale) + fc1_global: torch.Tensor + + fc2_act_global: torch.Tensor + fc2_weight_block: torch.Tensor + # fc2_global_scale = 1.0 / (fc2_weight_global_scale * fc2_act_global_scale) + fc2_global: torch.Tensor + + +class FusedMoEQuantScalesFP8BlockScales(NamedTuple): + fc_weight_scales: torch.Tensor + proj_weight_scales: torch.Tensor + + +class FusedMoEQuantScalesW4A8(NamedTuple): + scale_1_interleaved: torch.Tensor + scale_2_interleaved: torch.Tensor + pre_quant_scale_1: torch.Tensor + pre_quant_scale_2: torch.Tensor + zero_1: torch.Tensor + zero_2: torch.Tensor + alpha_1: torch.Tensor + alpha_2: torch.Tensor diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py index e65d96daafb..f6a0e9323f7 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py @@ -69,7 +69,8 @@ def __init__( self.mapping = model_config.mapping self.parallel_size = self.mapping.tp_size - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py index bcf51067a72..3cc73d15dd8 100644 --- 
a/tensorrt_llm/_torch/modules/fused_moe/interface.py +++ b/tensorrt_llm/_torch/modules/fused_moe/interface.py @@ -78,7 +78,8 @@ def __init__( self.parallel_size = self.mapping.tp_size self.intermediate_size_per_partition = intermediate_size // self.tp_size - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) @abstractmethod def create_weights(self): diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index b0062d043e9..bd554b51089 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -17,6 +17,7 @@ from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig +from ..model_config import ModelConfig from ..utils import Fp4QuantizedTensor @@ -694,7 +695,8 @@ def __init__( self.in_features = local_in_features self.out_features = local_out_features - self.all_reduce = AllReduce(self.mapping) if reduce_output else None + self.all_reduce = AllReduce(model_config=ModelConfig( + mapping=self.mapping)) if reduce_output else None self._weights_created = False self.reduce_output = reduce_output self.use_custom_cublas_mm = use_custom_cublas_mm diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py index 533b21b0502..041ee1f6dad 100644 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ b/tensorrt_llm/_torch/pyexecutor/config.py @@ -86,6 +86,7 @@ class PyTorchConfig: # If true, enable min-latency mode. Currently only used for Llama4. enable_min_latency: bool = False + allreduce_strategy: str = "AUTO" EXETENDED_EXECUTOR_CONFIG_FIELDS = [ diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index c15e00c8568..e67156ec1ab 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -3880,6 +3880,7 @@ class AllReduceStrategy(IntEnum): ONESHOT = 4 TWOSHOT = 5 LOWPRECISION = 6 + MNNVL = 7 class AllReduceFusionOp(IntEnum): diff --git a/tests/unittest/_torch/multi_gpu/test_allreduce.py b/tests/unittest/_torch/multi_gpu/test_allreduce.py index fec42bc3384..820fbb0b684 100644 --- a/tests/unittest/_torch/multi_gpu/test_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_allreduce.py @@ -124,7 +124,7 @@ def e2m1_and_ufp8sf_scale_to_float_v2(e2m1_tensor, ).cuda() norm = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() - allreduce = AllReduce(mapping=mapping).cuda() + allreduce = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() scale = torch.tensor(1.0, dtype=torch.float32).cuda() linear.load_weights([dict(weight=weights[0])]) diff --git a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py index 3aa6871fbe4..3d8933f4115 100644 --- a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py @@ -22,6 +22,7 @@ from mpi4py.futures import MPIPoolExecutor from tensorrt_llm._torch.distributed import AllReduceStrategy +from tensorrt_llm._torch.model_config import ModelConfig cloudpickle.register_pickle_by_value(sys.modules[__name__]) MPI.pickle.__init__( @@ -87,8 +88,9 @@ def __init__(self, rank=self.rank, ) - self.allreduce = AllReduce(mapping=self.mapping, - strategy=self.strategy).cuda() + self.allreduce = AllReduce(model_config=ModelConfig( + mapping=self.mapping, + allreduce_backend=self.strategy), ).cuda() self.input_tensors = [] for i in range(self.world_size): diff --git 
a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index e26946b1fb0..17e8c2636fd 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -26,6 +26,8 @@ import tensorrt_llm from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams) +from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping cloudpickle.register_pickle_by_value(sys.modules[__name__]) @@ -97,13 +99,15 @@ def row_linear_residual_norm_fusion_forward( MPI.COMM_WORLD.barrier() - allreduce = AllReduce(mapping=Mapping( - world_size=tensor_parallel_size, - tp_size=tensor_parallel_size, - rank=tensor_parallel_rank, - ), - dtype=dtype, - ar_backend="MNVL") + allreduce = AllReduce( + model_config=ModelConfig(mapping=Mapping( + world_size=tensor_parallel_size, + tp_size=tensor_parallel_size, + rank=tensor_parallel_rank, + ), + strategy=AllReduceStrategy.MNNVL), + dtype=dtype, + ) # Since all the modules here are provided by TRT-LLM, # so it has to be fullgraph compatible diff --git a/tests/unittest/_torch/multi_gpu/test_user_buffers.py b/tests/unittest/_torch/multi_gpu/test_user_buffers.py index 32b0af5ef8c..1207252c134 100644 --- a/tests/unittest/_torch/multi_gpu/test_user_buffers.py +++ b/tests/unittest/_torch/multi_gpu/test_user_buffers.py @@ -17,6 +17,7 @@ from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, AllReduceStrategy, userbuffers_allreduce_finalize) +from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm.mapping import Mapping @@ -128,7 +129,8 @@ def run_single_rank_ar_rms_norm(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(mapping, strategy=AllReduceStrategy.UB) + ar = AllReduce(model_config=ModelConfig( + mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM, @@ -220,7 +222,8 @@ def run_single_rank_ar_rms_norm_fp8(tensor_parallel_size, a, b, c, gamma, tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(mapping, strategy=AllReduceStrategy.UB) + ar = AllReduce(model_config=ModelConfig( + mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8, @@ -605,7 +608,8 @@ def run_single_rank_ar_rms_norm_fp4(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(mapping, strategy=AllReduceStrategy.UB) + ar = AllReduce(model_config=ModelConfig( + mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4, @@ -692,9 +696,9 @@ def __init__(self, tp_size, rank, hidden_size, dtype, eps, norm0_gamma, tp_size=tp_size, rank=rank, ) - self.ar_0 = AllReduce(mapping).cuda() - self.ar_1 = AllReduce(mapping).cuda() - self.ar_2 = AllReduce(mapping).cuda() + self.ar_0 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + self.ar_1 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + self.ar_2 = 
AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() self.norm0 = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() self.norm1 = RMSNorm(hidden_size=hidden_size, eps=eps, From bd1b058d085e8cf16c58ef615f3ce59be36a6f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hui=20Gao=C3=A2=C2=80?= Date: Mon, 9 Jun 2025 04:50:36 -0700 Subject: [PATCH 6/9] Fix docs test cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hui Gao Signed-off-by: Hui Gao†--- tests/unittest/api_stability/references_committed/llm.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index f2c90635fbe..a321b13e64d 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -105,6 +105,9 @@ methods: kv_cache_config: annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null + allreduce_strategy: + annotation: Optional[str] + default: AUTO return_annotation: None generate: parameters: From d93a8760c03a92d29ab188460ee25b8f64fe5af3 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 04:36:05 +0000 Subject: [PATCH 7/9] Address comments Signed-off-by: Hui Gao --- .../advanced/lowprecision-pcie-allreduce.md | 14 ++-- .../_torch/auto_deploy/distributed/trtllm.py | 2 +- tensorrt_llm/_torch/distributed/ops.py | 64 +++++++++---------- tensorrt_llm/_torch/model_config.py | 7 +- .../_torch/models/modeling_deepseekv3.py | 8 ++- tensorrt_llm/_torch/models/modeling_llama.py | 8 ++- .../_torch/models/modeling_qwen3_moe.py | 6 +- tensorrt_llm/_torch/modules/fused_moe.py | 6 +- tensorrt_llm/_torch/modules/linear.py | 5 +- .../_torch/multi_gpu/test_allreduce.py | 2 +- .../multi_gpu/test_lowprecision_allreduce.py | 6 +- .../_torch/multi_gpu/test_mnnvl_allreduce.py | 5 +- .../_torch/multi_gpu/test_user_buffers.py | 16 ++--- .../references_committed/llm.yaml | 2 +- 14 files changed, 72 insertions(+), 79 deletions(-) diff --git a/docs/source/advanced/lowprecision-pcie-allreduce.md b/docs/source/advanced/lowprecision-pcie-allreduce.md index 57ca754c4e1..b7ab5070370 100644 --- a/docs/source/advanced/lowprecision-pcie-allreduce.md +++ b/docs/source/advanced/lowprecision-pcie-allreduce.md @@ -41,12 +41,12 @@ The Low-Precision-AllReduce algorithm can be enabled in two ways: ``` AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.LOWPRECISION); ``` -2. **Environment variable control** with AUTO strategy: + +2. Enable by LlmArgs ``` -// In your code -AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.AUTO); -// Set environment variable before running -export FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY=1 +Set allreduce_strategy field in LlmArgs. +Candidates of strategies are "AUTO", "NCCL", "UB", "MINLATENCY", "ONESHOT", "TWOSHOT", "LOWPRECISION" and "MNNVL". +If no strategy is set, AUTO will be set. ``` ## Performance and Accuracy Considerations @@ -58,8 +58,4 @@ Low-Precision-AllReduce reduces communication volume by using FP8 data format fo Users should evaluate the precision impact on their specific models and workloads. -## Environment Variables - -- `FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY`: When set to `1`, forces the use of low-precision algorithm with AUTO strategy. If the algorithm determines it cannot provide performance benefits, it will automatically fall back to other strategies. 
- **Note**: When compiling TensorRT-LLM without enabling the `ENABLE_FP8` option, setting Low Precision allreduce will not take effect. diff --git a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py index e0ac0db1b8e..dd9313df0f1 100644 --- a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py +++ b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py @@ -17,7 +17,7 @@ def trtllm_allreduce(tensor, op, all_reduce_params=None): rank, world_size = get_rank_world_size() assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op." p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank) - torch_op = AllReduce(p_config) + torch_op = AllReduce(mapping=p_config) return torch_op(tensor, all_reduce_params=all_reduce_params) @torch.library.custom_op( diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 7e18458a0f6..1c8d8023fa4 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -15,8 +15,6 @@ from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper -from ..model_config import ModelConfig - _thread_local = threading.local() @@ -311,16 +309,16 @@ def __init__(self, mapping: Mapping, dtype: torch.dtype): self.mapping = mapping self.dtype = dtype assert ( - dtype in MNNVLAllReduce.get_supported_dtype() + dtype in MNNVLAllReduce.get_supported_dtypes() and (not mapping.has_cp()) - ), "MNNVL all reduce only support dtype {MNNVLAllReduce.get_supported_dtype()} and without cp." + ), "MNNVL all reduce only supports dtype {MNNVLAllReduce.get_supported_dtypes()} and without cp." self.mcast_buffer_mnnvl, self.buffer_mnnvl, self.buffer_flags_mnnvl, self.max_num_elements_mnnvl = get_allreduce_mnnvl_workspace( self.mapping, dtype) @staticmethod - def get_supported_dtype(): - return [torch.bfloat16, torch.float32] + def get_supported_dtypes(): + return (torch.bfloat16, torch.float32) def forward( self, @@ -377,38 +375,38 @@ def forward( class AllReduce(nn.Module): def __init__(self, - dtype: Optional[torch.dtype] = None, - model_config: ModelConfig = ModelConfig()): + mapping: Mapping, + strategy: AllReduceStrategy = AllReduceStrategy.AUTO, + dtype: Optional[torch.dtype] = None): super().__init__() """ AllReduce is a module that performs an all-reduce operation on a tensor. Args: - model_config (ModelConfig): mapping and strategy in it are used. - mapping (Mapping): The parallel mapping config. - strategy (AllReduceStrategy): - The following all-reduce strategies are supported: + mapping (Mapping): The parallel mapping config. + strategy (AllReduceStrategy): + The following all-reduce strategies are supported: - - UB: AllReduce uses user-buffer based all-reduce kernel. + - UB: AllReduce uses user-buffer based all-reduce kernel. - - NCCL: Use NCCL allreduce. + - NCCL: Use NCCL allreduce. - - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. + - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel. - - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. + - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy. - - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. - Should only be used on topologies with PCIe switches and without NVLink. - This strategy may result in some precision loss but can improve performance - on specific hardware configurations. 
+ - LOWPRECISION: AllReduce quantizes data to lower precision for transmission. + Should only be used on topologies with PCIe switches and without NVLink. + This strategy may result in some precision loss but can improve performance + on specific hardware configurations. - All strategies support the following operations: - - NONE (AllReduce only) - - RESIDUAL_RMS_NORM - - RESIDUAL_RMS_NORM_QUANT_FP8 - - RESIDUAL_RMS_NORM_QUANT_NVFP4 - - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 - - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 + All strategies support the following operations: + - NONE (AllReduce only) + - RESIDUAL_RMS_NORM + - RESIDUAL_RMS_NORM_QUANT_FP8 + - RESIDUAL_RMS_NORM_QUANT_NVFP4 + - RESIDUAL_RMS_NORM_OUT_QUANT_FP8 + - RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4 Note: NCCL, UB, and LOWPRECISION strategies only support consequent kernel calls instead of fused operations. @@ -417,18 +415,14 @@ def __init__(self, For the reference implementation for each pattern, please refer to the following unit test: https://github.com/NVIDIA/TensorRT-LLM/blob/main/tests/unittest/_torch/multi_gpu/test_allreduce.py - The LOWPRECISION strategy can be selected either by directly specifying it in the constructor - or by setting the environment variable FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY when using - the AUTO strategy. + The LOWPRECISION strategy can be selected either by directly specifying it in the constructor. """ - self.mapping = model_config.mapping + self.mapping = mapping self.workspace = None - self.strategy = model_config.allreduce_backend + self.strategy = strategy self.mnnvl_allreduce = None - self.force_low_precision_env = os.environ.get( - "FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY") if self.mapping.tp_size > 1: # When Strategy is UB, it is guaranteed that the workspace is not used. 
if self.strategy != AllReduceStrategy.UB: @@ -438,7 +432,7 @@ def __init__(self, # Initialize MNNVL AllReduce if needed if self.strategy == AllReduceStrategy.MNNVL and ( - dtype and dtype in MNNVLAllReduce.get_supported_dtype() + dtype and dtype in MNNVLAllReduce.get_supported_dtypes() ) and (not self.mapping.has_cp()): self.mnnvl_allreduce = MNNVLAllReduce(self.mapping, dtype) if dtype else None diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index 0b0e8a9210e..05471144f5f 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -109,12 +109,12 @@ def __post_init__(self): self.is_generation = self.is_generation_model( self.pretrained_config.architectures) - def map_ar_strategy(strategy: str = "AUTO"): + def get_all_reduce_strategy(strategy: str = "AUTO"): maps = { "AUTO": AllReduceStrategy.AUTO, "NCCL": AllReduceStrategy.NCCL, "UB": AllReduceStrategy.UB, - "MIN_LATENCY": AllReduceStrategy.MIN_LATENCY, + "MINLATENCY": AllReduceStrategy.MIN_LATENCY, "ONESHOT": AllReduceStrategy.ONESHOT, "TWOSHOT": AllReduceStrategy.TWOSHOT, "LOWPRECISION": AllReduceStrategy.LOWPRECISION, @@ -124,7 +124,8 @@ def map_ar_strategy(strategy: str = "AUTO"): return maps[key] if key in maps else AllReduceStrategy.AUTO if isinstance(self.allreduce_backend, str): - self.allreduce_backend = map_ar_strategy(self.allreduce_backend) + self.allreduce_backend = get_all_reduce_strategy( + self.allreduce_backend) @property def fuse_pos_embd(self): diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 67973dc90ba..65fc249f48b 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -399,7 +399,8 @@ def __init__(self, overridden_tp_size=shared_tp_size, reduce_output=False) - self.allreduce = AllReduce(model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared] self.event_dict = { key: torch.cuda.Event() @@ -628,8 +629,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], eps=config.rms_norm_eps, dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(dtype=config.torch_dtype, - model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend, + dtype=config.torch_dtype) self.moe_allreduce = MoEAllReduce(self.mapping) self.next_layer_layernorm: RMSNorm = None diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index a852560af10..4d2b677d46e 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -282,7 +282,10 @@ def __init__( quant_config=None) self.mapping = model_config.mapping - self.all_reduce = AllReduce(model_config=model_config) + self.all_reduce = AllReduce( + mapping=model_config.mapping, + strategy=model_config.allreduce_backend, + ) self.moe_event = [torch.cuda.Event(), torch.cuda.Event()] self.aux_stream = aux_stream @@ -414,7 +417,8 @@ def __init__( dtype=config.torch_dtype) self.mapping = model_config.mapping - self.all_reduce = AllReduce(model_config=model_config) + self.all_reduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None diff --git 
a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index 6a1e13b1467..90ded9ad9c8 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -89,7 +89,8 @@ def __init__( self.top_k = config.num_experts_per_tok self.enable_attention_dp = model_config.mapping.enable_attention_dp self.mapping = model_config.mapping - self.allreduce = AllReduce(model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.enable_alltoall = Qwen3MoE.should_enable_alltoall( model_config, self.top_k) if self.enable_alltoall: @@ -202,7 +203,8 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig], dtype=config.torch_dtype) self.layer_idx = layer_idx - self.allreduce = AllReduce(model_config=model_config) + self.allreduce = AllReduce(mapping=model_config.mapping, + strategy=model_config.allreduce_backend) self.next_layer_layernorm: RMSNorm = None self.fusion_config = EagerFusionConfig() diff --git a/tensorrt_llm/_torch/modules/fused_moe.py b/tensorrt_llm/_torch/modules/fused_moe.py index 7df752814e2..ba21b3eb738 100755 --- a/tensorrt_llm/_torch/modules/fused_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe.py @@ -355,7 +355,8 @@ def __init__( self.mapping = model_config.mapping self.parallel_size = self.mapping.tp_size - self.all_reduce = AllReduce(self.mapping) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) self.intermediate_size_per_partition = intermediate_size // self.tp_size @@ -933,7 +934,8 @@ def __init__( self.mapping = model_config.mapping self.parallel_size = self.mapping.tp_size - self.all_reduce = AllReduce(model_config=model_config) + self.all_reduce = AllReduce(mapping=self.mapping, + strategy=model_config.allreduce_backend) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index bd554b51089..48665b7b41e 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -17,7 +17,6 @@ from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig -from ..model_config import ModelConfig from ..utils import Fp4QuantizedTensor @@ -695,8 +694,8 @@ def __init__( self.in_features = local_in_features self.out_features = local_out_features - self.all_reduce = AllReduce(model_config=ModelConfig( - mapping=self.mapping)) if reduce_output else None + self.all_reduce = AllReduce( + mapping=self.mapping) if reduce_output else None self._weights_created = False self.reduce_output = reduce_output self.use_custom_cublas_mm = use_custom_cublas_mm diff --git a/tests/unittest/_torch/multi_gpu/test_allreduce.py b/tests/unittest/_torch/multi_gpu/test_allreduce.py index 820fbb0b684..fec42bc3384 100644 --- a/tests/unittest/_torch/multi_gpu/test_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_allreduce.py @@ -124,7 +124,7 @@ def e2m1_and_ufp8sf_scale_to_float_v2(e2m1_tensor, ).cuda() norm = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() - allreduce = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + allreduce = AllReduce(mapping=mapping).cuda() scale = torch.tensor(1.0, dtype=torch.float32).cuda() linear.load_weights([dict(weight=weights[0])]) diff --git a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py 
index 3d8933f4115..5245c454be4 100644 --- a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py @@ -22,7 +22,6 @@ from mpi4py.futures import MPIPoolExecutor from tensorrt_llm._torch.distributed import AllReduceStrategy -from tensorrt_llm._torch.model_config import ModelConfig cloudpickle.register_pickle_by_value(sys.modules[__name__]) MPI.pickle.__init__( @@ -88,9 +87,8 @@ def __init__(self, rank=self.rank, ) - self.allreduce = AllReduce(model_config=ModelConfig( - mapping=self.mapping, - allreduce_backend=self.strategy), ).cuda() + self.allreduce = AllReduce(mapping=self.mapping, + allreduce_backend=self.strategy).cuda() self.input_tensors = [] for i in range(self.world_size): diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 17e8c2636fd..595ff09d12e 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -26,7 +26,6 @@ import tensorrt_llm from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams) -from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping @@ -100,12 +99,12 @@ def row_linear_residual_norm_fusion_forward( MPI.COMM_WORLD.barrier() allreduce = AllReduce( - model_config=ModelConfig(mapping=Mapping( + mapping=Mapping( world_size=tensor_parallel_size, tp_size=tensor_parallel_size, rank=tensor_parallel_rank, ), - strategy=AllReduceStrategy.MNNVL), + strategy=AllReduceStrategy.MNNVL, dtype=dtype, ) diff --git a/tests/unittest/_torch/multi_gpu/test_user_buffers.py b/tests/unittest/_torch/multi_gpu/test_user_buffers.py index 1207252c134..66934a7ccc4 100644 --- a/tests/unittest/_torch/multi_gpu/test_user_buffers.py +++ b/tests/unittest/_torch/multi_gpu/test_user_buffers.py @@ -17,7 +17,6 @@ from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, AllReduceStrategy, userbuffers_allreduce_finalize) -from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm.mapping import Mapping @@ -129,8 +128,7 @@ def run_single_rank_ar_rms_norm(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(model_config=ModelConfig( - mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) + ar = AllReduce(mapping=mapping, strategy=AllReduceStrategy.UB) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM, @@ -222,8 +220,7 @@ def run_single_rank_ar_rms_norm_fp8(tensor_parallel_size, a, b, c, gamma, tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(model_config=ModelConfig( - mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) + ar = AllReduce(mapping=mapping, strategy=AllReduceStrategy.UB) ar_params = AllReduceParams( strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8, @@ -608,8 +605,7 @@ def run_single_rank_ar_rms_norm_fp4(tensor_parallel_size, a, b, c, gamma): tp_size=tensor_parallel_size, rank=rank, ) - ar = AllReduce(model_config=ModelConfig( - mapping=mapping, allreduce_backend=AllReduceStrategy.UB)) + ar = AllReduce(mapping=mapping, strategy=AllReduceStrategy.UB) ar_params = AllReduceParams( 
strategy=AllReduceStrategy.UB, fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4, @@ -696,9 +692,9 @@ def __init__(self, tp_size, rank, hidden_size, dtype, eps, norm0_gamma, tp_size=tp_size, rank=rank, ) - self.ar_0 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() - self.ar_1 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() - self.ar_2 = AllReduce(model_config=ModelConfig(mapping=mapping)).cuda() + self.ar_0 = AllReduce(mapping=mapping).cuda() + self.ar_1 = AllReduce(mapping=mapping).cuda() + self.ar_2 = AllReduce(mapping=mapping).cuda() self.norm0 = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() self.norm1 = RMSNorm(hidden_size=hidden_size, eps=eps, diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index a321b13e64d..cbb0f5681e1 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -106,7 +106,7 @@ methods: annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null allreduce_strategy: - annotation: Optional[str] + annotation: Optional[Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT', 'LOWPRECISION', 'MNNVL']] default: AUTO return_annotation: None generate: From a7fab8b1ce9e8bc7802ca6e051961e8c76c06b3b Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Tue, 10 Jun 2025 01:10:12 +0000 Subject: [PATCH 8/9] Address comments to remove code setting strategies to Linear whem no mappingg Signed-off-by: Hui Gao --- .../out_of_tree_example/modeling_opt.py | 34 +++++++++--------- .../_torch/auto_deploy/distributed/trtllm.py | 4 +-- tensorrt_llm/_torch/model_config.py | 8 ++--- .../_torch/models/modeling_deepseekv3.py | 4 +-- tensorrt_llm/_torch/models/modeling_llama.py | 6 ++-- .../_torch/models/modeling_nemotron_nas.py | 2 +- .../_torch/models/modeling_qwen3_moe.py | 4 +-- tensorrt_llm/_torch/modules/attention.py | 14 ++++---- tensorrt_llm/_torch/modules/fused_moe.py | 4 +-- .../modules/fused_moe/fused_moe_vanilla.py | 2 +- .../_torch/modules/fused_moe/interface.py | 2 +- tensorrt_llm/_torch/modules/gated_mlp.py | 4 +-- tensorrt_llm/_torch/modules/linear.py | 7 ++-- .../_torch/modules/mamba/mamba2_mixer.py | 36 +++++++++---------- tensorrt_llm/_torch/modules/mlp.py | 6 ++-- .../multi_gpu/test_lowprecision_allreduce.py | 2 +- 16 files changed, 71 insertions(+), 68 deletions(-) diff --git a/examples/pytorch/out_of_tree_example/modeling_opt.py b/examples/pytorch/out_of_tree_example/modeling_opt.py index 11c8b8d6746..320a431bc74 100644 --- a/examples/pytorch/out_of_tree_example/modeling_opt.py +++ b/examples/pytorch/out_of_tree_example/modeling_opt.py @@ -64,24 +64,22 @@ def __init__( config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine, dtype=config.torch_dtype) - self.fc1 = Linear( - config.hidden_size, - config.ffn_dim, - bias=config.enable_bias, - dtype=config.torch_dtype, - mapping=model_config.mapping, - tensor_parallel_mode=TensorParallelMode.COLUMN, - quant_config=model_config.get_quant_config(), - ) - self.fc2 = Linear( - config.ffn_dim, - config.hidden_size, - bias=config.enable_bias, - dtype=config.torch_dtype, - mapping=model_config.mapping, - tensor_parallel_mode=TensorParallelMode.ROW, - quant_config=model_config.get_quant_config(), - ) + self.fc1 = Linear(config.hidden_size, + config.ffn_dim, + bias=config.enable_bias, + dtype=config.torch_dtype, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + 
quant_config=model_config.get_quant_config(), + allreduce_strategy=model_config.allreduce_strategy) + self.fc2 = Linear(config.ffn_dim, + config.hidden_size, + bias=config.enable_bias, + dtype=config.torch_dtype, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.ROW, + quant_config=model_config.get_quant_config(), + allreduce_strategy=model_config.allreduce_strategy) self.final_layer_norm = LayerNorm( config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine, diff --git a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py index dd9313df0f1..e42da002f6d 100644 --- a/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py +++ b/tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py @@ -6,7 +6,7 @@ try: from ....mapping import Mapping from ...distributed import AllReduce, allgather - from ...modules.linear import AllReduceFusionOp, AllReduceParams + from ...modules.linear import AllReduceFusionOp, AllReduceParams, AllReduceStrategy def trtllm_allgather(tensor, dim, sizes=None): rank, world_size = get_rank_world_size() @@ -17,7 +17,7 @@ def trtllm_allreduce(tensor, op, all_reduce_params=None): rank, world_size = get_rank_world_size() assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op." p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank) - torch_op = AllReduce(mapping=p_config) + torch_op = AllReduce(mapping=p_config, strategy=AllReduceStrategy.AUTO) return torch_op(tensor, all_reduce_params=all_reduce_params) @torch.library.custom_op( diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index 05471144f5f..f5a3d5f4199 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -79,7 +79,7 @@ class ModelConfig(Generic[TConfig]): attn_backend: str = 'TRTLLM' moe_backend: str = 'CUTLASS' # options can be CUTLASS, TRTLLM - allreduce_backend: AllReduceStrategy = AllReduceStrategy.AUTO + allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO # If true, enable min-latency mode. Currently only used for Llama4. 
enable_min_latency: bool = False @@ -123,9 +123,9 @@ def get_all_reduce_strategy(strategy: str = "AUTO"): key = strategy.upper() return maps[key] if key in maps else AllReduceStrategy.AUTO - if isinstance(self.allreduce_backend, str): - self.allreduce_backend = get_all_reduce_strategy( - self.allreduce_backend) + if isinstance(self.allreduce_strategy, str): + self.allreduce_strategy = get_all_reduce_strategy( + self.allreduce_strategy) @property def fuse_pos_embd(self): diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 65fc249f48b..f5d3417f88f 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -400,7 +400,7 @@ def __init__(self, reduce_output=False) self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.aux_stream = aux_stream_dict[AuxStreamType.MoeShared] self.event_dict = { key: torch.cuda.Event() @@ -630,7 +630,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], dtype=config.torch_dtype) self.layer_idx = layer_idx self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend, + strategy=model_config.allreduce_strategy, dtype=config.torch_dtype) self.moe_allreduce = MoEAllReduce(self.mapping) self.next_layer_layernorm: RMSNorm = None diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index 4d2b677d46e..d6ffeac2ca1 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -284,7 +284,7 @@ def __init__( self.mapping = model_config.mapping self.all_reduce = AllReduce( mapping=model_config.mapping, - strategy=model_config.allreduce_backend, + strategy=model_config.allreduce_strategy, ) self.moe_event = [torch.cuda.Event(), torch.cuda.Event()] self.aux_stream = aux_stream @@ -418,7 +418,7 @@ def __init__( self.mapping = model_config.mapping self.all_reduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.next_layer_layernorm: RMSNorm = None self.next_attn: LlamaAttention = None @@ -629,7 +629,7 @@ def __init__( quant_config=model_config.get_quant_config(), skip_create_weights_in_init=model_config. 
skip_create_weights_in_init, - ) + allreduce_strategy=model_config.allreduce_strategy) class Eagle3LlamaDecoderLayer(DecoderLayer): diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_nas.py b/tensorrt_llm/_torch/models/modeling_nemotron_nas.py index ef562979543..333f52532aa 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_nas.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_nas.py @@ -44,7 +44,7 @@ def _create_linear_from_configs(model_config: ModelConfig[PretrainedConfig], gather_output=True, quant_config=model_config.get_quant_config(), skip_create_weights_in_init=model_config.skip_create_weights_in_init, - ) + allreduce_strategy=model_config.allreduce_strategy) class NemotronNASAttention(Attention): diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py index 90ded9ad9c8..5e6f67a8d42 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @@ -90,7 +90,7 @@ def __init__( self.enable_attention_dp = model_config.mapping.enable_attention_dp self.mapping = model_config.mapping self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.enable_alltoall = Qwen3MoE.should_enable_alltoall( model_config, self.top_k) if self.enable_alltoall: @@ -204,7 +204,7 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig], self.layer_idx = layer_idx self.allreduce = AllReduce(mapping=model_config.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.next_layer_layernorm: RMSNorm = None self.fusion_config = EagerFusionConfig() diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index cc9031bc288..94574d3f9d7 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -126,7 +126,7 @@ def __init__( weight_mode=WeightMode.FUSED_QKV_LINEAR), quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE], [self.hidden_size]) @@ -140,7 +140,7 @@ def __init__( quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, lora=self.o_lora, - ) + allreduce_strategy=config.allreduce_strategy) self.quant_config = config.get_quant_config() self.attn_backend = config.attn_backend @@ -481,7 +481,8 @@ def __init__( mapping=mapping, tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=quant_config, - skip_create_weights_in_init=config.skip_create_weights_in_init) + skip_create_weights_in_init=config.skip_create_weights_in_init, + allreduce_strategy=config.allreduce_strategy) else: self.fused_a = Linear( hidden_size, @@ -501,7 +502,7 @@ def __init__( tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=quant_config, skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) self.q_b_proj = self.q_proj self.kv_a_layernorm = RMSNorm(hidden_size=kv_lora_rank, @@ -517,7 +518,8 @@ def __init__( mapping=mapping, tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=quant_config, - skip_create_weights_in_init=config.skip_create_weights_in_init) + skip_create_weights_in_init=config.skip_create_weights_in_init, + allreduce_strategy=config.allreduce_strategy) # This parameter will view into 
self.kv_b_proj.weight after loading weights. # For dummy weight initialization, this parameter is initialized with empty tensor. # Used in forward_generation only @@ -538,7 +540,7 @@ def __init__( tensor_parallel_mode=TensorParallelMode.ROW, quant_config=quant_config, skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) def yarn_get_mscale(scale=1, mscale=1): if scale <= 1: diff --git a/tensorrt_llm/_torch/modules/fused_moe.py b/tensorrt_llm/_torch/modules/fused_moe.py index ba21b3eb738..334919050ec 100755 --- a/tensorrt_llm/_torch/modules/fused_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe.py @@ -356,7 +356,7 @@ def __init__( self.parallel_size = self.mapping.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.intermediate_size_per_partition = intermediate_size // self.tp_size @@ -935,7 +935,7 @@ def __init__( self.parallel_size = self.mapping.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py index f6a0e9323f7..f87647ce511 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_vanilla.py @@ -70,7 +70,7 @@ def __init__( self.parallel_size = self.mapping.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) self.intermediate_size_per_partition = intermediate_size // self.tp_size diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py index 3cc73d15dd8..d305a3b763e 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/interface.py +++ b/tensorrt_llm/_torch/modules/fused_moe/interface.py @@ -79,7 +79,7 @@ def __init__( self.intermediate_size_per_partition = intermediate_size // self.tp_size self.all_reduce = AllReduce(mapping=self.mapping, - strategy=model_config.allreduce_backend) + strategy=model_config.allreduce_strategy) @abstractmethod def create_weights(self): diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index a727cc93ab9..7fab30e1eee 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -73,7 +73,7 @@ def __init__(self, quant_config=config.get_quant_config(), reduce_output=False, skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H], [self.hidden_size]) @@ -89,7 +89,7 @@ def __init__(self, reduce_output=reduce_output, skip_create_weights_in_init=config.skip_create_weights_in_init, lora=self.down_lora, - ) + allreduce_strategy=config.allreduce_strategy) # These two modules are mutually exclusive - either splitted_gate_up_lora or fused_gate_up_lora will be used, # but never both at the same time. 
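Note: patch 8/9 renames the config field to `allreduce_strategy` and forwards it through every `Linear` and `AllReduce` call site, and the linear.py diff that follows adds the matching `allreduce_strategy` keyword to `Linear` itself. Below is a minimal per-rank sketch of the resulting call pattern; it is illustrative only, the sizes, bias flag, and chosen strategy are placeholders, and the positional argument order simply follows the modeling_opt.py hunk above.

    import torch

    from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode
    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm.mapping import Mapping

    # One rank of a 4-way tensor-parallel group (placeholder values).
    mapping = Mapping(world_size=4, tp_size=4, rank=0)

    # Row-parallel projection; its output all-reduce now uses the configured
    # strategy instead of always defaulting to AUTO.
    down_proj = Linear(11008,  # global in_features (placeholder)
                       4096,   # global out_features (placeholder)
                       bias=False,
                       dtype=torch.bfloat16,
                       mapping=mapping,
                       tensor_parallel_mode=TensorParallelMode.ROW,
                       allreduce_strategy=AllReduceStrategy.NCCL)
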
splitted_gate_up_lora handles gate and up separately while fused_gate_up_lora diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index 48665b7b41e..b97f2ea489b 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -13,7 +13,8 @@ import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils from tensorrt_llm._torch.peft.lora.layer import LoraLayer -from tensorrt_llm.functional import AllReduceFusionOp, AllReduceParams +from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, + AllReduceStrategy) from tensorrt_llm.mapping import Mapping from ...models.modeling_utils import QuantConfig @@ -658,6 +659,7 @@ def __init__( skip_create_weights_in_init: bool = False, use_custom_cublas_mm: bool = False, lora: Optional[LoraLayer] = None, + allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO, ): from ..distributed import AllReduce @@ -695,7 +697,8 @@ def __init__( self.out_features = local_out_features self.all_reduce = AllReduce( - mapping=self.mapping) if reduce_output else None + mapping=self.mapping, + strategy=allreduce_strategy) if reduce_output else None self._weights_created = False self.reduce_output = reduce_output self.use_custom_cublas_mm = use_custom_cublas_mm diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py index 2b9019be6eb..55a21dae991 100644 --- a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py +++ b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py @@ -88,15 +88,14 @@ def __init__( self.is_paged_state = False # in_proj - self.in_proj = Linear( - d_model, - d_in_proj, - bias=bias, - dtype=dtype, - mapping=self.mapping, - tensor_parallel_mode=TensorParallelMode.COLUMN, - quant_config=config.get_quant_config(), - ) + self.in_proj = Linear(d_model, + d_in_proj, + bias=bias, + dtype=dtype, + mapping=self.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + quant_config=config.get_quant_config(), + allreduce_strategy=config.allreduce_strategy) # conv1d, reuse Linear to store weights since it has support for TP > 1 already self.conv1d = Linear( @@ -108,7 +107,7 @@ def __init__( tensor_parallel_mode=TensorParallelMode.COLUMN, quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - ) + allreduce_strategy=config.allreduce_strategy) # A self.A = nn.Parameter( @@ -138,15 +137,14 @@ def __init__( ) # out_proj - self.out_proj = Linear( - d_inner, - d_model, - bias=bias, - dtype=dtype, - mapping=self.mapping, - tensor_parallel_mode=TensorParallelMode.ROW, - quant_config=config.get_quant_config(), - ) + self.out_proj = Linear(d_inner, + d_model, + bias=bias, + dtype=dtype, + mapping=self.mapping, + tensor_parallel_mode=TensorParallelMode.ROW, + quant_config=config.get_quant_config(), + allreduce_strategy=config.allreduce_strategy) def forward( self, diff --git a/tensorrt_llm/_torch/modules/mlp.py b/tensorrt_llm/_torch/modules/mlp.py index 8d026e1fa2f..b38da2177bd 100644 --- a/tensorrt_llm/_torch/modules/mlp.py +++ b/tensorrt_llm/_torch/modules/mlp.py @@ -43,7 +43,8 @@ def __init__(self, weight_mode=WeightMode.VANILLA), quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - lora=self.up_lora) + lora=self.up_lora, + allreduce_strategy=config.allreduce_strategy) self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H], [self.hidden_size]) @@ -56,7 +57,8 @@ def __init__(self, tensor_parallel_mode=TensorParallelMode.ROW, 
quant_config=config.get_quant_config(), skip_create_weights_in_init=config.skip_create_weights_in_init, - lora=self.down_lora) + lora=self.down_lora, + allreduce_strategy=config.allreduce_strategy) def forward( self, diff --git a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py index 5245c454be4..3aa6871fbe4 100644 --- a/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py @@ -88,7 +88,7 @@ def __init__(self, ) self.allreduce = AllReduce(mapping=self.mapping, - allreduce_backend=self.strategy).cuda() + strategy=self.strategy).cuda() self.input_tensors = [] for i in range(self.world_size): From e4426ac9012e5b091229c5b7cc49d1d6a6b27a34 Mon Sep 17 00:00:00 2001 From: Hui Gao Date: Wed, 11 Jun 2025 22:53:11 +0000 Subject: [PATCH 9/9] Fix format Signed-off-by: Hui Gao --- tensorrt_llm/_torch/distributed/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index 1c8d8023fa4..7c188ec38d0 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -11,7 +11,6 @@ from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, AllReduceStrategy, MoEAllReduceParams) -from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.plugin.plugin import CustomAllReduceHelper
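
Taken together, after this series the all-reduce strategy is an explicit, per-model choice: it is set as the `allreduce_strategy` string in LlmArgs, mapped to an `AllReduceStrategy` enum in `ModelConfig.__post_init__` (unknown strings fall back to AUTO), and passed into each `AllReduce` and `Linear` constructor; the FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY environment-variable path is removed. A minimal per-rank usage sketch follows. It is illustrative only: the tensor shapes, eps, and fusion op are placeholder assumptions, and the `AllReduceParams` keyword names simply mirror the attributes read in ops.py. MNNVL is only actually used when the dtype is bfloat16 or float32 and no context parallelism is configured; otherwise the module falls back to the regular kernels.

    import torch

    from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp,
                                                 AllReduceParams)
    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm.mapping import Mapping

    # One rank of an 8-way tensor-parallel group (placeholder values).
    mapping = Mapping(world_size=8, tp_size=8, rank=0)

    allreduce = AllReduce(mapping=mapping,
                          strategy=AllReduceStrategy.MNNVL,
                          dtype=torch.bfloat16)

    hidden = torch.randn(16, 4096, dtype=torch.bfloat16, device="cuda")
    residual = torch.randn_like(hidden)
    gamma = torch.ones(4096, dtype=torch.bfloat16, device="cuda")

    # Fused all-reduce + residual add + RMSNorm; for this fusion op the result
    # is expected to be (normed_output, updated_residual).
    params = AllReduceParams(fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM,
                             residual=residual,
                             norm_weight=gamma,
                             eps=1e-6)
    output, new_residual = allreduce(hidden, all_reduce_params=params)

    # At the LLM API level the same knob is the string field shown in llm.yaml,
    # e.g. LLM(model=..., allreduce_strategy="TWOSHOT").

Keeping the strategy on ModelConfig rather than in an environment variable makes the choice visible per model instance, which is what the updated unit tests above rely on when they construct AllReduce with an explicit strategy.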