1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -31,3 +31,4 @@ ruff==0.9.4
lm_eval[api]==0.4.8
docstring_parser
genai-perf==0.0.13
triton==3.3.1; platform_machine == "x86_64"
23 changes: 23 additions & 0 deletions tests/integration/defs/perf/pytorch_model_config.py
@@ -127,6 +127,29 @@ def get_model_yaml_config(model_label: str,
'enable_attention_dp': False,
'moe_backend': 'TRTLLM'
}
},
# Llama v3.3 / v3.1 / v4 models with fp8 or fp4 quantization
{
'patterns': [
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,1000-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
'llama_v3.1_405b_instruct_fp4',
'llama_v4_scout_17b_16e_instruct_fp4',
'llama_v4_maverick_17b_128e_instruct_fp8'
],
'config': {
'use_cuda_graph':
True,
'cuda_graph_padding_enabled':
True,
'cuda_graph_batch_sizes': [
1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048,
4096, 8192
]
}
}
]
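
The helper that consumes this table, get_model_yaml_config(), is not visible in this hunk, so the following is only a hedged sketch of how a pattern-keyed entry like the one added above could be matched against a test label; the function name, the substring-matching rule, and the trimmed table are assumptions, not the repository's code.

from typing import Any, Dict, List

# Hypothetical, trimmed-down override table in the same shape as the entry added above.
MODEL_CONFIG_OVERRIDES: List[Dict[str, Any]] = [
    {
        "patterns": [
            "llama_v3.3_70b_instruct_fp8",
            "llama_v4_maverick_17b_128e_instruct_fp8",
        ],
        "config": {
            "use_cuda_graph": True,
            "cuda_graph_padding_enabled": True,
            "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
        },
    },
]


def lookup_overrides(model_label: str) -> Dict[str, Any]:
    """Merge the config of every entry whose pattern occurs in the label."""
    merged: Dict[str, Any] = {}
    for entry in MODEL_CONFIG_OVERRIDES:
        if any(pattern in model_label for pattern in entry["patterns"]):
            merged.update(entry["config"])
    return merged


print(lookup_overrides(
    "llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6"))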

83 changes: 52 additions & 31 deletions tests/integration/defs/perf/test_perf.py
@@ -71,7 +71,7 @@
"llama_v4_maverick_17b_128e_instruct":
"llama4-models/Llama-4-Maverick-17B-128E-Instruct",
"llama_v4_maverick_17b_128e_instruct_fp8":
"llama4-models/Llama-4-Maverick-17B-128E-Instruct-FP8",
"llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
# "llama_30b": "llama-models/llama-30b-hf",
"mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
@@ -321,34 +321,33 @@ class PerfTestConfig:
This should hold only the attributes that distinguish different tests.
"""

def __init__(
self,
*,
model_name: str = "",
runtime: str = "python",
static_batching: str = "",
api: str = "",
streaming: str = "",
backend: str = "",
mode: str = "plugin",
data_type: str = "float16",
max_batch_size: int = 512,
max_num_tokens: int = 2048,
gpu_weights_percent: float = -1,
batch_sizes: List[int] = [0],
input_lens: List[int] = [8],
output_lens: List[int] = [1],
num_beams: int = 1,
num_loras: int = 0,
num_reqs: int = 512,
concurrency: int = -1,
quantization: str = "",
kv_cache_dtype: str = "auto",
ep_size: int = None,
tp_size: int = 1,
pp_size: int = 1,
num_gpus: int = 1,
):
def __init__(self,
*,
model_name: str = "",
runtime: str = "python",
static_batching: str = "",
api: str = "",
streaming: str = "",
backend: str = "",
mode: str = "plugin",
data_type: str = "float16",
max_batch_size: int = 512,
max_num_tokens: int = 2048,
kv_cache_free_gpu_mem_fraction: float = 0.9,
gpu_weights_percent: float = -1,
batch_sizes: List[int] = [0],
input_lens: List[int] = [8],
output_lens: List[int] = [1],
num_beams: int = 1,
num_loras: int = 0,
num_reqs: int = 512,
concurrency: int = -1,
quantization: str = "",
kv_cache_dtype: str = "auto",
ep_size: int = None,
tp_size: int = 1,
pp_size: int = 1,
num_gpus: int = 1):
# The model name.
self.model_name = model_name
# Python or cpp/cppmanager runtime.
@@ -371,6 +370,8 @@ def __init__(
self.max_batch_size = max_batch_size
# Max number of tokens to build TRT engine with.
self.max_num_tokens = max_num_tokens
# kv cache free gpu mem fraction
self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction
# List of batch sizes to run benchmark with.
self.batch_sizes = batch_sizes
# List of input lens to run benchmark with.
@@ -401,6 +402,8 @@ def __init__(
self.num_gpus = num_gpus
# Just build engines
self.build_only = False
# kv cache free gpu mem fraction
self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction

def to_string(self,
custom_bs: int = None,
@@ -444,6 +447,10 @@ def to_string(self,
# Add Max number of tokens.
entries.append(f"maxnt:{self.max_num_tokens}")

# Add kv cache free gpu mem fraction.
if self.kv_cache_free_gpu_mem_fraction != 0.9:
entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")

Comment on lines +450 to +453
⚠️ Potential issue

Remove duplicate string representation logic.

The kv_cache_free_gpu_mem_fraction parameter is added to the string representation twice (lines 453-456 and 527-530), which will create duplicate entries.

Remove the duplicate logic:

        # Add number of GPUs.
        if self.num_gpus > 1:
            entries.append(f"gpus:{self.num_gpus}")

-        # Add kv cache free gpu mem fraction.
-        if self.kv_cache_free_gpu_mem_fraction != 0.9:
-            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
-
        # Concatenate labels with "-".
        return "-".join(entries)

Also applies to: 527-530

🤖 Prompt for AI Agents
In tests/integration/defs/perf/test_perf.py around lines 453 to 456, the code
appends the string representation of kv_cache_free_gpu_mem_fraction to entries,
but this is duplicated again at lines 527 to 530. To fix this, remove the block
at lines 453 to 456 that adds kv_cache_free_gpu_mem_fraction to entries,
ensuring it only appears once in the string representation logic at lines 527 to
530.
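
As a hedged illustration of the issue flagged in this comment, the standalone toy class below (not the repository's PerfTestConfig) mirrors only the two kv_frac appends and shows the label landing twice in the generated test name:

# Standalone toy reproduction -- not the real PerfTestConfig.
class ToyConfig:

    def __init__(self, kv_cache_free_gpu_mem_fraction=0.9, num_gpus=8):
        self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction
        self.num_gpus = num_gpus

    def to_string(self):
        entries = ["llama_v4_maverick_17b_128e_instruct_fp8", "bench"]
        # First kv_frac append (mirrors the earlier block in the diff).
        if self.kv_cache_free_gpu_mem_fraction != 0.9:
            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
        if self.num_gpus > 1:
            entries.append(f"gpus:{self.num_gpus}")
        # Second kv_frac append (the duplicate this comment asks to remove).
        if self.kv_cache_free_gpu_mem_fraction != 0.9:
            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
        return "-".join(entries)


print(ToyConfig(kv_cache_free_gpu_mem_fraction=0.6).to_string())
# llama_v4_maverick_17b_128e_instruct_fp8-bench-kv_frac:0.6-gpus:8-kv_frac:0.6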

if self.build_only:
entries.append(f"build_only")

@@ -514,6 +521,10 @@ def to_string(self,
if self.num_gpus > 1:
entries.append(f"gpus:{self.num_gpus}")

# Add kv cache free gpu mem fraction.
if self.kv_cache_free_gpu_mem_fraction != 0.9:
entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")

# Concatenate labels with "-".
return "-".join(entries)

@@ -553,6 +564,10 @@ def load_from_str(self, test_param_labels) -> None:
if labels[0].startswith("maxnt"):
self.max_num_tokens = int(labels.pop(0).replace("maxnt:", ""))

if labels[0].startswith("kv_frac:"):
self.kv_cache_free_gpu_mem_fraction = float(
labels.pop(0).replace("kv_frac:", ""))

if labels[0] == "build_only":
self.build_only = True
labels.pop(0)
@@ -621,6 +636,11 @@ def load_from_str(self, test_param_labels) -> None:
self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
labels.pop(0).replace("gpus:", ""))

if len(labels) > 0:
self.kv_cache_free_gpu_mem_fraction = 0.9 if not labels[
0].startswith("kv_frac:") else float(
labels.pop(0).replace("kv_frac:", ""))

assert len(
labels
) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
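
For readers unfamiliar with the test-name grammar, here is a reduced sketch (assumptions: only two labels, standalone function, not the full load_from_str parser) of the ordered, prefix-based label consumption shown above, including the 0.9 fallback when no kv_frac label is present:

# Reduced sketch of ordered label parsing; not the repository's full parser.
def parse_labels(test_param_labels: str):
    labels = test_param_labels.split("-")
    max_num_tokens = 2048                 # constructor default
    kv_cache_free_gpu_mem_fraction = 0.9  # constructor default

    if labels and labels[0].startswith("maxnt:"):
        max_num_tokens = int(labels.pop(0).replace("maxnt:", ""))
    if labels and labels[0].startswith("kv_frac:"):
        kv_cache_free_gpu_mem_fraction = float(
            labels.pop(0).replace("kv_frac:", ""))
    return max_num_tokens, kv_cache_free_gpu_mem_fraction, labels


print(parse_labels("maxnt:4096-kv_frac:0.6-input_output_len:2000,500"))
# (4096, 0.6, ['input_output_len:2000,500'])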
@@ -1210,9 +1230,10 @@ def get_trtllm_bench_command(self, engine_dir):
f"--model_path={model_dir}",
"throughput",
f"--dataset={dataset_path}",
f"--max_batch_size={self._config.max_batch_size}",
f"--max_num_tokens={self._config.max_num_tokens}",
# f"--max_batch_size={self._config.max_batch_size}",
# f"--max_num_tokens={self._config.max_num_tokens}",
f"--report_json={report_path}",
f"--kv_cache_free_gpu_mem_fraction={self._config.kv_cache_free_gpu_mem_fraction}"
]
if self._config.backend != "pytorch":
benchmark_cmd += [f"--engine_dir={engine_dir}"]
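
For orientation only, a hedged sketch of roughly what the assembled pytorch-backend throughput command looks like after this change; the command prefix before --model_path is not visible in this hunk and every path is a placeholder, so treat everything outside the flags shown in the diff as an assumption:

# Illustrative assembly only -- entry point and paths are placeholders.
kv_frac = 0.6  # value parsed from a "kv_frac:0.6" test label
benchmark_cmd = [
    "trtllm-bench",                                   # assumed entry point
    "--model_path=/models/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "throughput",
    "--dataset=/tmp/synthetic_dataset.json",
    "--report_json=/tmp/report.json",
    f"--kv_cache_free_gpu_mem_fraction={kv_frac}",    # newly forwarded flag
]
print(" ".join(benchmark_cmd))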
29 changes: 29 additions & 0 deletions tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
@@ -375,6 +375,35 @@ trt_llm_release_perf_test:
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
# llama_v3.1_405b_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]

#llama_v4_maverick_17b_128e_instruct_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(120)
#rcca case
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(240)

#llama_v4_scout_17b_16e_instruct_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8]

#deepseek_r1_fp8
#pytorch backend
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]


- condition: