1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -31,3 +31,4 @@ ruff==0.9.4
lm_eval[api]==0.4.8
docstring_parser
genai-perf==0.0.13
triton==3.3.1; platform_machine == "x86_64"
23 changes: 23 additions & 0 deletions tests/integration/defs/perf/pytorch_model_config.py
@@ -127,6 +127,29 @@ def get_model_yaml_config(model_label: str,
'enable_attention_dp': False,
'moe_backend': 'TRTLLM'
}
},
# Llama v3.3 / v3.1 / v4 models with fp8 or fp4 quantization
{
'patterns': [
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,1000-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
'llama_v3.1_405b_instruct_fp4',
'llama_v4_scout_17b_16e_instruct_fp4',
'llama_v4_maverick_17b_128e_instruct_fp8'
],
'config': {
'use_cuda_graph':
True,
'cuda_graph_padding_enabled':
True,
'cuda_graph_batch_sizes': [
1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048,
4096, 8192
]
}
}
]
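
The helper that consumes this table, get_model_yaml_config(), is not visible in this hunk, so the following is only a hedged sketch of how a pattern-keyed entry like the one added above could be matched against a test label; the function name, the substring-matching rule, and the trimmed table are assumptions, not the repository's code.

from typing import Any, Dict, List

# Hypothetical, trimmed-down override table in the same shape as the entry added above.
MODEL_CONFIG_OVERRIDES: List[Dict[str, Any]] = [
    {
        "patterns": [
            "llama_v3.3_70b_instruct_fp8",
            "llama_v4_maverick_17b_128e_instruct_fp8",
        ],
        "config": {
            "use_cuda_graph": True,
            "cuda_graph_padding_enabled": True,
            "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
        },
    },
]


def lookup_overrides(model_label: str) -> Dict[str, Any]:
    """Merge the config of every entry whose pattern occurs in the label."""
    merged: Dict[str, Any] = {}
    for entry in MODEL_CONFIG_OVERRIDES:
        if any(pattern in model_label for pattern in entry["patterns"]):
            merged.update(entry["config"])
    return merged


print(lookup_overrides(
    "llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6"))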

83 changes: 52 additions & 31 deletions tests/integration/defs/perf/test_perf.py
@@ -71,7 +71,7 @@
"llama_v4_maverick_17b_128e_instruct":
"llama4-models/Llama-4-Maverick-17B-128E-Instruct",
"llama_v4_maverick_17b_128e_instruct_fp8":
"llama4-models/Llama-4-Maverick-17B-128E-Instruct-FP8",
"llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
# "llama_30b": "llama-models/llama-30b-hf",
"mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
@@ -321,34 +321,33 @@ class PerfTestConfig:
This should hold only the attributes that distinguish different tests.
"""

def __init__(
self,
*,
model_name: str = "",
runtime: str = "python",
static_batching: str = "",
api: str = "",
streaming: str = "",
backend: str = "",
mode: str = "plugin",
data_type: str = "float16",
max_batch_size: int = 512,
max_num_tokens: int = 2048,
gpu_weights_percent: float = -1,
batch_sizes: List[int] = [0],
input_lens: List[int] = [8],
output_lens: List[int] = [1],
num_beams: int = 1,
num_loras: int = 0,
num_reqs: int = 512,
concurrency: int = -1,
quantization: str = "",
kv_cache_dtype: str = "auto",
ep_size: int = None,
tp_size: int = 1,
pp_size: int = 1,
num_gpus: int = 1,
):
def __init__(self,
*,
model_name: str = "",
runtime: str = "python",
static_batching: str = "",
api: str = "",
streaming: str = "",
backend: str = "",
mode: str = "plugin",
data_type: str = "float16",
max_batch_size: int = 512,
max_num_tokens: int = 2048,
kv_cache_free_gpu_mem_fraction: float = 0.9,
gpu_weights_percent: float = -1,
batch_sizes: List[int] = [0],
input_lens: List[int] = [8],
output_lens: List[int] = [1],
num_beams: int = 1,
num_loras: int = 0,
num_reqs: int = 512,
concurrency: int = -1,
quantization: str = "",
kv_cache_dtype: str = "auto",
ep_size: int = None,
tp_size: int = 1,
pp_size: int = 1,
num_gpus: int = 1):
# The model name.
self.model_name = model_name
# Python or cpp/cppmanager runtime.
@@ -371,6 +370,8 @@ def __init__(
self.max_batch_size = max_batch_size
# Max number of tokens to build TRT engine with.
self.max_num_tokens = max_num_tokens
# kv cache free gpu mem fraction
self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction
# List of batch sizes to run benchmark with.
self.batch_sizes = batch_sizes
# List of input lens to run benchmark with.
@@ -401,6 +402,8 @@ def __init__(
self.num_gpus = num_gpus
# Just build engines
self.build_only = False
# kv cache free gpu mem fraction
self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction

def to_string(self,
custom_bs: int = None,
@@ -444,6 +447,10 @@ def to_string(self,
# Add Max number of tokens.
entries.append(f"maxnt:{self.max_num_tokens}")

# Add kv cache free gpu mem fraction.
if self.kv_cache_free_gpu_mem_fraction != 0.9:
entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")

Comment on lines +450 to +453
⚠️ Potential issue

Remove duplicate string representation logic.

The kv_cache_free_gpu_mem_fraction parameter is added to the string representation twice (lines 453-456 and 527-530), which will create duplicate entries.

Remove the duplicate logic:

        # Add number of GPUs.
        if self.num_gpus > 1:
            entries.append(f"gpus:{self.num_gpus}")

-        # Add kv cache free gpu mem fraction.
-        if self.kv_cache_free_gpu_mem_fraction != 0.9:
-            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
-
        # Concatenate labels with "-".
        return "-".join(entries)

Also applies to: 527-530

🤖 Prompt for AI Agents
In tests/integration/defs/perf/test_perf.py around lines 453 to 456, the code
appends the string representation of kv_cache_free_gpu_mem_fraction to entries,
but this is duplicated again at lines 527 to 530. To fix this, remove the block
at lines 453 to 456 that adds kv_cache_free_gpu_mem_fraction to entries,
ensuring it only appears once in the string representation logic at lines 527 to
530.
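
As a hedged illustration of the issue flagged in this comment, the standalone toy class below (not the repository's PerfTestConfig) mirrors only the two kv_frac appends and shows the label landing twice in the generated test name:

# Standalone toy reproduction -- not the real PerfTestConfig.
class ToyConfig:

    def __init__(self, kv_cache_free_gpu_mem_fraction=0.9, num_gpus=8):
        self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction
        self.num_gpus = num_gpus

    def to_string(self):
        entries = ["llama_v4_maverick_17b_128e_instruct_fp8", "bench"]
        # First kv_frac append (mirrors the earlier block in the diff).
        if self.kv_cache_free_gpu_mem_fraction != 0.9:
            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
        if self.num_gpus > 1:
            entries.append(f"gpus:{self.num_gpus}")
        # Second kv_frac append (the duplicate this comment asks to remove).
        if self.kv_cache_free_gpu_mem_fraction != 0.9:
            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
        return "-".join(entries)


print(ToyConfig(kv_cache_free_gpu_mem_fraction=0.6).to_string())
# llama_v4_maverick_17b_128e_instruct_fp8-bench-kv_frac:0.6-gpus:8-kv_frac:0.6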

if self.build_only:
entries.append(f"build_only")

@@ -514,6 +521,10 @@ def to_string(self,
if self.num_gpus > 1:
entries.append(f"gpus:{self.num_gpus}")

# Add kv cache free gpu mem fraction.
if self.kv_cache_free_gpu_mem_fraction != 0.9:
entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")

# Concatenate labels with "-".
return "-".join(entries)

@@ -553,6 +564,10 @@ def load_from_str(self, test_param_labels) -> None:
if labels[0].startswith("maxnt"):
self.max_num_tokens = int(labels.pop(0).replace("maxnt:", ""))

if labels[0].startswith("kv_frac:"):
self.kv_cache_free_gpu_mem_fraction = float(
labels.pop(0).replace("kv_frac:", ""))

if labels[0] == "build_only":
self.build_only = True
labels.pop(0)
@@ -621,6 +636,11 @@ def load_from_str(self, test_param_labels) -> None:
self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
labels.pop(0).replace("gpus:", ""))

if len(labels) > 0:
self.kv_cache_free_gpu_mem_fraction = 0.9 if not labels[
0].startswith("kv_frac:") else float(
labels.pop(0).replace("kv_frac:", ""))

assert len(
labels
) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
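
For readers unfamiliar with the test-name grammar, here is a reduced sketch (assumptions: only two labels, standalone function, not the full load_from_str parser) of the ordered, prefix-based label consumption shown above, including the 0.9 fallback when no kv_frac label is present:

# Reduced sketch of ordered label parsing; not the repository's full parser.
def parse_labels(test_param_labels: str):
    labels = test_param_labels.split("-")
    max_num_tokens = 2048                 # constructor default
    kv_cache_free_gpu_mem_fraction = 0.9  # constructor default

    if labels and labels[0].startswith("maxnt:"):
        max_num_tokens = int(labels.pop(0).replace("maxnt:", ""))
    if labels and labels[0].startswith("kv_frac:"):
        kv_cache_free_gpu_mem_fraction = float(
            labels.pop(0).replace("kv_frac:", ""))
    return max_num_tokens, kv_cache_free_gpu_mem_fraction, labels


print(parse_labels("maxnt:4096-kv_frac:0.6-input_output_len:2000,500"))
# (4096, 0.6, ['input_output_len:2000,500'])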
@@ -1210,9 +1230,10 @@ def get_trtllm_bench_command(self, engine_dir):
f"--model_path={model_dir}",
"throughput",
f"--dataset={dataset_path}",
f"--max_batch_size={self._config.max_batch_size}",
f"--max_num_tokens={self._config.max_num_tokens}",
# f"--max_batch_size={self._config.max_batch_size}",
# f"--max_num_tokens={self._config.max_num_tokens}",
f"--report_json={report_path}",
f"--kv_cache_free_gpu_mem_fraction={self._config.kv_cache_free_gpu_mem_fraction}"
]
if self._config.backend != "pytorch":
benchmark_cmd += [f"--engine_dir={engine_dir}"]
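
For orientation only, a hedged sketch of roughly what the assembled pytorch-backend throughput command looks like after this change; the command prefix before --model_path is not visible in this hunk and every path is a placeholder, so treat everything outside the flags shown in the diff as an assumption:

# Illustrative assembly only -- entry point and paths are placeholders.
kv_frac = 0.6  # value parsed from a "kv_frac:0.6" test label
benchmark_cmd = [
    "trtllm-bench",                                   # assumed entry point
    "--model_path=/models/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "throughput",
    "--dataset=/tmp/synthetic_dataset.json",
    "--report_json=/tmp/report.json",
    f"--kv_cache_free_gpu_mem_fraction={kv_frac}",    # newly forwarded flag
]
print(" ".join(benchmark_cmd))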
29 changes: 29 additions & 0 deletions tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
@@ -375,6 +375,35 @@ trt_llm_release_perf_test:
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
# llama_v3.1_405b_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]

#llama_v4_maverick_17b_128e_instruct_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8] TIMEOUT(120)
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(120)
#rcca case
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(240)

#llama_v4_scout_17b_16e_instruct_fp8
#pytorch backend
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8]

#deepseek_r1_fp8
#pytorch backend
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]


- condition: