tests/integration/defs/perf/pytorch_model_config.py (78 changes: 51 additions & 27 deletions)
@@ -32,42 +32,66 @@ def get_model_yaml_config(model_label: str) -> dict:
         'use_cuda_graph': True,
         'cuda_graph_padding_enabled': True,
     }
-    model_configs = {
-        'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8':
+
+    # Pattern-based configurations for models matching specific substrings
+    # This allows for flexible configuration of models based on naming patterns
+    pattern_configs = [
+        # DeepSeek R1 models with MTP speculative decoding
         {
-            'use_cuda_graph': True,
-            'speculative_config': {
-                'decoding_type': 'MTP',
-                'num_nextn_predict_layers': 3
+            'patterns': [
+                'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8',
+                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8'
+            ],
+            'config': {
+                'use_cuda_graph': True,
+                'speculative_config': {
+                    'decoding_type': 'MTP',
+                    'num_nextn_predict_layers': 3
+                }
             }
         },
-        'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8':
+        # DeepSeek R1 models with large batch sizes and cuda graph padding
         {
-            'use_cuda_graph': True,
-            'speculative_config': {
-                'decoding_type': 'MTP',
-                'num_nextn_predict_layers': 3
+            'patterns': [
+                'deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8',
+                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8'
+            ],
+            'config': {
+                'cuda_graph_padding_enabled': True,
+                'cuda_graph_batch_sizes':
+                [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
             }
         },
-        'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-gpus:8':
+        # DeepSeek R1 model with specific batch size 128
         {
-            'cuda_graph_batch_sizes': [128]
+            'patterns':
+            'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-gpus:8',
+            'config': {
+                'cuda_graph_batch_sizes': [128]
+            }
         },
-        'deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8':
+        # Llama Nemotron models with attention_dp disabled to prevent hangs
         {
-            'cuda_graph_padding_enabled': True,
-            'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
+            'patterns': [
+                'llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8',
+                'llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8',
+                'llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16'
+            ],
+            'config': {
+                # True causes hang, needs model-specific fix.
+                'enable_attention_dp': False,
+            }
         },
-        'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8':
-        {
-            'cuda_graph_padding_enabled': True,
-            'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
-        }
-    }
-    # get model name from model_label
-    model_name = next(
-        (key for key in model_configs if key in model_label.lower()), None)
-    if model_name:
-        base_config.update(model_configs[model_name])
+    ]
+
+    # Apply pattern-based configurations on top of base config
+    for pattern_config in pattern_configs:
+        patterns = pattern_config['patterns']
+        if isinstance(patterns, str):
+            patterns = [patterns]
+        for pattern in patterns:
+            if pattern in model_label.lower():
+                base_config.update(pattern_config['config'])
+                break  # Stop checking other patterns for this config once we find a match

     return base_config
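
For readers outside the test harness: the new lookup is a substring match of each pattern against the lowercased model label, and every matching group merges its 'config' dict into the shared base config. Below is a minimal, self-contained sketch of that lookup; the pattern table and label are illustrative stand-ins, not the full matrix from the diff.

    # Minimal sketch of the pattern-based lookup (illustrative patterns only).
    def get_model_yaml_config(model_label: str) -> dict:
        base_config = {
            'use_cuda_graph': True,
            'cuda_graph_padding_enabled': True,
        }
        pattern_configs = [
            {
                # 'patterns' may be a single string or a list of strings.
                'patterns': 'deepseek_r1-bench-pytorch-float16-maxbs:128',
                'config': {
                    'cuda_graph_batch_sizes': [128]
                }
            },
        ]
        for pattern_config in pattern_configs:
            patterns = pattern_config['patterns']
            if isinstance(patterns, str):
                patterns = [patterns]
            for pattern in patterns:
                if pattern in model_label.lower():
                    base_config.update(pattern_config['config'])
                    break
        return base_config

    print(get_model_yaml_config(
        'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000'))
    # -> the base config plus {'cuda_graph_batch_sizes': [128]}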
tests/integration/defs/perf/test_perf.py (20 changes: 15 additions & 5 deletions)
@@ -57,10 +57,13 @@
"llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
"llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
"llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
"llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8",
"llama_v3.3_nemotron_super_49b":
"nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
"llama_v3.1_nemotron_ultra_253b":
"nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1",
"llama_v3.3_nemotron_super_49b_fp8":
"nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8",
"llama_v3.1_nemotron_ultra_253b_fp8":
"nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
# "llama_30b": "llama-models/llama-30b-hf",
"mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
@@ -107,10 +110,14 @@
"llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
"llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
"llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
"llama_v3.1_nemotron_nano_8b_fp8_hf":
"nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8",
"llama_v3.3_nemotron_super_49b_hf":
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
"llama_v3.1_nemotron_ultra_253b_hf":
"nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
"llama_v3.3_nemotron_super_49b_fp8_hf":
"nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8",
"llama_v3.1_nemotron_ultra_253b_fp8_hf":
"nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
"mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1",
@@ -125,7 +132,10 @@
 TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")

 TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=True
-    "llama_v3.3_nemotron_super_49b"
+    "llama_v3.3_nemotron_super_49b",
+    "llama_v3.3_nemotron_super_49b_fp8",
+    "llama_v3.1_nemotron_ultra_253b",
+    "llama_v3.1_nemotron_ultra_253b_fp8",
 }


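How this set is consumed lies outside the hunk, so the following is a hypothetical sketch rather than the actual test_perf.py code: it only illustrates the intent of the inline comment that these checkpoints must be loaded with an explicit trust_remote_code=True (the Nemotron NAS models ship custom modeling code).

    # Hypothetical consumer of TRUST_REMOTE_CODE_MODELS (illustration only).
    TRUST_REMOTE_CODE_MODELS = {
        "llama_v3.3_nemotron_super_49b",
        "llama_v3.3_nemotron_super_49b_fp8",
        "llama_v3.1_nemotron_ultra_253b",
        "llama_v3.1_nemotron_ultra_253b_fp8",
    }

    def loader_kwargs(model_name: str) -> dict:
        kwargs = {}
        # Only the models listed above need the extra flag when their
        # Hugging Face checkpoints are loaded.
        if model_name in TRUST_REMOTE_CODE_MODELS:
            kwargs["trust_remote_code"] = True
        return kwargs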
tests/integration/test_lists/qa/trt_llm_release_perf_test.yml (29 changes: 29 additions & 0 deletions)
@@ -90,6 +90,15 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-pytorch-bfloat16-maxbs:512-input_output_len:500,2000-reqs:500-con:250]
+  # FP8 prequantized pyt backend
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:8-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:500-con:250]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:500,2000-reqs:500-con:250]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:500-con:250]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:20000-input_output_len:20000,2000-reqs:500-con:250]
   #long time llama_nemotron cases
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:5000,500-reqs:8-con:1] # timeout for l20, l40s, a100
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:8-con:1] #timeout for l20, l40s, failed for a100
@@ -276,13 +285,27 @@
   tests:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-input_output_len:128,128-tp:4]
+  # Llama-Nemotron-Super-49B-v3.3
+  # cpp
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:5000,500-reqs:4-con:1-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:4-con:1-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-reqs:4-con:1-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:4-con:1-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-con:250-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-quant:fp8-con:250-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4]
+  # pyt
+  # bfloat16
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
+  # fp8 prequantized
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:4-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4]
+

 - condition:
     ranges:
Expand Down Expand Up @@ -331,6 +354,12 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:8-con:1-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:5000,500-quant:fp8-reqs:250-con:250-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-bfloat16-maxbs:64-input_output_len:500,2000-quant:fp8-reqs:250-con:250-tp:8-gpus:8]
+  # pyt backend, fp8 pre-quantized
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-maxnt:5000-input_output_len:5000,500-reqs:8-con:1-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
+

 - condition:
     ranges:
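
The added test names reuse the naming convention of the surrounding entries: hyphen-separated segments for model, flow (bench), backend (pytorch), and dtype, followed by key:value options, which from their use throughout this list appear to stand for max batch size (maxbs), max num tokens (maxnt), request count (reqs), concurrency (con), quantization (quant), and parallelism (ep/tp/gpus). A small, hypothetical helper, not part of the PR, shows how such a label splits into its options:

    # Illustrative parser for the perf test label convention used above.
    def parse_perf_label(label: str) -> dict:
        options = {}
        for part in label.split("-"):
            if ":" in part:
                key, value = part.split(":", 1)
                options[key] = value
        return options

    print(parse_perf_label(
        "llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-"
        "maxbs:512-input_output_len:500,2000-reqs:8-con:1"))
    # {'maxbs': '512', 'input_output_len': '500,2000', 'reqs': '8', 'con': '1'}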