diff --git a/requirements-dev.txt b/requirements-dev.txt
index c8293761eaa..05b62535812 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -31,3 +31,4 @@ ruff==0.9.4
 lm_eval[api]==0.4.8
 docstring_parser
 genai-perf==0.0.13
+triton==3.3.1; platform_machine == "x86_64"
diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index ba01027d465..7704811dc35 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -127,6 +127,29 @@ def get_model_yaml_config(model_label: str,
                 'enable_attention_dp': False,
                 'moe_backend': 'TRTLLM'
             }
+        },
+        # Llama-v3.3 models with fp8 quantization
+        {
+            'patterns': [
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,1000-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
+                'llama_v3.1_405b_instruct_fp4',
+                'llama_v4_scout_17b_16e_instruct_fp4',
+                'llama_v4_maverick_17b_128e_instruct_fp8'
+            ],
+            'config': {
+                'use_cuda_graph':
+                True,
+                'cuda_graph_padding_enabled':
+                True,
+                'cuda_graph_batch_sizes': [
+                    1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048,
+                    4096, 8192
+                ]
+            }
         }
     ]
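The new entry above extends the pattern/config table that `get_model_yaml_config` consults when assembling per-test YAML config overrides. Below is a minimal sketch of how such a table can be resolved, assuming each pattern is matched as a substring of the full test label; `PATTERN_CONFIGS` and `resolve_extra_config` are illustrative names, not the repo's API.

```python
from typing import Any, Dict, List

# Illustrative stand-in for the pattern/config table extended in this diff.
PATTERN_CONFIGS: List[Dict[str, Any]] = [
    {
        'patterns': [
            'llama_v3.1_405b_instruct_fp4',
            'llama_v4_maverick_17b_128e_instruct_fp8',
        ],
        'config': {
            'use_cuda_graph': True,
            'cuda_graph_padding_enabled': True,
            'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128],
        },
    },
]


def resolve_extra_config(model_label: str) -> Dict[str, Any]:
    """Merge the config of every entry whose pattern appears in the label."""
    merged: Dict[str, Any] = {}
    for entry in PATTERN_CONFIGS:
        if any(pattern in model_label for pattern in entry['patterns']):
            merged.update(entry['config'])
    return merged


if __name__ == "__main__":
    label = ("llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-"
             "kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8")
    print(resolve_extra_config(label))  # -> the CUDA-graph settings above
```

If matching works this way, listing bare model names such as `llama_v3.1_405b_instruct_fp4` next to fully spelled-out bench labels lets a single entry cover every input/output-length variant of those models.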
""" - def __init__( - self, - *, - model_name: str = "", - runtime: str = "python", - static_batching: str = "", - api: str = "", - streaming: str = "", - backend: str = "", - mode: str = "plugin", - data_type: str = "float16", - max_batch_size: int = 512, - max_num_tokens: int = 2048, - gpu_weights_percent: float = -1, - batch_sizes: List[int] = [0], - input_lens: List[int] = [8], - output_lens: List[int] = [1], - num_beams: int = 1, - num_loras: int = 0, - num_reqs: int = 512, - concurrency: int = -1, - quantization: str = "", - kv_cache_dtype: str = "auto", - ep_size: int = None, - tp_size: int = 1, - pp_size: int = 1, - num_gpus: int = 1, - ): + def __init__(self, + *, + model_name: str = "", + runtime: str = "python", + static_batching: str = "", + api: str = "", + streaming: str = "", + backend: str = "", + mode: str = "plugin", + data_type: str = "float16", + max_batch_size: int = 512, + max_num_tokens: int = 2048, + kv_cache_free_gpu_mem_fraction: float = 0.9, + gpu_weights_percent: float = -1, + batch_sizes: List[int] = [0], + input_lens: List[int] = [8], + output_lens: List[int] = [1], + num_beams: int = 1, + num_loras: int = 0, + num_reqs: int = 512, + concurrency: int = -1, + quantization: str = "", + kv_cache_dtype: str = "auto", + ep_size: int = None, + tp_size: int = 1, + pp_size: int = 1, + num_gpus: int = 1): # The model name. self.model_name = model_name # Python or cpp/cppmanager runtime. @@ -371,6 +370,8 @@ def __init__( self.max_batch_size = max_batch_size # Max number of tokens to build TRT engine with. self.max_num_tokens = max_num_tokens + # kv cache free gpu mem fraction + self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction # List of batch sizes to run benchmark with. self.batch_sizes = batch_sizes # List of input lens to run benchmark with. @@ -401,6 +402,8 @@ def __init__( self.num_gpus = num_gpus # Just build engines self.build_only = False + # kv cache free gpu mem fraction + self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction def to_string(self, custom_bs: int = None, @@ -444,6 +447,10 @@ def to_string(self, # Add Max number of tokens. entries.append(f"maxnt:{self.max_num_tokens}") + # Add kv cache free gpu mem fraction. + if self.kv_cache_free_gpu_mem_fraction != 0.9: + entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}") + if self.build_only: entries.append(f"build_only") @@ -514,6 +521,10 @@ def to_string(self, if self.num_gpus > 1: entries.append(f"gpus:{self.num_gpus}") + # Add kv cache free gpu mem fraction. + if self.kv_cache_free_gpu_mem_fraction != 0.9: + entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}") + # Concatenate labels with "-". return "-".join(entries) @@ -553,6 +564,10 @@ def load_from_str(self, test_param_labels) -> None: if labels[0].startswith("maxnt"): self.max_num_tokens = int(labels.pop(0).replace("maxnt:", "")) + if labels[0].startswith("kv_frac:"): + self.kv_cache_free_gpu_mem_fraction = float( + labels.pop(0).replace("kv_frac:", "")) + if labels[0] == "build_only": self.build_only = True labels.pop(0) @@ -621,6 +636,11 @@ def load_from_str(self, test_param_labels) -> None: self.num_gpus = 1 if not labels[0].startswith("gpus:") else int( labels.pop(0).replace("gpus:", "")) + if len(labels) > 0: + self.kv_cache_free_gpu_mem_fraction = 0.9 if not labels[ + 0].startswith("kv_frac:") else float( + labels.pop(0).replace("kv_frac:", "")) + assert len( labels ) == 0, f"Invalid test name! 
diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml b/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
index af4f1f714e3..4378c9cd5ad 100644
--- a/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
+++ b/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
@@ -375,6 +375,35 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
+  # llama_v3.1_405b_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
+
+  #llama_v4_maverick_17b_128e_instruct_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8] TIMEOUT(120)
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(120)
+  #rcca case
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(240)
+
+  #llama_v4_scout_17b_16e_instruct_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8]
+
+  #deepseek_r1_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]

 - condition: