diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index ce885a4ed19..33c264b9e47 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -21,6 +21,9 @@ meta-llama/Llama-3.3-70B-Instruct: accuracy: 84.08 meta-llama/Llama-4-Maverick-17B-128E-Instruct: - accuracy: 92.20 + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 90.20 meta-llama/Llama-4-Scout-17B-16E-Instruct: - accuracy: 89.70 - quant_algo: NVFP4 diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index ee7cc99c40d..9dd1c25d3c3 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -71,6 +71,9 @@ meta-llama/Llama-3.3-70B-Instruct: accuracy: 80.34 meta-llama/Llama-4-Maverick-17B-128E-Instruct: - accuracy: 86.40 + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 86.40 - quant_algo: FP8 kv_cache_quant_algo: FP8 spec_dec_algo: Eagle diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 69ad940a645..f0a8e923289 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -27,7 +27,8 @@ SamplingParams, TorchCompileConfig) from tensorrt_llm.quantization import QuantAlgo -from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper, +from ..conftest import (get_device_count, get_device_memory, llm_models_root, + parametrize_with_ids, skip_no_hopper, skip_post_blackwell, skip_pre_ada, skip_pre_blackwell, skip_pre_hopper) from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond, @@ -509,12 +510,18 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Maverick-17B-128E-Instruct" @skip_pre_blackwell - @pytest.mark.skip_less_mpi_world_size(8) @parametrize_with_ids("cuda_graph", [False, True]) - @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), - (8, 1, 8)], - ids=["tp8", "tp8ep4", "tp8ep8"]) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1), + (4, 1, 2), (4, 1, 4)], + ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"]) def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size): + if get_device_memory() < 270000 and get_device_count() < 8: + pytest.skip("Not enough memory for this test") + if get_device_count() != tp_size * pp_size: + pytest.skip("Device count mismatch with world size") + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) with LLM( self.MODEL_PATH, tensor_parallel_size=tp_size, @@ -522,6 +529,7 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size): max_seq_len=8192, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, cuda_graph_config=CudaGraphConfig() if cuda_graph else None) as llm: task = MMLU(self.MODEL_NAME) @@ -547,12 +555,18 @@ def test_chunked_prefill(self, attn_backend): task.evaluate(llm) @skip_pre_hopper - @pytest.mark.skip_less_mpi_world_size(8) + @pytest.mark.skip_less_device_memory(80000) @parametrize_with_ids("cuda_graph", [False, True]) - @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), - (8, 1, 8)], - ids=["tp8", "tp8ep4", "tp8ep8"]) + @pytest.mark.parametrize( + 
"tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1), + (4, 1, 2), (4, 1, 4)], + ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"]) def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size): + if get_device_memory() < 140000 and get_device_count() < 8: + pytest.skip("Not enough memory for this test") + if get_device_count() != tp_size * pp_size: + pytest.skip("Device count mismatch with world size") + with LLM( f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", tensor_parallel_size=tp_size, @@ -560,7 +574,8 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size): max_seq_len=8192, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - use_cuda_graph=cuda_graph) as llm: + cuda_graph_config=CudaGraphConfig() + if cuda_graph else None) as llm: assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = MMLU(self.MODEL_NAME) @@ -583,7 +598,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size): moe_expert_parallel_size=ep_size, enable_chunked_prefill=True, max_num_tokens=256, - use_cuda_graph=cuda_graph) as llm: + cuda_graph_config=CudaGraphConfig() + if cuda_graph else None) as llm: assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = MMLU(self.MODEL_NAME) @@ -622,16 +638,21 @@ def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile): task.evaluate(llm) +@pytest.mark.skip_less_device_memory(80000) +@pytest.mark.skip_less_host_memory(100000) class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct" @skip_pre_hopper - @pytest.mark.skip_less_mpi_world_size(8) @parametrize_with_ids("cuda_graph", [False, True]) - @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), - (8, 1, 8)], - ids=["tp8", "tp8ep4", "tp8ep8"]) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1), + (4, 1, 2), (4, 1, 4)], + ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"]) def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size): + if get_device_count() != tp_size * pp_size: + pytest.skip("Device count mismatch with world size") + model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct" with LLM( model_path, @@ -648,11 +669,13 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size): task.evaluate(llm) @skip_pre_hopper - @pytest.mark.skip_less_mpi_world_size(8) @parametrize_with_ids("cuda_graph", [True]) @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)], ids=["tp8ep8", "tp4"]) def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size): + if get_device_count() != tp_size * pp_size: + pytest.skip("Device count mismatch with world size") + model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8" with LLM( model_path, @@ -661,6 +684,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size): max_seq_len=8192, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, + kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8), cuda_graph_config=CudaGraphConfig() if cuda_graph else None) as llm: assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 @@ -670,11 +694,13 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size): task.evaluate(llm) @skip_pre_blackwell - @pytest.mark.skip_less_mpi_world_size(8) 
@parametrize_with_ids("cuda_graph", [True]) @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)], ids=["tp8ep8", "tp4"]) def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size): + if get_device_count() != tp_size * pp_size: + pytest.skip("Device count mismatch with world size") + model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4" with LLM( model_path, @@ -706,7 +732,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size): moe_expert_parallel_size=ep_size, enable_chunked_prefill=True, max_num_tokens=256, - use_cuda_graph=cuda_graph) as llm: + cuda_graph_config=CudaGraphConfig() + if cuda_graph else None) as llm: assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = MMLU(self.MODEL_NAME) @@ -715,7 +742,7 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size): task.evaluate(llm) @skip_pre_blackwell - @pytest.mark.skip_less_mpi_world_size(8) + @pytest.mark.skip_less_mpi_world_size(4) @parametrize_with_ids("cuda_graph", [True]) @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)], ids=["tp4ep4"]) @@ -728,7 +755,8 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size): max_seq_len=22000, enable_chunked_prefill=True, max_num_tokens=256, - use_cuda_graph=cuda_graph) as llm: + cuda_graph_config=CudaGraphConfig() + if cuda_graph else None) as llm: assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 task = MMLU(self.MODEL_NAME) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index aa24e5b9c6d..c427dea2bc7 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -467,11 +467,17 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] 
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True] @@ -479,6 +485,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index 8dc118d991c..c977a77d3c2 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -71,17 +71,33 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagl accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False] 
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
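
For reference, a minimal standalone sketch (not part of the patch) of the runtime device-count guard the updated tests adopt in place of the static @pytest.mark.skip_less_mpi_world_size(8) marker. The local get_device_count() below is a stand-in for the helper imported from ..conftest, and test_world_size_guard is a hypothetical test name; the parametrization mirrors the tp/pp/ep combinations added above.

import pytest


def get_device_count() -> int:
    """Number of visible CUDA devices (0 when torch/CUDA is unavailable).

    Stand-in for the ..conftest helper used by the real tests."""
    try:
        import torch
        return torch.cuda.device_count()
    except ImportError:
        return 0


@pytest.mark.parametrize(
    "tp_size,pp_size,ep_size",
    [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1), (4, 1, 2), (4, 1, 4)],
    ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
def test_world_size_guard(tp_size, pp_size, ep_size):
    # Skip, rather than fail, when the node cannot form the requested
    # tensor-parallel x pipeline-parallel world size.
    if get_device_count() != tp_size * pp_size:
        pytest.skip("Device count mismatch with world size")

The memory-based skips in the patch follow the same shape with get_device_memory(), and CUDA graph usage is now requested via cuda_graph_config=CudaGraphConfig() if cuda_graph else None instead of the removed use_cuda_graph flag, keeping CUDA graphs opt-in per parametrization.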