3 changes: 3 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -21,6 +21,9 @@ meta-llama/Llama-3.3-70B-Instruct:
accuracy: 84.08
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
- accuracy: 92.20
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 90.20
meta-llama/Llama-4-Scout-17B-16E-Instruct:
- accuracy: 89.70
- quant_algo: NVFP4
3 changes: 3 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -71,6 +71,9 @@ meta-llama/Llama-3.3-70B-Instruct:
accuracy: 80.34
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
- accuracy: 86.40
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 86.40
- quant_algo: FP8
kv_cache_quant_algo: FP8
spec_dec_algo: Eagle
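Note on the reference files: both YAML diffs above add an FP8 entry (quant_algo: FP8, kv_cache_quant_algo: FP8) for Llama-4-Maverick alongside the existing unquantized entries. A minimal sketch of how such per-model reference lists could be resolved by quantization settings is shown below; the helper name, file path, and matching logic are assumptions for illustration, not the accuracy harness's actual API.

import yaml

def find_reference_accuracy(path, model, quant_algo=None, kv_cache_quant_algo=None):
    # Assumed structure: each model maps to a list of entries; an entry matches
    # when its optional quant_algo / kv_cache_quant_algo fields equal the run's settings.
    with open(path) as f:
        references = yaml.safe_load(f)
    for entry in references.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    return None

# Under this assumed scheme, the new GSM8K entry above would resolve to 90.20:
# find_reference_accuracy("gsm8k.yaml",
#                         "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
#                         quant_algo="FP8", kv_cache_quant_algo="FP8")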
68 changes: 48 additions & 20 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -27,7 +27,8 @@
SamplingParams, TorchCompileConfig)
from tensorrt_llm.quantization import QuantAlgo

from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper,
from ..conftest import (get_device_count, get_device_memory, llm_models_root,
parametrize_with_ids, skip_no_hopper,
skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
skip_pre_hopper)
from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
@@ -509,19 +510,26 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Maverick-17B-128E-Instruct"

@skip_pre_blackwell
@pytest.mark.skip_less_mpi_world_size(8)
@parametrize_with_ids("cuda_graph", [False, True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
(4, 1, 2), (4, 1, 4)],
ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
if get_device_memory() < 270000 and get_device_count() < 8:
pytest.skip("Not enough memory for this test")
if get_device_count() != tp_size * pp_size:
pytest.skip("Device count mismatch with world size")

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
with LLM(
self.MODEL_PATH,
tensor_parallel_size=tp_size,
# Keep this low to avoid warmup OOM in CI
max_seq_len=8192,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
task = MMLU(self.MODEL_NAME)
@@ -547,20 +555,27 @@ def test_chunked_prefill(self, attn_backend):
task.evaluate(llm)

@skip_pre_hopper
@pytest.mark.skip_less_mpi_world_size(8)
@pytest.mark.skip_less_device_memory(80000)
@parametrize_with_ids("cuda_graph", [False, True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
(4, 1, 2), (4, 1, 4)],
ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
if get_device_memory() < 140000 and get_device_count() < 8:
pytest.skip("Not enough memory for this test")
if get_device_count() != tp_size * pp_size:
pytest.skip("Device count mismatch with world size")

with LLM(
f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
tensor_parallel_size=tp_size,
# Keep this low to avoid warmup OOM in CI
max_seq_len=8192,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
use_cuda_graph=cuda_graph) as llm:
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
@@ -583,7 +598,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
moe_expert_parallel_size=ep_size,
enable_chunked_prefill=True,
max_num_tokens=256,
use_cuda_graph=cuda_graph) as llm:
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
@@ -622,16 +638,21 @@ def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
task.evaluate(llm)


@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_host_memory(100000)
class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

@skip_pre_hopper
@pytest.mark.skip_less_mpi_world_size(8)
@parametrize_with_ids("cuda_graph", [False, True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
(4, 1, 2), (4, 1, 4)],
ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
if get_device_count() != tp_size * pp_size:
pytest.skip("Device count mismatch with world size")

model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
with LLM(
model_path,
@@ -648,11 +669,13 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
task.evaluate(llm)

@skip_pre_hopper
@pytest.mark.skip_less_mpi_world_size(8)
@parametrize_with_ids("cuda_graph", [True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
ids=["tp8ep8", "tp4"])
def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
if get_device_count() != tp_size * pp_size:
pytest.skip("Device count mismatch with world size")

model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
with LLM(
model_path,
@@ -661,6 +684,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
max_seq_len=8192,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
@@ -670,11 +694,13 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
task.evaluate(llm)

@skip_pre_blackwell
@pytest.mark.skip_less_mpi_world_size(8)
@parametrize_with_ids("cuda_graph", [True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
ids=["tp8ep8", "tp4"])
def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
if get_device_count() != tp_size * pp_size:
pytest.skip("Device count mismatch with world size")

model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
with LLM(
model_path,
@@ -706,7 +732,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
moe_expert_parallel_size=ep_size,
enable_chunked_prefill=True,
max_num_tokens=256,
use_cuda_graph=cuda_graph) as llm:
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
@@ -715,7 +742,7 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
task.evaluate(llm)

@skip_pre_blackwell
@pytest.mark.skip_less_mpi_world_size(8)
@pytest.mark.skip_less_mpi_world_size(4)
@parametrize_with_ids("cuda_graph", [True])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
ids=["tp4ep4"])
@@ -728,7 +755,8 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
max_seq_len=22000,
enable_chunked_prefill=True,
max_num_tokens=256,
use_cuda_graph=cuda_graph) as llm:
cuda_graph_config=CudaGraphConfig()
if cuda_graph else None) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
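Two patterns repeat across the test changes above: the fixed @pytest.mark.skip_less_mpi_world_size(8) markers give way to runtime skips driven by get_device_count() / get_device_memory() (imported from the suite's conftest), and the older use_cuda_graph=cuda_graph argument is replaced with cuda_graph_config=CudaGraphConfig() if cuda_graph else None. The sketch below condenses only the gating pattern; the torch-based stand-ins for the conftest helpers and the MiB interpretation of the thresholds are assumptions, not the real implementations.

import pytest
import torch

def get_device_count():
    # Stand-in for the conftest helper used in the tests above.
    return torch.cuda.device_count()

def get_device_memory():
    # Stand-in (assumption): memory of GPU 0 in MiB; the real helper's exact
    # semantics behind thresholds like 80000 / 140000 / 270000 may differ.
    return torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)

def maybe_skip_for_topology(tp_size, pp_size, min_memory_mib=None):
    # Skip when the node cannot satisfy the requested parallel mapping,
    # mirroring the checks added in test_auto_dtype / test_fp8 above.
    if get_device_count() != tp_size * pp_size:
        pytest.skip("Device count mismatch with world size")
    if min_memory_mib is not None and get_device_memory() < min_memory_mib:
        pytest.skip("Not enough memory for this test")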
9 changes: 9 additions & 0 deletions tests/integration/test_lists/qa/llm_function_full.txt
@@ -467,18 +467,27 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
16 changes: 16 additions & 0 deletions tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -71,17 +71,33 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagl
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
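The bracketed suffixes in the new list entries (for example tp4ep2-cuda_graph=True) are pytest-generated IDs: the innermost tp_size,pp_size,ep_size parametrization, with its explicit ids=, contributes the first segment and the cuda_graph parametrization the second. The illustration below reproduces that ID layout using plain pytest.mark.parametrize in place of the suite's parametrize_with_ids helper, so the decorator and id-callable here are assumptions for demonstration only.

import pytest

@pytest.mark.parametrize("cuda_graph", [False, True],
                         ids=lambda v: f"cuda_graph={v}")
@pytest.mark.parametrize("tp_size,pp_size,ep_size",
                         [(4, 1, 1), (4, 1, 2), (4, 1, 4)],
                         ids=["tp4", "tp4ep2", "tp4ep4"])
def test_id_layout(cuda_graph, tp_size, pp_size, ep_size):
    # Collected as test_id_layout[tp4-cuda_graph=False],
    # test_id_layout[tp4ep2-cuda_graph=True], and so on.
    assert tp_size * pp_size == 4

Collecting this module with pytest --collect-only lists the same tp*/cuda_graph ID shapes as the entries added to the two test lists above.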