
Commit e2dc37a

add llama4 tp4 tests
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent: daa2a65

5 files changed: +80, -21 lines


tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@ meta-llama/Llama-3.3-70B-Instruct:
   accuracy: 84.08
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
 - accuracy: 92.20
+- quant_algo: FP8
+  kv_cache_quant_algo: FP8
+  accuracy: 90.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
 - accuracy: 89.70
 - quant_algo: NVFP4

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,9 @@ meta-llama/Llama-3.3-70B-Instruct:
   accuracy: 80.34
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
 - accuracy: 86.40
+- quant_algo: FP8
+  kv_cache_quant_algo: FP8
+  accuracy: 86.40
 - quant_algo: FP8
   kv_cache_quant_algo: FP8
   spec_dec_algo: Eagle
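
Note: both reference files share the same shape: each model name maps to a list of entries, and an entry is selected by its optional quant_algo / kv_cache_quant_algo (and spec_dec_algo) fields before its accuracy threshold is read. The snippet below is only an illustrative sketch of that lookup; lookup_reference is hypothetical and is not the repo's accuracy_core implementation.

import yaml

def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None,
                     spec_dec_algo=None):
    # Hypothetical helper: return the accuracy threshold of the entry whose
    # quantization / speculative-decoding fields match.
    with open(path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo
                and entry.get("spec_dec_algo") == spec_dec_algo):
            return entry["accuracy"]
    return None

# The FP8 Maverick entry added above would resolve to 86.40 on MMLU:
print(lookup_reference("mmlu.yaml",
                       "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
                       quant_algo="FP8", kv_cache_quant_algo="FP8"))
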

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 48 additions & 20 deletions
@@ -27,7 +27,8 @@
                           SamplingParams, TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper,
+from ..conftest import (get_device_count, get_device_memory, llm_models_root,
+                        parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
@@ -509,19 +510,26 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Maverick-17B-128E-Instruct"
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                          (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 270000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         with LLM(
                 self.MODEL_PATH,
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
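
Note: the new tp4 variants reuse the 8-GPU test bodies, so instead of a hard skip_less_mpi_world_size(8) marker they guard at runtime on the visible device count and memory. Below is a minimal, self-contained sketch of that guard pattern; the two conftest helpers are stubbed with a plausible torch.cuda backing (the real ones live in the tests' conftest.py, and the MB unit is an assumption).

import pytest
import torch

def get_device_count() -> int:
    # Stub: number of visible GPUs.
    return torch.cuda.device_count()

def get_device_memory() -> int:
    # Stub: per-device memory in MB (assumed unit, matching
    # skip_less_device_memory(80000) used for 80 GB GPUs).
    if not torch.cuda.is_available():
        return 0
    return torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)

@pytest.mark.parametrize("tp_size,pp_size,ep_size",
                         [(8, 1, 8), (4, 1, 2), (4, 1, 4)],
                         ids=["tp8ep8", "tp4ep2", "tp4ep4"])
def test_guard_pattern(tp_size, pp_size, ep_size):
    # Same guard as test_auto_dtype above: run only when the GPU count matches
    # the requested world size, otherwise skip instead of failing.
    if get_device_count() != tp_size * pp_size:
        pytest.skip("Device count mismatch with world size")
    # ...build the LLM and run the accuracy task here...
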
@@ -547,20 +555,27 @@ def test_chunked_prefill(self, attn_backend):
         task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_device_memory(80000)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                          (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 140000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         with LLM(
                 f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -583,7 +598,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -622,16 +638,21 @@ def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
         task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_host_memory(200000)
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                          (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
         with LLM(
                 model_path,
@@ -648,11 +669,13 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
         with LLM(
                 model_path,
@@ -661,6 +684,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
@@ -670,11 +694,13 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
         with LLM(
                 model_path,
@@ -706,7 +732,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -715,7 +742,7 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
                              ids=["tp4ep4"])
@@ -728,7 +755,8 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=22000,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
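
Note: for reference, the engine configuration exercised by one of the new tp4 variants (tp4ep2 on the bf16 Maverick checkpoint) boils down to the call below. This is a sketch assembled only from the keyword arguments visible in the diff above; the model path placeholder and the import paths are assumptions, and the test module's own imports remain authoritative.

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig  # assumed import path

llm = LLM(
    "<llm_models_root>/llama4-models/Llama-4-Maverick-17B-128E-Instruct",  # placeholder
    tensor_parallel_size=4,
    pipeline_parallel_size=1,
    moe_expert_parallel_size=2,
    # Keep this low to avoid warmup OOM in CI
    max_seq_len=8192,
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
    cuda_graph_config=CudaGraphConfig(),  # pass None when CUDA graphs are disabled
)
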

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 10 additions & 1 deletion
@@ -461,18 +461,27 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
@@ -481,7 +490,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_pref
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
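
Note: each line in these QA lists is a pytest node id, so any of the new tp4 cases can be run on its own. A sketch, assuming the tests' integration defs directory as the working directory and omitting any extra CI options:

import pytest

# Run one newly added case by its node id, exactly as listed above.
pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::"
    "test_auto_dtype[tp4ep2-cuda_graph=True]",
    "-q",
])
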

tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 16 additions & 0 deletions
@@ -71,17 +71,33 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagl
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
