                                  SamplingParams, TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper,
+from ..conftest import (get_device_count, get_device_memory, llm_models_root,
+                        parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
@@ -509,19 +510,26 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Maverick-17B-128E-Instruct"
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 270000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         with LLM(
                 self.MODEL_PATH,
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=kv_cache_config,
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
@@ -547,20 +555,27 @@ def test_chunked_prefill(self, attn_backend):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_device_memory(80000)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_memory() < 140000 and get_device_count() < 8:
+            pytest.skip("Not enough memory for this test")
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         with LLM(
                 f"{llm_models_root()}/llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -583,7 +598,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -622,16 +638,21 @@ def test_fp8_eagle3(self, tp_size, pp_size, ep_size, torch_compile):
         task.evaluate(llm)
 
 
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_host_memory(200000)
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [False, True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
-                                                         (8, 1, 8)],
-                             ids=["tp8", "tp8ep4", "tp8ep8"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4), (8, 1, 8), (4, 1, 1),
+                                    (4, 1, 2), (4, 1, 4)],
+        ids=["tp8", "tp8ep4", "tp8ep8", "tp4", "tp4ep2", "tp4ep4"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
         with LLM(
                 model_path,
@@ -648,11 +669,13 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
         with LLM(
                 model_path,
@@ -661,6 +684,7 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
                 cuda_graph_config=CudaGraphConfig()
                 if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
@@ -670,11 +694,13 @@ def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
                              ids=["tp8ep8", "tp4"])
     def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
+        if get_device_count() != tp_size * pp_size:
+            pytest.skip("Device count mismatch with world size")
+
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
         with LLM(
                 model_path,
@@ -706,7 +732,8 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
@@ -715,7 +742,7 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
                              ids=["tp4ep4"])
@@ -728,7 +755,8 @@ def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=22000,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
-                use_cuda_graph=cuda_graph) as llm:
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)