From 8de63d833e0c51e2b47fe862593ba5d08ca6cd3f Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Mon, 18 Aug 2025 06:26:49 -0700
Subject: [PATCH 1/5] fix qwen3 235b eagle3 ci

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py       | 46 ++++++++++++++++++-
 .../test_lists/qa/llm_function_full.txt         |  2 +-
 .../test_lists/qa/llm_function_sanity.txt       |  2 +-
 .../test-db/l0_gb200_multi_nodes.yml            |  1 -
 tests/integration/test_lists/waives.txt         |  1 -
 5 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 7af8c437d06..3e358616a16 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2445,15 +2445,15 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
-            (8, 1, 8, False, False, False, "TRTLLM", True),
         ],
         ids=[
             "latency_moe_cutlass", "latency_moe_trtllm",
-            "latency_moe_trtllm_eagle3"
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
+        if moe_backend == "TRITON":
+            pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2484,6 +2484,48 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (4, 1, 4, False, False, False, "TRTLLM", True), # TP8 hits a bug with the TRTLLM MoE backend and Eagle3
+        ],
+        ids=[
+            "latency_moe_trtllm_eagle3",
+        ],
+    )
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
+                  overlap_scheduler, moe_backend, eagle3):
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
+        with LLM(
+                f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                **pytorch_config,
+                enable_attention_dp=attention_dp,
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
 
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 9e6e12b4007..9c4dcaa5b2b 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index c977a77d3c2..7aec01a9824 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index 9c04ad70906..857319c44c2 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -19,4 +19,3 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index db4f9198552..0dd528f6893 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)

From 7d705795136b7533194104d71dfd5e5f580a0a98 Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Mon, 18 Aug 2025 06:29:32 -0700
Subject: [PATCH 2/5] remove unused code

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 3e358616a16..c34564b42d9 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2452,8 +2452,6 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
-        if moe_backend == "TRITON":
-            pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

From c92c814e874c9a4a3a6fb31742411d4d61d5f5c2 Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Mon, 18 Aug 2025 14:08:26 +0000
Subject: [PATCH 3/5] fix pre-commit failures

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 .../integration/defs/accuracy/test_llm_api_pytorch.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c34564b42d9..8d5b7c08c67 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2447,7 +2447,8 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             (8, 1, 8, True, True, True, "TRTLLM", False),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2488,14 +2488,15 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
         [
-            (4, 1, 4, False, False, False, "TRTLLM", True), # TP8 hits a bug with the TRTLLM MoE backend and Eagle3
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 hits a bug with the TRTLLM MoE backend and Eagle3
         ],
         ids=[
             "latency_moe_trtllm_eagle3",
         ],
     )
-    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                  overlap_scheduler, moe_backend, eagle3):
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2525,6 +2527,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"

From d69b2adfc3ddf241598801af50d7700c57a73fb4 Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Wed, 20 Aug 2025 14:44:46 +0000
Subject: [PATCH 4/5] move qwen3 235b 4gpus test to l0_gb200

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 tests/integration/test_lists/test-db/l0_gb200.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml
index ac39fbdc88c..7d1cc92fef5 100644
--- a/tests/integration/test_lists/test-db/l0_gb200.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200.yml
@@ -69,3 +69,4 @@ l0_gb200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

From 1127dac198ab60413b3790d26c1ddb614117200f Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Thu, 21 Aug 2025 04:52:05 +0000
Subject: [PATCH 5/5] add qwen3 235b tp8ep8 eagle3 test back

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 8d5b7c08c67..bb2a368bcb4 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2445,10 +2445,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
+            (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
             "latency_moe_cutlass",
             "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3",
        ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
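
Note for reviewers: the snippet below condenses the Eagle3 configuration pattern that the new test_nvfp4_4gpus case exercises. It is a minimal sketch assembled from the diff above, not part of the patch series; the import paths and the /models/... checkpoint locations are assumptions standing in for llm_models_root() on a real test machine.

    # Minimal sketch of the Eagle3 + NVFP4 setup from test_nvfp4_4gpus.
    # Import paths and model locations are assumptions, not from the patch.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
                                     MoeConfig)

    # The test disables KV-cache block reuse whenever Eagle3 is on
    # (enable_block_reuse=not eagle3), hence False here.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
                                    enable_block_reuse=False)

    # One-model Eagle3 with two draft tokens per step; the draft weights
    # live in a separate checkpoint directory (placeholder path).
    spec_config = EagleDecodingConfig(
        max_draft_len=2,
        speculative_model_dir="/models/Qwen3/qwen3-235B-eagle3/",
        eagle3_one_model=True)

    # TP4/EP4 with the TRTLLM MoE backend; CUDA graphs, attention DP, and
    # the overlap scheduler stay off, mirroring latency_moe_trtllm_eagle3.
    llm = LLM("/models/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
              tensor_parallel_size=4,
              pipeline_parallel_size=1,
              moe_expert_parallel_size=4,
              disable_overlap_scheduler=True,
              cuda_graph_config=None,
              moe_config=MoeConfig(backend="TRTLLM"),
              enable_attention_dp=False,
              kv_cache_config=kv_cache_config,
              speculative_config=spec_config)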