From 8de63d833e0c51e2b47fe862593ba5d08ca6cd3f Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Mon, 18 Aug 2025 06:26:49 -0700
Subject: [PATCH 1/5] fix qwen3 235b eagle3 ci

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py       | 46 ++++++++++++++++++-
 .../test_lists/qa/llm_function_full.txt         |  2 +-
 .../test_lists/qa/llm_function_sanity.txt       |  2 +-
 .../test-db/l0_gb200_multi_nodes.yml            |  1 -
 tests/integration/test_lists/waives.txt         |  1 -
 5 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 7af8c437d06..3e358616a16 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2445,15 +2445,15 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
-            (8, 1, 8, False, False, False, "TRTLLM", True),
         ],
         ids=[
             "latency_moe_cutlass", "latency_moe_trtllm",
-            "latency_moe_trtllm_eagle3"
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
+        if moe_backend == "TRITON":
+            pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2484,6 +2484,48 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (4, 1, 4, False, False, False, "TRTLLM", True), # TP8 hits a bug with the TRTLLM MoE backend and Eagle3
+        ],
+        ids=[
+            "latency_moe_trtllm_eagle3",
+        ],
+    )
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
+                  overlap_scheduler, moe_backend, eagle3):
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
+        with LLM(
+                f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                **pytorch_config,
+                enable_attention_dp=attention_dp,
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
 
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 9e6e12b4007..9c4dcaa5b2b 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index c977a77d3c2..7aec01a9824 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index 9c04ad70906..857319c44c2 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -19,4 +19,3 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index db4f9198552..0dd528f6893 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)

From 7d705795136b7533194104d71dfd5e5f580a0a98 Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Mon, 18 Aug 2025 06:29:32 -0700
Subject: [PATCH 2/5] remove unused code

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 3e358616a16..c34564b42d9 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2452,8 +2452,6 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
-        if moe_backend == "TRITON":
-            pytest.skip("Triton kernels are not available")
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))

From c92c814e874c9a4a3a6fb31742411d4d61d5f5c2 Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Mon, 18 Aug 2025 14:08:26 +0000
Subject: [PATCH 3/5] fix pre-commit failures

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 .../integration/defs/accuracy/test_llm_api_pytorch.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c34564b42d9..8d5b7c08c67 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2447,7 +2447,8 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             (8, 1, 8, True, True, True, "TRTLLM", False),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2488,14 +2488,15 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
         [
-            (4, 1, 4, False, False, False, "TRTLLM", True), # TP8 hits a bug with the TRTLLM MoE backend and Eagle3
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 hits a bug with the TRTLLM MoE backend and Eagle3
         ],
         ids=[
             "latency_moe_trtllm_eagle3",
         ],
     )
-    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                  overlap_scheduler, moe_backend, eagle3):
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2525,6 +2527,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"

From d69b2adfc3ddf241598801af50d7700c57a73fb4 Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Wed, 20 Aug 2025 14:44:46 +0000
Subject: [PATCH 4/5] move qwen3 235b 4gpus test to l0_gb200

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 tests/integration/test_lists/test-db/l0_gb200.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml
index ac39fbdc88c..7d1cc92fef5 100644
--- a/tests/integration/test_lists/test-db/l0_gb200.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200.yml
@@ -69,3 +69,4 @@ l0_gb200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

From 1127dac198ab60413b3790d26c1ddb614117200f Mon Sep 17 00:00:00 2001
From: bhsueh <11360707+byshiue@users.noreply.github.com>
Date: Thu, 21 Aug 2025 04:52:05 +0000
Subject: [PATCH 5/5] add qwen3 235b tp8ep8 eagle3 test back

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 8d5b7c08c67..bb2a368bcb4 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2445,10 +2445,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
+            (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
             "latency_moe_cutlass",
             "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3",
        ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
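
Note for reviewers: the snippet below condenses the Eagle3 configuration pattern that the new test_nvfp4_4gpus case exercises. It is a minimal sketch assembled from the diff above, not part of the patch series; the import paths and the /models/... checkpoint locations are assumptions standing in for llm_models_root() on a real test machine.

    # Minimal sketch of the Eagle3 + NVFP4 setup from test_nvfp4_4gpus.
    # Import paths and model locations are assumptions, not from the patch.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
                                     MoeConfig)

    # The test disables KV-cache block reuse whenever Eagle3 is on
    # (enable_block_reuse=not eagle3), hence False here.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
                                    enable_block_reuse=False)

    # One-model Eagle3 with two draft tokens per step; the draft weights
    # live in a separate checkpoint directory (placeholder path).
    spec_config = EagleDecodingConfig(
        max_draft_len=2,
        speculative_model_dir="/models/Qwen3/qwen3-235B-eagle3/",
        eagle3_one_model=True)

    # TP4/EP4 with the TRTLLM MoE backend; CUDA graphs, attention DP, and
    # the overlap scheduler stay off, mirroring latency_moe_trtllm_eagle3.
    llm = LLM("/models/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
              tensor_parallel_size=4,
              pipeline_parallel_size=1,
              moe_expert_parallel_size=4,
              disable_overlap_scheduler=True,
              cuda_graph_config=None,
              moe_config=MoeConfig(backend="TRTLLM"),
              enable_attention_dp=False,
              kv_cache_config=kv_cache_config,
              speculative_config=spec_config)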