33 changes: 33 additions & 0 deletions examples/models/core/qwen/README.md
@@ -26,6 +26,7 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
- [Serving](#serving)
- [trtllm-serve](#trtllm-serve)
- [Disaggregated Serving](#disaggregated-serving)
- [Eagle3](#eagle3)
- [Dynamo](#dynamo)
- [Notes and Troubleshooting](#notes-and-troubleshooting)
- [Credits](#credits)
@@ -891,6 +892,38 @@ Note that the optimal disaggregated serving configuration (i.e. tp/pp/ep mapping
on the request parameters, the number of concurrent requests and the GPU type. It is recommended to experiment to identify optimal
settings for your specific use case.

#### Eagle3

Qwen3 now supports Eagle3 speculative decoding. To enable Eagle3 on Qwen3, set the following options when running `trtllm-bench` or `trtllm-serve`:

- `speculative_config.decoding_type: Eagle`
Set the decoding type to "Eagle" to enable Eagle3 speculative decoding.
- `speculative_config.max_draft_len: 3`
Set the maximum number of draft tokens generated per step (this value can be adjusted as needed).
- `speculative_config.speculative_model_dir: <EAGLE3_DRAFT_MODEL_PATH>`
Specify the path to the Eagle3 draft model (ensure the corresponding draft model weights are prepared).

Currently, there are some limitations when enabling Eagle3:

1. `attention_dp` is not supported. Leave the related flag unset or set `enable_attention_dp: false` explicitly (it is disabled by default).
2. `enable_block_reuse` requires the target model and the draft model to use the same KV cache type. Since the draft model only supports FP16/BF16, you need to disable `enable_block_reuse` when the target model uses an FP8 KV cache.

Example snippet that appends an Eagle3 configuration to your `extra-llm-api-config.yml` (referenced below as `${path_config}`):

```bash
echo "
enable_attention_dp: false
speculative_config:
decoding_type: Eagle
max_draft_len: 3
speculative_model_dir: <EAGLE3_DRAFT_MODEL_PATH>
kv_cache_config:
enable_block_reuse: false
" >> ${path_config}
```
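The configuration file can then be passed at launch time. The following is a minimal sketch, not taken from this PR: it assumes the snippet above was written to `${path_config}`, uses `<QWEN3_MODEL_PATH>` as a placeholder for your Qwen3 checkpoint, and relies on the `--backend`/`--extra_llm_api_options` flags used by the serving examples earlier in this document:

```bash
# Minimal launch sketch (assumptions: <QWEN3_MODEL_PATH> is a placeholder for your
# Qwen3 checkpoint; flags mirror the trtllm-serve examples earlier in this guide).
trtllm-serve <QWEN3_MODEL_PATH> \
    --backend pytorch \
    --extra_llm_api_options ${path_config}
```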

For further details, please refer to [speculative-decoding.md](../../../../docs/source/advanced/speculative-decoding.md).

### Dynamo

NVIDIA Dynamo is a high-throughput low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments.
4 changes: 4 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -86,6 +86,10 @@ Qwen3/Qwen3-235B-A22B:
- quant_algo: NVFP4
kv_cache_quant_algo: FP8
accuracy: 85.78
- spec_dec_algo: Eagle
quant_algo: NVFP4
kv_cache_quant_algo: FP8
accuracy: 85.78
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
- accuracy: 92.57
- quant_algo: FP8
4 changes: 4 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -170,6 +170,10 @@ Qwen3/Qwen3-235B-A22B:
- quant_algo: NVFP4
kv_cache_quant_algo: FP8
accuracy: 86
- spec_dec_algo: Eagle
quant_algo: NVFP4
kv_cache_quant_algo: FP8
accuracy: 86
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
- accuracy: 79.43
- quant_algo: FP8
30 changes: 23 additions & 7 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1888,28 +1888,44 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@skip_pre_blackwell
@pytest.mark.skip_less_mpi_world_size(8)
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
[(8, 1, 8, True, True, True, "CUTLASS"),
(8, 1, 8, True, True, True, "TRTLLM")],
ids=["latency_moe_cutlass", "latency_moe_trtllm"],
"tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
[
(8, 1, 8, True, True, True, "CUTLASS", False),
(8, 1, 8, True, True, True, "TRTLLM", False),
(8, 1, 8, False, False, False, "TRTLLM", True),
],
ids=[
"latency_moe_cutlass", "latency_moe_trtllm",
"latency_moe_trtllm_eagle3"
],
)
def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
overlap_scheduler, moe_backend):
overlap_scheduler, moe_backend, eagle3):

pytorch_config = dict(
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
moe_config=MoeConfig(backend=moe_backend))

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
enable_block_reuse=not eagle3)
spec_config = None
if eagle3:
spec_config = EagleDecodingConfig(
max_draft_len=2,
speculative_model_dir=
f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
eagle3_one_model=True)
with LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp,
kv_cache_config=kv_cache_config) as llm:
kv_cache_config=kv_cache_config,
speculative_config=spec_config) as llm:

task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
@@ -18,3 +18,4 @@ l0_gb200_multi_nodes:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -393,7 +393,6 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5320234)
examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5374145)
stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646)
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5376087)
full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5375966)