diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md
index f5177a8d2d6..80bf337a1a9 100644
--- a/examples/models/core/qwen/README.md
+++ b/examples/models/core/qwen/README.md
@@ -26,6 +26,7 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
 - [Serving](#serving)
   - [trtllm-serve](#trtllm-serve)
   - [Disaggregated Serving](#disaggregated-serving)
+  - [Eagle3](#eagle3)
   - [Dynamo](#dynamo)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
 - [Credits](#credits)
@@ -891,6 +892,38 @@ Note that the optimal disaggregated serving configuration (i.e. tp/pp/ep mapping
 on the request parameters, the number of concurrent requests and the GPU type.
 It is recommended to experiment to identify optimal settings for your specific use case.
 
+#### Eagle3
+
+Qwen3 now supports Eagle3 speculative decoding. To enable Eagle3 on Qwen3, set the following options when running `trtllm-bench` or `trtllm-serve`:
+
+- `speculative_config.decoding_type: Eagle`
+  Set the decoding type to "Eagle" to enable Eagle3 speculative decoding.
+- `speculative_config.max_draft_len: 3`
+  Set the maximum number of draft tokens generated per step (adjust this value as needed).
+- `speculative_config.speculative_model_dir: `
+  Specify the path to the Eagle3 draft model (make sure the corresponding draft model weights are available at this path).
+
+Eagle3 currently has the following limitations:
+
+1. `attention_dp` is not supported. Leave it disabled, which is the default, or set `enable_attention_dp: false` explicitly.
+2. `enable_block_reuse` requires the target model and the draft model to use the same KV cache type. Since the draft model only supports FP16/BF16, disable `enable_block_reuse` when using an FP8 KV cache.
+
+Example `extra-llm-api-config.yml` snippet for Eagle3:
+
+```bash
+echo "
+enable_attention_dp: false
+speculative_config:
+  decoding_type: Eagle
+  max_draft_len: 3
+  speculative_model_dir: 
+kv_cache_config:
+  enable_block_reuse: false
+" >> ${path_config}
+```
+
+For further details, please refer to [speculative-decoding.md](../../../../docs/source/advanced/speculative-decoding.md).
+
 ### Dynamo
 
 NVIDIA Dynamo is a high-throughput low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments.
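For context on the README addition above, here is a minimal launch sketch showing how the Eagle3 options might be written to a config file and passed to `trtllm-serve`. The `--extra_llm_api_options`, `--backend`, and `--tp_size` flags are assumed from other TensorRT-LLM serving examples, and all paths are hypothetical placeholders, not part of this change:

```bash
# Hypothetical paths -- substitute your own target model and Eagle3 draft checkpoints.
path_config=./extra-llm-api-config.yml

# Write the Eagle3 options described in the README section above.
cat > ${path_config} <<EOF
enable_attention_dp: false
speculative_config:
  decoding_type: Eagle
  max_draft_len: 3
  speculative_model_dir: /path/to/qwen3-eagle3-draft
kv_cache_config:
  enable_block_reuse: false
EOF

# Launch the OpenAI-compatible server with the extra options file
# (assumes trtllm-serve accepts --extra_llm_api_options, as in other TRT-LLM examples).
trtllm-serve /path/to/Qwen3-235B-A22B \
  --backend pytorch \
  --tp_size 8 \
  --extra_llm_api_options ${path_config}
```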
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index dbbd6eb79f4..b001554c251 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -86,6 +86,10 @@ Qwen3/Qwen3-235B-A22B:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+  - spec_dec_algo: Eagle
+    quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 85.78
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index d86ebb0ce39..cd19aa0e835 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -170,6 +170,10 @@ Qwen3/Qwen3-235B-A22B:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 86
+  - spec_dec_algo: Eagle
+    quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 86
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ce1e1cc1367..bde1f20228f 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1888,20 +1888,34 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
-        [(8, 1, 8, True, True, True, "CUTLASS"),
-         (8, 1, 8, True, True, True, "TRTLLM")],
-        ids=["latency_moe_cutlass", "latency_moe_trtllm"],
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (8, 1, 8, True, True, True, "CUTLASS", False),
+            (8, 1, 8, True, True, True, "TRTLLM", False),
+            (8, 1, 8, False, False, False, "TRTLLM", True),
+        ],
+        ids=[
+            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3"
+        ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                   overlap_scheduler, moe_backend):
+                   overlap_scheduler, moe_backend, eagle3):
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
 
         with LLM(
                 f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
                 tensor_parallel_size=tp_size,
@@ -1909,7 +1923,9 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                 moe_expert_parallel_size=ep_size,
                 **pytorch_config,
                 enable_attention_dp=attention_dp,
-                kv_cache_config=kv_cache_config) as llm:
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index 0aa3e9e5fb8..ba60bd1c2f0 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -18,3 +18,4 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 43889db226e..aeae25cd65f 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -393,7 +393,6 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
 test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5320234)
-examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5374145)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5376087)
 full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5375966)
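To exercise the new `latency_moe_trtllm_eagle3` variant locally, a minimal sketch of the pytest invocation, assuming an 8-GPU node, that the model assets resolved by `llm_models_root()` are available (typically via the `LLM_MODELS_ROOT` environment variable used by the integration tests), and that tests are run from `tests/integration/defs` as the test-db entries suggest:

```bash
# Run only the new Eagle3 variant of the Qwen3-235B NVFP4 accuracy test.
# Requires 8 GPUs plus the NVFP4 target and Eagle3 draft checkpoints under the models root.
cd tests/integration/defs
pytest -v \
  "accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]"
```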