From 02f81f65f76bc6266e811862aed629ea75ef6288 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Thu, 28 Aug 2025 17:30:18 +0800
Subject: [PATCH 1/2] add ds v3 bf16 chunked_prefill on hopper

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +-
 tests/integration/test_lists/qa/llm_function_core.txt   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 999e9cf15bc..333e2dda1c9 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1138,7 +1138,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     # Chunked Prefill for MLA can only be enabled on SM100
     @parametrize_with_ids(
         "enable_chunked_prefill",
-        [False, pytest.param(True, marks=skip_pre_blackwell)])
+        [False, pytest.param(True, marks=skip_pre_hopper)])
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False), (True, False, False),
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index 3258266b142..a08f9bff478 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -461,6 +461,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_pref
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]

From a1bbdb6b5afd15210ad67999736f0fb299f21608 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Fri, 29 Aug 2025 14:09:18 +0800
Subject: [PATCH 2/2] add chunked_prefill + mtp cases on deepseekr1

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 107 ++++++++++++++++++
 .../test_lists/qa/llm_function_core.txt   |   4 +
 .../qa/llm_function_core_sanity.txt       |   5 +
 3 files changed, 116 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 333e2dda1c9..b5ad70d7bf6 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1947,6 +1947,68 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         # task.evaluate(llm,
         #               extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @skip_pre_blackwell
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        [
+            # Use a larger batch_size to speed up the tests
+            pytest.param(8,
+                         1,
+                         4,
+                         3,
+                         False,
+                         False,
+                         True,
+                         True,
+                         32,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(4,
+                         1,
+                         1,
+                         0,
+                         True,
+                         True,
+                         True,
+                         True,
+                         16,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(4)),
+        ],
+        ids=["latency", "throughput_tp4"])
+    def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                              mtp_nextn, fp8kv, attention_dp,
+                                              cuda_graph, overlap_scheduler,
+                                              max_batch_size, moe_backend):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True) as llm:
+
+            assert llm.args.moe_config.backend == moe_backend
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_mpi_world_size(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
@@ -1993,6 +2055,51 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @skip_pre_hopper
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
+        [(8, 1, 4, 3, False, False, True, True, 1),
+         (8, 1, 8, 0, True, True, True, True, 24)],
+        ids=["latency", "throughput"])
+    def test_fp8_blockscale_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                            mtp_nextn, fp8kv, attention_dp,
+                                            cuda_graph, overlap_scheduler,
+                                            max_batch_size):
+        if get_sm_version() == 100:
+            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        else:
+            moe_config = MoeConfig()
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=moe_config,
+        )
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.timeout(7200)
     @pytest.mark.skip_less_device_memory(100000)
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index a08f9bff478..fea4053fdd7 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -486,7 +486,11 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[throughput_tp4]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]
diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
index 61c6f75837f..87ebd731e80 100644
--- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
@@ -37,6 +37,11 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_chunked_prefill[throughput_tp4]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]