diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
index d95ca1b412b..5e7e56af2c8 100644
--- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
+++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -847,16 +847,23 @@ void CacheFormatter::unformat(TransferSession& session)
     }
     int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
     int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
+    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
+    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+
+    if (selfPPSize == destPPSize)
+    {
+        return true;
+    }
     if (selfNumLayers % selfPPSize != 0)
     {
-        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism");
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism %d",
+            selfNumLayers, selfPPSize);
         return false;
     }
-    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
-    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
     if (destNumLayers % destPPSize != 0)
     {
-        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism");
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism %d",
+            destNumLayers, destPPSize);
         return false;
     }
     return true;
diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
index 824a31129f8..8e083db6751 100644
--- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
+++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
@@ -591,6 +591,28 @@ void MLACacheFormatter::unformat(TransferSession& session)
         return false;
     }
+    int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+    int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
+    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
+    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+
+    if (selfPPSize == destPPSize)
+    {
+        return true;
+    }
+    if (selfNumLayers % selfPPSize != 0)
+    {
+        TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism %d",
+            selfNumLayers, selfPPSize);
+        return false;
+    }
+    if (destNumLayers % destPPSize != 0)
+    {
+        TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism %d",
+            destNumLayers, destPPSize);
+        return false;
+    }
+
     return true;
 }
 
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index aacf0f75a96..2088ccaad79 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -663,13 +663,14 @@ def test_disaggregated_ctxtp2pp2_gentp2pp2(disaggregated_test_root, llm_venv,
 
 
 @pytest.mark.skip_less_device(8)
-@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
+@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
+                         indirect=True)
 def test_disaggregated_ctxpp4_genpp4(disaggregated_test_root, llm_venv,
                                      disaggregated_example_root,
                                      llama_model_root):
     src_dst_dict = {
         llama_model_root:
-        f"{llm_venv.get_working_directory()}/llama-3.1-models/Meta-Llama-3.1-8B",
+        f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     }
     for src, dst in src_dst_dict.items():
         if not os.path.islink(dst):
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 4f707478e70..a82af88c6b0 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -678,7 +678,7 @@ disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
 disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
-disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b]
+disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index 80cb845b291..42667225456 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -30,7 +30,7 @@ l0_dgx_h200:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
-  - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b]
+  - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora