From ec69aa88fd01b8845bdf9261876241bbc3ebde7e Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Tue, 13 May 2025 11:27:36 +0000
Subject: [PATCH 1/4] fix: XQA is not enabled when history_length < kMinHistoryTokensPerBlock. chore: Print the reason if XQA is not used.

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 18 ++++++++++++++++--
 .../decoderXQAImplPrecompiled.cpp           | 14 ++++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
index d06fa9675f1..bea382bf57d 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
@@ -79,7 +79,8 @@ bool DecoderXQAImplJIT::mayHavePerfGain(XQAParams const& xqaParams) const
     if (xqaParams.multi_block_mode)
     {
         int history_length = xqaParams.max_past_kv_length;
-        multi_block_count = history_length / kMinHistoryTokensPerBlock;
+        // Always use at least 1 block regardless of history length
+        multi_block_count = std::max(1, history_length / kMinHistoryTokensPerBlock);
     }
     int block_count = num_kv_heads * batch_size * multi_block_count;
     return static_cast<float>(block_count) * kEnableMinBlockFactor >= static_cast<float>(mRunner->mMultiProcessorCount);
@@ -98,12 +99,25 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo
                 return true;
             }
         }
+        TLLM_LOG_WARNING("JIT XQA is not used: no supported configuration found for any beam_width");
         return false;
     }
     else
     {
         auto const& xqaParams = umbrellaXQAParams;
-        return supportConfig(xqaParams, forConfigurePlugin) && mayHavePerfGain(xqaParams);
+        bool isConfigSupported = supportConfig(xqaParams, forConfigurePlugin);
+        if (!isConfigSupported)
+        {
+            TLLM_LOG_WARNING("JIT XQA is not used: unsupported configuration");
+            return false;
+        }
+        bool hasPerfGain = mayHavePerfGain(xqaParams);
+        if (!hasPerfGain)
+        {
+            TLLM_LOG_WARNING("JIT XQA is not used: no performance gain");
+            return false;
+        }
+        return true;
     }
 }
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
index c24505e60ce..56e16599f42 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
@@ -438,6 +438,7 @@ void DecoderXQAImplPrecompiled::runDispatchBuffer(
 
 #define SUPPORT_RETURN_FALSE(X) \
     { \
+        TLLM_LOG_WARNING("XQA is not used. Reason: %s", X); \
         return false; \
     }
 
@@ -522,8 +523,17 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo
     }
 
     XQAKernelList const* xqa_kernel = getXQAKernels(mRunner->mDataType, tensorrt_llm::common::getSMVersion());
-    return xqa_kernel->supportConfig(xqaParams)
-        && xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount);
+    bool supportConfig = xqa_kernel->supportConfig(xqaParams);
+    if (!supportConfig)
+    {
+        SUPPORT_RETURN_FALSE("supportConfig");
+    }
+    bool mayHavePerfGain = xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount);
+    if (!mayHavePerfGain)
+    {
+        SUPPORT_RETURN_FALSE("mayHavePerfGain");
+    }
+    return true;
 }
 
 #undef SUPPORT_RETURN_FALSE
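
Context for the fix in patch 1: multi_block_count comes from an integer division, so any history_length below kMinHistoryTokensPerBlock used to produce multi_block_count == 0. That zeroed out block_count and made the perf-gain check (block_count * kEnableMinBlockFactor >= multiprocessor count) fail unconditionally, silently disabling XQA for short histories. The standalone C++ sketch below reproduces the clamped heuristic; the constant values and the free-function signature are illustrative placeholders, not the definitions in TensorRT-LLM.

    #include <algorithm>
    #include <cstdio>

    // Placeholder values for illustration only; the real constants live in
    // the TensorRT-LLM sources.
    constexpr int kMinHistoryTokensPerBlock = 512;
    constexpr float kEnableMinBlockFactor = 4.0f;

    bool mayHavePerfGain(int numKvHeads, int batchSize, int historyLength, int smCount)
    {
        // Before the fix: historyLength < kMinHistoryTokensPerBlock made this 0,
        // which forced blockCount to 0 and the comparison below to always fail.
        int multiBlockCount = std::max(1, historyLength / kMinHistoryTokensPerBlock);
        int blockCount = numKvHeads * batchSize * multiBlockCount;
        return static_cast<float>(blockCount) * kEnableMinBlockFactor >= static_cast<float>(smCount);
    }

    int main()
    {
        // With the old formula, historyLength = 100 gave multiBlockCount = 0,
        // so XQA was rejected regardless of batch size or head count.
        std::printf("%d\n", mayHavePerfGain(8, 16, 100, 132)); // prints 1 with the clamp
        return 0;
    }
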
Reason: %s", X); \ return false; \ } @@ -522,8 +523,17 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo } XQAKernelList const* xqa_kernel = getXQAKernels(mRunner->mDataType, tensorrt_llm::common::getSMVersion()); - return xqa_kernel->supportConfig(xqaParams) - && xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount); + bool supportConfig = xqa_kernel->supportConfig(xqaParams); + if (!supportConfig) + { + SUPPORT_RETURN_FALSE("supportConfig"); + } + bool mayHavePerfGain = xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount); + if (!mayHavePerfGain) + { + SUPPORT_RETURN_FALSE("mayHavePerfGain"); + } + return true; } #undef SUPPORT_RETURN_FALSE From 6620b73c9cc17cd64fc5c3b1318d40ffc89d6f93 Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Wed, 14 May 2025 22:36:36 +0800 Subject: [PATCH 2/4] Change log level from warning to debug. Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 6 +++--- .../decoderXQAImplPrecompiled.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index bea382bf57d..a1941672a11 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -99,7 +99,7 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo return true; } } - TLLM_LOG_WARNING("JIT XQA is not used: no supported configuration found for any beam_width"); + TLLM_LOG_DEBUG("JIT XQA is not used: no supported configuration found for any beam_width"); return false; } else @@ -108,13 +108,13 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo bool isConfigSupported = supportConfig(xqaParams, forConfigurePlugin); if (!isConfigSupported) { - TLLM_LOG_WARNING("JIT XQA is not used: unsupported configuration"); + TLLM_LOG_DEBUG("JIT XQA is not used: unsupported configuration"); return false; } bool hasPerfGain = mayHavePerfGain(xqaParams); if (!hasPerfGain) { - TLLM_LOG_WARNING("JIT XQA is not used: no performance gain"); + TLLM_LOG_DEBUG("JIT XQA is not used: no performance gain"); return false; } return true; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 56e16599f42..bd245fa5b37 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -438,7 +438,7 @@ void DecoderXQAImplPrecompiled::runDispatchBuffer( #define SUPPORT_RETURN_FALSE(X) \ { \ - TLLM_LOG_WARNING("XQA is not used. Reason: %s", X); \ + TLLM_LOG_DEBUG("XQA is not used. Reason: %s", X); \ return false; \ } From eb2eb6e3c1139c7b156bf8c7ae8dff484959112d Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Tue, 10 Jun 2025 11:50:33 +0000 Subject: [PATCH 3/4] Fix the failed test. 
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 2 +-
 tests/unittest/llmapi/test_llm.py           | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
index a1941672a11..ecb57ea1ffa 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
@@ -114,7 +114,7 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo
         bool hasPerfGain = mayHavePerfGain(xqaParams);
         if (!hasPerfGain)
         {
-            TLLM_LOG_DEBUG("JIT XQA is not used: no performance gain");
+            TLLM_LOG_DEBUG("JIT XQA is not used: maybe no performance gain");
             return false;
         }
         return true;

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index bb44ba57d45..57e560eb313 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -3,6 +3,14 @@
 import gc
 import json
 import os
+
+# Required for test_generate_with_seed to pass.
+# See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
+# Note that currently functions like getEnvForceDeterministic are implemented using static variables,
+# which means they are initialized only once per C++ translation unit (and should be refactored to be non-static later).
+# Therefore, the following line must come before any tensorrt_llm imports.
+os.environ['FORCE_DETERMINISTIC'] = '1'
+
 import random
 import shutil
 import sys

From f0392bd9903fd8443245181d12ffd98b4d417d54 Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Tue, 10 Jun 2025 15:49:18 +0000
Subject: [PATCH 4/4] Need to use TRTLLM_FORCE_XQA.

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 tests/unittest/llmapi/test_llm.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 57e560eb313..6ebf12c0a3f 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -6,10 +6,12 @@
 
 # Required for test_generate_with_seed to pass.
 # See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
-# Note that currently functions like getEnvForceDeterministic are implemented using static variables,
-# which means they are initialized only once per C++ translation unit (and should be refactored to be non-static later).
-# Therefore, the following line must come before any tensorrt_llm imports.
-os.environ['FORCE_DETERMINISTIC'] = '1'
+# The following line must come before any tensorrt_llm imports,
+# since env util functions like getEnvForceDeterministic are currently implemented using static variables,
+# which means they are initialized only once, when the C++ translation unit is loaded (they should be refactored to be non-static later).
+os.environ['TRTLLM_FORCE_XQA'] = '1'
+# Note that we cannot use os.environ['FORCE_DETERMINISTIC'] = '1' here,
+# since it will disable KV cache reuse and make test_llm_api_draft_target fail.
 
 import random
 import shutil
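
A note on why patches 3 and 4 set the environment variable at the very top of test_llm.py: as the test comment states, the C++ env util functions cache their result in a static variable, so whatever value is observed at first use is frozen for the rest of the process, and setting the variable after tensorrt_llm has been imported (and the C++ extension loaded) has no effect. A minimal sketch of that caching pattern, using a hypothetical getEnvForceXQA helper modeled on the getEnvForceDeterministic mentioned in the comment, not the actual library code:

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical helper illustrating the static-caching pattern; the real
    // TensorRT-LLM functions differ in name and details.
    bool getEnvForceXQA()
    {
        // The function-local static is initialized exactly once, on the first
        // call; later changes to the environment are never observed.
        static bool const value = std::getenv("TRTLLM_FORCE_XQA") != nullptr;
        return value;
    }

    int main()
    {
        std::printf("%d\n", getEnvForceXQA()); // first call freezes the value
        setenv("TRTLLM_FORCE_XQA", "1", 1);    // POSIX setenv; too late to matter
        std::printf("%d\n", getEnvForceXQA()); // still prints the first result
        return 0;
    }
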