diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
index d06fa9675f1..ecb57ea1ffa 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
@@ -79,7 +79,8 @@ bool DecoderXQAImplJIT::mayHavePerfGain(XQAParams const& xqaParams) const
     if (xqaParams.multi_block_mode)
     {
         int history_length = xqaParams.max_past_kv_length;
-        multi_block_count = history_length / kMinHistoryTokensPerBlock;
+        // Always use at least 1 block, regardless of history length.
+        multi_block_count = std::max(1, history_length / kMinHistoryTokensPerBlock);
     }
     int block_count = num_kv_heads * batch_size * multi_block_count;
     return static_cast<float>(block_count) * kEnableMinBlockFactor >= static_cast<float>(mRunner->mMultiProcessorCount);
@@ -98,12 +99,25 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo
                 return true;
             }
         }
+        TLLM_LOG_DEBUG("JIT XQA is not used: no supported configuration found for any beam_width");
         return false;
     }
     else
     {
         auto const& xqaParams = umbrellaXQAParams;
-        return supportConfig(xqaParams, forConfigurePlugin) && mayHavePerfGain(xqaParams);
+        bool isConfigSupported = supportConfig(xqaParams, forConfigurePlugin);
+        if (!isConfigSupported)
+        {
+            TLLM_LOG_DEBUG("JIT XQA is not used: unsupported configuration");
+            return false;
+        }
+        bool hasPerfGain = mayHavePerfGain(xqaParams);
+        if (!hasPerfGain)
+        {
+            TLLM_LOG_DEBUG("JIT XQA is not used: no expected performance gain");
+            return false;
+        }
+        return true;
     }
 }
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
index c24505e60ce..bd245fa5b37 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
@@ -438,6 +438,7 @@ void DecoderXQAImplPrecompiled::runDispatchBuffer(
 
 #define SUPPORT_RETURN_FALSE(X) \
     { \
+        TLLM_LOG_DEBUG("XQA is not used. Reason: %s", X); \
         return false; \
     }
 
@@ -522,8 +523,17 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo
     }
 
     XQAKernelList const* xqa_kernel = getXQAKernels(mRunner->mDataType, tensorrt_llm::common::getSMVersion());
-    return xqa_kernel->supportConfig(xqaParams)
-        && xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount);
+    bool supportConfig = xqa_kernel->supportConfig(xqaParams);
+    if (!supportConfig)
+    {
+        SUPPORT_RETURN_FALSE("supportConfig");
+    }
+    bool mayHavePerfGain = xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount);
+    if (!mayHavePerfGain)
+    {
+        SUPPORT_RETURN_FALSE("mayHavePerfGain");
+    }
+    return true;
 }
 
 #undef SUPPORT_RETURN_FALSE
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index bb44ba57d45..6ebf12c0a3f 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -3,6 +3,16 @@
 import gc
 import json
 import os
+
+# Required for test_generate_with_seed to pass.
+# See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
+# The following line must come before any tensorrt_llm imports: env util functions such as
+# getEnvForceDeterministic are currently implemented using static variables, so they are
+# initialized only once, when the C++ translation unit is loaded (they should be refactored to be non-static later).
+os.environ['TRTLLM_FORCE_XQA'] = '1'
+# Note that we cannot use os.environ['FORCE_DETERMINISTIC'] = '1' here,
+# since it would disable KV cache reuse and make test_llm_api_draft_target fail.
+
 import random
 import shutil
 import sys
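For context on the static-initialization caveat called out in the test_llm.py comment above, here is a minimal C++ sketch of the pattern, assuming a hypothetical helper (getEnvForceXQA is an illustrative name; the diff only mentions getEnvForceDeterministic, and the real TensorRT-LLM env utils may differ in detail):

    // Hypothetical illustration, not the actual TensorRT-LLM helper.
    // The env var is read once, on the first call, and cached in a
    // function-local static; changing the variable afterwards (e.g. from
    // Python, once `import tensorrt_llm` has loaded the shared library)
    // has no effect on later calls.
    #include <cstdlib>

    bool getEnvForceXQA()
    {
        static bool const forceXQA = (std::getenv("TRTLLM_FORCE_XQA") != nullptr);
        return forceXQA;
    }

Under this pattern, os.environ['TRTLLM_FORCE_XQA'] = '1' takes effect only if it runs before anything triggers the first read, which is why the test sets it at the top of the module, ahead of any tensorrt_llm imports.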