From ec69aa88fd01b8845bdf9261876241bbc3ebde7e Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Tue, 13 May 2025 11:27:36 +0000
Subject: [PATCH 1/4] fix: XQA is not enabled when history_length < kMinHistoryTokensPerBlock. chore: Print the reason if XQA is not used.

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 18 ++++++++++++++++--
 .../decoderXQAImplPrecompiled.cpp           | 14 ++++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
index d06fa9675f1..bea382bf57d 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
@@ -79,7 +79,8 @@ bool DecoderXQAImplJIT::mayHavePerfGain(XQAParams const& xqaParams) const
     if (xqaParams.multi_block_mode)
     {
         int history_length = xqaParams.max_past_kv_length;
-        multi_block_count = history_length / kMinHistoryTokensPerBlock;
+        // Always use at least 1 block regardless of history length
+        multi_block_count = std::max(1, history_length / kMinHistoryTokensPerBlock);
     }
     int block_count = num_kv_heads * batch_size * multi_block_count;
     return static_cast<float>(block_count) * kEnableMinBlockFactor >= static_cast<float>(mRunner->mMultiProcessorCount);
@@ -98,12 +99,25 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo
                 return true;
             }
         }
+        TLLM_LOG_WARNING("JIT XQA is not used: no supported configuration found for any beam_width");
         return false;
     }
     else
     {
         auto const& xqaParams = umbrellaXQAParams;
-        return supportConfig(xqaParams, forConfigurePlugin) && mayHavePerfGain(xqaParams);
+        bool isConfigSupported = supportConfig(xqaParams, forConfigurePlugin);
+        if (!isConfigSupported)
+        {
+            TLLM_LOG_WARNING("JIT XQA is not used: unsupported configuration");
+            return false;
+        }
+        bool hasPerfGain = mayHavePerfGain(xqaParams);
+        if (!hasPerfGain)
+        {
+            TLLM_LOG_WARNING("JIT XQA is not used: no performance gain");
+            return false;
+        }
+        return true;
     }
 }
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
index c24505e60ce..56e16599f42 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
@@ -438,6 +438,7 @@ void DecoderXQAImplPrecompiled::runDispatchBuffer(
 
 #define SUPPORT_RETURN_FALSE(X) \
     { \
+        TLLM_LOG_WARNING("XQA is not used. Reason: %s", X); \
         return false; \
     }
 
@@ -522,8 +523,17 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo
     }
 
     XQAKernelList const* xqa_kernel = getXQAKernels(mRunner->mDataType, tensorrt_llm::common::getSMVersion());
-    return xqa_kernel->supportConfig(xqaParams)
-        && xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount);
+    bool supportConfig = xqa_kernel->supportConfig(xqaParams);
+    if (!supportConfig)
+    {
+        SUPPORT_RETURN_FALSE("supportConfig");
+    }
+    bool mayHavePerfGain = xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount);
+    if (!mayHavePerfGain)
+    {
+        SUPPORT_RETURN_FALSE("mayHavePerfGain");
+    }
+    return true;
 }
 
 #undef SUPPORT_RETURN_FALSE
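
Context for the fix in patch 1: multi_block_count comes from an integer division, so any history_length below kMinHistoryTokensPerBlock used to produce multi_block_count == 0. That zeroed out block_count and made the perf-gain check (block_count * kEnableMinBlockFactor >= multiprocessor count) fail unconditionally, silently disabling XQA for short histories. The standalone C++ sketch below reproduces the clamped heuristic; the constant values and the free-function signature are illustrative placeholders, not the definitions in TensorRT-LLM.

    #include <algorithm>
    #include <cstdio>

    // Placeholder values for illustration only; the real constants live in
    // the TensorRT-LLM sources.
    constexpr int kMinHistoryTokensPerBlock = 512;
    constexpr float kEnableMinBlockFactor = 4.0f;

    bool mayHavePerfGain(int numKvHeads, int batchSize, int historyLength, int smCount)
    {
        // Before the fix: historyLength < kMinHistoryTokensPerBlock made this 0,
        // which forced blockCount to 0 and the comparison below to always fail.
        int multiBlockCount = std::max(1, historyLength / kMinHistoryTokensPerBlock);
        int blockCount = numKvHeads * batchSize * multiBlockCount;
        return static_cast<float>(blockCount) * kEnableMinBlockFactor >= static_cast<float>(smCount);
    }

    int main()
    {
        // With the old formula, historyLength = 100 gave multiBlockCount = 0,
        // so XQA was rejected regardless of batch size or head count.
        std::printf("%d\n", mayHavePerfGain(8, 16, 100, 132)); // prints 1 with the clamp
        return 0;
    }
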
Reason: %s", X); \ return false; \ } @@ -522,8 +523,17 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo } XQAKernelList const* xqa_kernel = getXQAKernels(mRunner->mDataType, tensorrt_llm::common::getSMVersion()); - return xqa_kernel->supportConfig(xqaParams) - && xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount); + bool supportConfig = xqa_kernel->supportConfig(xqaParams); + if (!supportConfig) + { + SUPPORT_RETURN_FALSE("supportConfig"); + } + bool mayHavePerfGain = xqa_kernel->mayHavePerfGain(xqaParams, mRunner->mMultiProcessorCount); + if (!mayHavePerfGain) + { + SUPPORT_RETURN_FALSE("mayHavePerfGain"); + } + return true; } #undef SUPPORT_RETURN_FALSE From 6620b73c9cc17cd64fc5c3b1318d40ffc89d6f93 Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Wed, 14 May 2025 22:36:36 +0800 Subject: [PATCH 2/4] Change log level from warning to debug. Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 6 +++--- .../decoderXQAImplPrecompiled.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index bea382bf57d..a1941672a11 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -99,7 +99,7 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo return true; } } - TLLM_LOG_WARNING("JIT XQA is not used: no supported configuration found for any beam_width"); + TLLM_LOG_DEBUG("JIT XQA is not used: no supported configuration found for any beam_width"); return false; } else @@ -108,13 +108,13 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo bool isConfigSupported = supportConfig(xqaParams, forConfigurePlugin); if (!isConfigSupported) { - TLLM_LOG_WARNING("JIT XQA is not used: unsupported configuration"); + TLLM_LOG_DEBUG("JIT XQA is not used: unsupported configuration"); return false; } bool hasPerfGain = mayHavePerfGain(xqaParams); if (!hasPerfGain) { - TLLM_LOG_WARNING("JIT XQA is not used: no performance gain"); + TLLM_LOG_DEBUG("JIT XQA is not used: no performance gain"); return false; } return true; diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 56e16599f42..bd245fa5b37 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -438,7 +438,7 @@ void DecoderXQAImplPrecompiled::runDispatchBuffer( #define SUPPORT_RETURN_FALSE(X) \ { \ - TLLM_LOG_WARNING("XQA is not used. Reason: %s", X); \ + TLLM_LOG_DEBUG("XQA is not used. Reason: %s", X); \ return false; \ } From eb2eb6e3c1139c7b156bf8c7ae8dff484959112d Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Tue, 10 Jun 2025 11:50:33 +0000 Subject: [PATCH 3/4] Fix the failed test. 
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 .../decoderXQAImplJIT/decoderXQAImplJIT.cpp | 2 +-
 tests/unittest/llmapi/test_llm.py           | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
index a1941672a11..ecb57ea1ffa 100644
--- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
+++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
@@ -114,7 +114,7 @@ bool DecoderXQAImplJIT::shouldUse(XQAParams const& umbrellaXQAParams, bool forCo
         bool hasPerfGain = mayHavePerfGain(xqaParams);
         if (!hasPerfGain)
         {
-            TLLM_LOG_DEBUG("JIT XQA is not used: no performance gain");
+            TLLM_LOG_DEBUG("JIT XQA is not used: maybe no performance gain");
             return false;
         }
         return true;

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index bb44ba57d45..57e560eb313 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -3,6 +3,14 @@
 import gc
 import json
 import os
+
+# Required for test_generate_with_seed to pass.
+# See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
+# Note that currently functions like getEnvForceDeterministic are implemented using static variables,
+# which means they are initialized only once per C++ translation unit (and should be refactored to be non-static later).
+# Therefore, the following line must come before any tensorrt_llm imports.
+os.environ['FORCE_DETERMINISTIC'] = '1'
+
 import random
 import shutil
 import sys

From f0392bd9903fd8443245181d12ffd98b4d417d54 Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Tue, 10 Jun 2025 15:49:18 +0000
Subject: [PATCH 4/4] Need to use TRTLLM_FORCE_XQA.

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 tests/unittest/llmapi/test_llm.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 57e560eb313..6ebf12c0a3f 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -6,10 +6,12 @@
 
 # Required for test_generate_with_seed to pass.
 # See the discussion in https://github.com/NVIDIA/TensorRT-LLM/pull/4264#issuecomment-2943269891
-# Note that currently functions like getEnvForceDeterministic are implemented using static variables,
-# which means they are initialized only once per C++ translation unit (and should be refactored to be non-static later).
-# Therefore, the following line must come before any tensorrt_llm imports.
-os.environ['FORCE_DETERMINISTIC'] = '1'
+# The following line must come before any tensorrt_llm imports,
+# since env util functions like getEnvForceDeterministic are currently implemented using static variables,
+# which means they are initialized only once, when the C++ translation unit is loaded (they should be refactored to be non-static later).
+os.environ['TRTLLM_FORCE_XQA'] = '1'
+# Note that we cannot use os.environ['FORCE_DETERMINISTIC'] = '1' here,
+# since it will disable KV cache reuse and make test_llm_api_draft_target fail.
 
 import random
 import shutil
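
A note on why patches 3 and 4 set the environment variable at the very top of test_llm.py: as the test comment states, the C++ env util functions cache their result in a static variable, so whatever value is observed at first use is frozen for the rest of the process, and setting the variable after tensorrt_llm has been imported (and the C++ extension loaded) has no effect. A minimal sketch of that caching pattern, using a hypothetical getEnvForceXQA helper modeled on the getEnvForceDeterministic mentioned in the comment, not the actual library code:

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical helper illustrating the static-caching pattern; the real
    // TensorRT-LLM functions differ in name and details.
    bool getEnvForceXQA()
    {
        // The function-local static is initialized exactly once, on the first
        // call; later changes to the environment are never observed.
        static bool const value = std::getenv("TRTLLM_FORCE_XQA") != nullptr;
        return value;
    }

    int main()
    {
        std::printf("%d\n", getEnvForceXQA()); // first call freezes the value
        setenv("TRTLLM_FORCE_XQA", "1", 1);    // POSIX setenv; too late to matter
        std::printf("%d\n", getEnvForceXQA()); // still prints the first result
        return 0;
    }
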