Commit 295dbf3

[kv cache manager] Expose knob TRTLLM_WINDOW_SIZE_SHARES to adjust the share of KV-cache memory allotted to each attention window size
Usage example: export TRTLLM_WINDOW_SIZE_SHARES=0.4,0.6
Signed-off-by: eopXD <[email protected]>
1 parent 259cc66 commit 295dbf3


cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 41 additions & 5 deletions
@@ -2450,12 +2450,48 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfig
     };
 
     std::map<SizeType32, float> windowSizeToShare;
-    // NOTE: Righteously, blocks allocated should be proportional with
-    // regard to window size. Currently, we are first allocating identical
-    // number of blocks for all layers to achieve identical performance.
-    for (auto const& [windowSize, _] : windowSizeToLayers)
+    if (auto envStr = std::getenv("TRTLLM_WINDOW_SIZE_SHARES"))
     {
-        windowSizeToShare[windowSize] = 1.0f / windowSizeToLayers.size();
+        std::stringstream ss(envStr);
+        std::vector<float> shares;
+        float share;
+        while (ss >> share)
+        {
+            shares.push_back(share);
+            if (ss.peek() == ',')
+                ss.ignore();
+        }
+
+        TLLM_CHECK_WITH_INFO(shares.size() == windowSizeToLayers.size(),
+            "Number of shares in TRTLLM_WINDOW_SIZE_SHARES (%ld) must match number of window sizes (%ld)",
+            shares.size(), windowSizeToLayers.size());
+        float sumShares = 0.0f;
+        for (auto s : shares)
+        {
+            TLLM_CHECK_WITH_INFO(0.0f <= s && s <= 1.0f, "Shares must be in value range [0,1], got %f", s);
+            sumShares += s;
+        }
+        TLLM_CHECK_WITH_INFO(sumShares > 0.0f, "Sum of shares must be > 0.");
+        // Normalize shares to 1.0
+        for (auto& s : shares)
+        {
+            s /= sumShares;
+        }
+        size_t i = 0;
+        for (auto const& [windowSize, _] : windowSizeToLayers)
+        {
+            windowSizeToShare[windowSize] = shares[i++];
+        }
+    }
+    else
+    {
+        // NOTE: Righteously, blocks allocated should be proportional with
+        // regard to window size. Currently, we are first allocating identical
+        // number of blocks for all layers to achieve identical performance.
+        for (auto const& [windowSize, _] : windowSizeToLayers)
+        {
+            windowSizeToShare[windowSize] = 1.0f / windowSizeToLayers.size();
+        }
     }
 
     std::vector<SizeType32> blocksPrimary;
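For readers who want to try the parsing and normalization outside the library, below is a minimal standalone sketch of the same flow. It is an illustration, not the TensorRT-LLM source: the window sizes {512, 4096} are invented, plain assert() stands in for TLLM_CHECK_WITH_INFO, and the real code derives the window sizes from windowSizeToLayers rather than a hard-coded list.

// Standalone sketch of the TRTLLM_WINDOW_SIZE_SHARES parsing/normalization.
// Window sizes and error handling are assumptions for illustration only.
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <sstream>
#include <vector>

int main()
{
    // Hypothetical window sizes; in the real code these are the keys of
    // windowSizeToLayers (window size -> layers using that window).
    std::vector<int> windowSizes = {512, 4096};
    std::map<int, float> windowSizeToShare;

    std::vector<float> shares;
    if (auto envStr = std::getenv("TRTLLM_WINDOW_SIZE_SHARES"))
    {
        // Parse a comma-separated list such as "0.4,0.6".
        std::stringstream ss(envStr);
        float share;
        while (ss >> share)
        {
            shares.push_back(share);
            if (ss.peek() == ',')
                ss.ignore();
        }
        // One share per window size, each in [0, 1], sum strictly positive.
        assert(shares.size() == windowSizes.size());
        float sumShares = 0.0f;
        for (auto s : shares)
        {
            assert(0.0f <= s && s <= 1.0f);
            sumShares += s;
        }
        assert(sumShares > 0.0f);
        // Normalize so the shares sum to 1.0.
        for (auto& s : shares)
            s /= sumShares;
    }
    else
    {
        // Default: identical share for every window size.
        shares.assign(windowSizes.size(), 1.0f / windowSizes.size());
    }

    size_t i = 0;
    for (auto windowSize : windowSizes)
        windowSizeToShare[windowSize] = shares[i++];

    for (auto const& [windowSize, share] : windowSizeToShare)
        std::printf("window size %d -> share %.2f\n", windowSize, share);
    return 0;
}

Running this with export TRTLLM_WINDOW_SIZE_SHARES=0.4,0.6 assigns a 0.40 share to the first window size and 0.60 to the second; values such as 0.2,0.3 are also accepted and normalized to the same 0.4/0.6 split. Without the variable set, every window size receives an identical share, matching the previous behavior that the patch keeps in the else branch. Assuming windowSizeToLayers is an ordered map, the shares pair with window sizes in ascending window-size order.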
