Skip to content

Commit 54a398d

Browse files
committed
[kv cache manager] Expose the TRTLLM_WINDOW_SIZE_SHARES environment variable to adjust the proportion of memory shared among window sizes
Usage example: export TRTLLM_WINDOW_SIZE_SHARES=0.4,0.6 (comma-separated, one share per window size). Signed-off-by: eopXD <[email protected]>
1 parent 3598e9f commit 54a398d

File tree

1 file changed

+33
-5
lines changed

1 file changed

+33
-5
lines changed

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2450,12 +2450,40 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
24502450
};
24512451

24522452
std::map<SizeType32, float> windowSizeToShare;
2453-
// NOTE: Righteously, blocks allocated should be proportional with
2454-
// regard to window size. Currently, we are first allocating identical
2455-
// number of blocks for all layers to achieve identical performance.
2456-
for (auto const& [windowSize, _] : windowSizeToLayers)
2453+
if (auto envStr = std::getenv("TRTLLM_WINDOW_SIZE_SHARES"))
24572454
{
2458-
windowSizeToShare[windowSize] = 1.0f / windowSizeToLayers.size();
2455+
float const fraction = windowSizeSum / windowSizesTotalSum;
2456+
TLLM_CHECK(0.0f < fraction && fraction <= 1.0f);
2457+
windowSizeToShare[windowSize] = fraction;
2458+
std::stringstream ss(envStr);
2459+
std::vector<float> shares;
2460+
float share;
2461+
while (ss >> share)
2462+
{
2463+
shares.push_back(share);
2464+
if (ss.peek() == ',')
2465+
ss.ignore();
2466+
}
2467+
2468+
TLLM_CHECK_WITH_INFO(shares.size() == windowSizeToLayers.size(),
2469+
"Number of shares in TRTLLM_WINDOW_SIZE_SHARES (%ld) must match number of window sizes (%ld)",
2470+
shares.size(), windowSizeToLayers.size());
2471+
2472+
size_t i = 0;
2473+
for (auto const& [windowSize, _] : windowSizeToLayers)
2474+
{
2475+
windowSizeToShare[windowSize] = shares[i++];
2476+
}
2477+
}
2478+
else
2479+
{
2480+
// NOTE: Righteously, blocks allocated should be proportional with
2481+
// regard to window size. Currently, we are first allocating identical
2482+
// number of blocks for all layers to achieve identical performance.
2483+
for (auto const& [windowSize, _] : windowSizeToLayers)
2484+
{
2485+
windowSizeToShare[windowSize] = 1.0f / windowSizeToLayers.size();
2486+
}
24592487
}
24602488

24612489
std::vector<SizeType32> blocksPrimary;

0 commit comments

Comments
 (0)