@@ -2450,12 +2450,40 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
2450
2450
};
2451
2451
2452
2452
std::map<SizeType32, float > windowSizeToShare;
2453
- // NOTE: Righteously, blocks allocated should be proportional with
2454
- // regard to window size. Currently, we are first allocating identical
2455
- // number of blocks for all layers to achieve identical performance.
2456
- for (auto const & [windowSize, _] : windowSizeToLayers)
2453
+ if (auto envStr = std::getenv (" TRTLLM_WINDOW_SIZE_SHARES" ))
2457
2454
{
2458
- windowSizeToShare[windowSize] = 1 .0f / windowSizeToLayers.size ();
2455
+ float const fraction = windowSizeSum / windowSizesTotalSum;
2456
+ TLLM_CHECK (0 .0f < fraction && fraction <= 1 .0f );
2457
+ windowSizeToShare[windowSize] = fraction;
2458
+ std::stringstream ss (envStr);
2459
+ std::vector<float > shares;
2460
+ float share;
2461
+ while (ss >> share)
2462
+ {
2463
+ shares.push_back (share);
2464
+ if (ss.peek () == ' ,' )
2465
+ ss.ignore ();
2466
+ }
2467
+
2468
+ TLLM_CHECK_WITH_INFO (shares.size () == windowSizeToLayers.size (),
2469
+ " Number of shares in TRTLLM_WINDOW_SIZE_SHARES (%ld) must match number of window sizes (%ld)" ,
2470
+ shares.size (), windowSizeToLayers.size ());
2471
+
2472
+ size_t i = 0 ;
2473
+ for (auto const & [windowSize, _] : windowSizeToLayers)
2474
+ {
2475
+ windowSizeToShare[windowSize] = shares[i++];
2476
+ }
2477
+ }
2478
+ else
2479
+ {
2480
+ // NOTE: Righteously, blocks allocated should be proportional with
2481
+ // regard to window size. Currently, we are first allocating identical
2482
+ // number of blocks for all layers to achieve identical performance.
2483
+ for (auto const & [windowSize, _] : windowSizeToLayers)
2484
+ {
2485
+ windowSizeToShare[windowSize] = 1 .0f / windowSizeToLayers.size ();
2486
+ }
2459
2487
}
2460
2488
2461
2489
std::vector<SizeType32> blocksPrimary;
0 commit comments