@@ -502,7 +502,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
502
502
std::shared_ptr<runtime::CudaStream> stream, std::optional<SizeType32> maxSequenceLength, SizeType32 maxBeamWidth,
503
503
std::vector<SizeType32> const & maxAttentionWindowVec,
504
504
std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
505
- SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType,
505
+ SizeType32 sinkBubbleLength, CacheType cacheType,
506
506
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
507
507
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
508
508
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
@@ -534,8 +534,8 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
534
534
TLLM_CHECK (allottedPrimaryBlocks > 0 ); // You can't have a model with negative primary blocks...
535
535
mWindowBlockManagers .try_emplace (windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
536
536
sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
537
- onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enablePartialReuse,
538
- copyOnPartialReuse, kvCacheConnectorManager);
537
+ cacheType, secondaryOffloadMinPriority, mEventManager , enablePartialReuse, copyOnPartialReuse ,
538
+ kvCacheConnectorManager);
539
539
}
540
540
541
541
auto const numAllPools = getNumPools ();
@@ -575,15 +575,14 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
575
575
WindowBlockManager::WindowBlockManager (nvinfer1::DataType dtype, SizeType32 windowSize,
576
576
std::vector<SizeType32> const & managedLayers, std::vector<SizeType32> const & numKvHeadsPerLayer,
577
577
SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
578
- SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, CacheType cacheType,
578
+ SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, CacheType cacheType,
579
579
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
580
580
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
581
581
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
582
582
: mDataType {dtype}
583
583
, mWindowSize {windowSize}
584
584
, mNumPrimaryBlocks {blocksInPrimaryPool}
585
585
, mNumSecondaryBlocks {blocksInSecondaryPool}
586
- , mOnboardBlocks (onboardBlocks)
587
586
, mBufferManager {std::move (stream)}
588
587
, mSchedulingNumFreeBlocks {0 }
589
588
, mTokensPerBlock {tokensPerBlock}
@@ -869,9 +868,7 @@ BlockPtr WindowBlockManager::getFreeBlock(
869
868
// 1. Block contains state (evidenced by presence of tokens)
870
869
// 2. Eviction policy indicated block can be offloaded
871
870
// 3. At least one free block in secondary memory
872
- // 4. Onboarding is enabled (allowing block to be brought back into primary)
873
- if (!block->getUniqueTokens ().empty () && canOffload && mEvictionPolicy ->getNumFreeBlocks (kSecondaryLevel ) > 0
874
- && mOnboardBlocks )
871
+ if (!block->getUniqueTokens ().empty () && canOffload && mEvictionPolicy ->getNumFreeBlocks (kSecondaryLevel ) > 0 )
875
872
{
876
873
// If we're swapping a block to secondary memory, maintain the prior priority values.
877
874
mEvictionPolicy ->claimBlock (block);
@@ -936,7 +933,7 @@ void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowS
936
933
937
934
void WindowBlockManager::onboardBlock (BlockPtr const & offloadBlock)
938
935
{
939
- if (mOnboardBlocks && !offloadBlock->isPrimary ())
936
+ if (!offloadBlock->isPrimary ())
940
937
{
941
938
auto block = getFreeBlock ();
942
939
mTransferManager ->onboard (offloadBlock, block, mPools );
@@ -961,7 +958,7 @@ void BlockManager::offloadBlock(BlockPtr const& block, SizeType32 windowSize)
961
958
962
959
void WindowBlockManager::offloadBlock (BlockPtr const & block)
963
960
{
964
- if (mOnboardBlocks && block->isPrimary ())
961
+ if (block->isPrimary ())
965
962
{
966
963
// Offload block in primary memory before repurposing
967
964
auto offloadBlock = std::get<0 >(mEvictionPolicy ->getFreeBlock (kSecondaryLevel ));
@@ -1631,11 +1628,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
1631
1628
SizeType32 maxBeamWidth, std::vector<SizeType32> const & maxAttentionWindowVec,
1632
1629
std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
1633
1630
SizeType32 sinkTokenLength, int64_t stream, std::optional<runtime::SizeType32> maxSequenceLength,
1634
- bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, bool enablePartialReuse, bool copyOnPartialReuse)
1631
+ bool enableBlockReuse, CacheType cacheType, bool enablePartialReuse, bool copyOnPartialReuse)
1635
1632
: KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
1636
1633
maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
1637
1634
std::make_shared<runtime::CudaStream>(reinterpret_cast <cudaStream_t>(stream)), maxSequenceLength,
1638
- enableBlockReuse, onboardBlocks, cacheType, std::nullopt , nullptr , enablePartialReuse, copyOnPartialReuse)
1635
+ enableBlockReuse, cacheType, std::nullopt , nullptr , enablePartialReuse, copyOnPartialReuse)
1639
1636
{
1640
1637
}
1641
1638
@@ -1644,15 +1641,14 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
1644
1641
SizeType32 maxBeamWidth, std::vector<SizeType32> const & maxAttentionWindowVec,
1645
1642
std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
1646
1643
SizeType32 sinkTokenLength, int64_t stream, std::optional<runtime::SizeType32> maxSequenceLength,
1647
- bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
1648
- std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1644
+ bool enableBlockReuse, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1649
1645
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
1650
1646
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
1651
1647
: KVCacheManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth,
1652
1648
maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
1653
1649
std::make_shared<runtime::CudaStream>(reinterpret_cast <cudaStream_t>(stream)), maxSequenceLength,
1654
- enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, eventManager, enablePartialReuse,
1655
- copyOnPartialReuse, kvCacheConnectorManager)
1650
+ enableBlockReuse, cacheType, secondaryOffloadMinPriority, eventManager, enablePartialReuse, copyOnPartialReuse ,
1651
+ kvCacheConnectorManager)
1656
1652
{
1657
1653
}
1658
1654
@@ -1661,8 +1657,7 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
1661
1657
SizeType32 maxBeamWidth, std::vector<SizeType32> const & maxAttentionWindowVec,
1662
1658
std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
1663
1659
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
1664
- bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
1665
- std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1660
+ bool enableBlockReuse, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1666
1661
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
1667
1662
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
1668
1663
: mMaxBeamWidth (maxBeamWidth)
@@ -1673,8 +1668,8 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
1673
1668
, mSinkBlockTokenLength(mSinkBubbleLength + sinkTokenLength)
1674
1669
, mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
1675
1670
std::move (stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype,
1676
- mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager),
1677
- enablePartialReuse, copyOnPartialReuse, std::move(kvCacheConnectorManager))
1671
+ mSinkBubbleLength, cacheType, secondaryOffloadMinPriority, std::move(eventManager), enablePartialReuse ,
1672
+ copyOnPartialReuse, std::move(kvCacheConnectorManager))
1678
1673
// disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case
1679
1674
, mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse}
1680
1675
{
@@ -1696,13 +1691,12 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
1696
1691
SizeType32 maxBeamWidth, std::vector<SizeType32> const & maxAttentionWindowVec,
1697
1692
std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
1698
1693
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
1699
- bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
1700
- std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1694
+ bool enableBlockReuse, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1701
1695
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
1702
1696
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
1703
1697
: KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
1704
1698
maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
1705
- std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority,
1699
+ std::move(stream), maxSequenceLength, enableBlockReuse, cacheType, secondaryOffloadMinPriority,
1706
1700
std::move(eventManager), enablePartialReuse, copyOnPartialReuse, std::move(kvCacheConnectorManager))
1707
1701
{
1708
1702
}
@@ -2272,9 +2266,7 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
2272
2266
= static_cast <SizeType32>(allottedSecondaryMemBytes * windowSizeShare / cacheSizeBytesPerToken);
2273
2267
SizeType32 const blocksInSecondaryPool = std::max (0 , maxTokensSecondary / tokensPerBlock);
2274
2268
TLLM_LOG_DEBUG (
2275
- " Number of blocks in KV cache secondary pool for windowSize %d: %d, onboard blocks to primary memory "
2276
- " before reuse: %s" ,
2277
- windowSize, blocksInSecondaryPool, config.getOnboardBlocks () ? " true" : " false" );
2269
+ " Number of blocks in KV cache secondary pool for windowSize %d: %d" , windowSize, blocksInSecondaryPool);
2278
2270
return blocksInSecondaryPool;
2279
2271
};
2280
2272
0 commit comments