@@ -130,14 +130,17 @@ struct WindowSizeMetadata
130
130
SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence.
131
131
// Only needed when chunked context + sliding window attention are used
132
132
// together. And it should only be considered when allocating blocks.
133
+ SizeType32 windowSize;
134
+ bool isSWA;
133
135
134
136
std::string toString ()
135
137
{
136
138
return tensorrt_llm::common::fmtstr (
137
139
" WindowSizeMetadata{ .allottedPrimaryBlocks=%d, .allottedSecondaryBlocks=%d, .absolutePoolsOffset=%d, "
138
- " .numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d }" ,
140
+ " .numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d, "
141
+ " .windowSize=%d, .isSWA=%d }" ,
139
142
allottedPrimaryBlocks, allottedSecondaryBlocks, absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq,
140
- maxNumBlocks, temporaryAttentionWindow);
143
+ maxNumBlocks, temporaryAttentionWindow, windowSize, isSWA );
141
144
}
142
145
};
143
146
@@ -512,6 +515,8 @@ class GenerationRequest
512
515
executor::KvCacheRetentionConfig mKvCacheRetentionConfig ;
513
516
// Number of front blocks removed from the sequence
514
517
SizeType32 mNumFrontBlocksRemoved ;
518
+ // Set of blocks used by the sequence
519
+ std::set<KVCacheBlock::IdType> mUsedBlocks ;
515
520
};
516
521
517
522
// attach metadata to a pool pointer
@@ -763,7 +768,7 @@ class WindowBlockManager
763
768
764
769
// ! \brief Bring offloaded block from secondary to primary memory.
765
770
// ! \details Does nothing if block is already in primary memory.
766
- void onboardBlock (BlockPtr const & offloadBlock,
771
+ void onboardBlock (GenerationRequest& sequence, BlockPtr const & offloadBlock,
767
772
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const & directory = " " );
768
773
769
774
// ! \brief Bring block from primary to secondary memory.
@@ -826,6 +831,23 @@ class WindowBlockManager
826
831
// ! \brief Unpin blocks by starting from a block id and walking prev pointers.
827
832
void unpinBlocksById (KVCacheBlock::IdType blockId);
828
833
834
+ void initializeSequenceStorageValidity (LlmRequest::RequestIdType requestId)
835
+ {
836
+ mIsValidStoreForReuseSequence [requestId] = true ;
837
+ }
838
+
839
+ void releaseSequenceStorageValidity (LlmRequest::RequestIdType requestId)
840
+ {
841
+ mIsValidStoreForReuseSequence .erase (requestId);
842
+ }
843
+
844
+ // ! \brief Return whether this sequence is valid for store for reuse
845
+ [[nodiscard]] bool isSequenceValidForStoreForReuse (LlmRequest::RequestIdType requestId) const
846
+ {
847
+ TLLM_CHECK_WITH_INFO (mIsValidStoreForReuseSequence .count (requestId) > 0 , " Sequence should be bookkeeped" );
848
+ return mIsValidStoreForReuseSequence .at (requestId);
849
+ }
850
+
829
851
private:
830
852
// ! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
831
853
void addBlockToBeam (BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -846,7 +868,8 @@ class WindowBlockManager
846
868
std::optional<std::chrono::milliseconds> durationMs);
847
869
848
870
// ! \brief Find block least likely to be reused, free it if necessary and return.
849
- [[nodiscard]] BlockPtr getFreeBlock (
871
+ // ! \param sequence Sequence which the free block is allocated for
872
+ [[nodiscard]] BlockPtr getFreeBlock (GenerationRequest& sequence,
850
873
executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority ,
851
874
std::optional<std::chrono::milliseconds> durationMs = std::nullopt ,
852
875
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const & directory = " " );
@@ -933,6 +956,14 @@ class WindowBlockManager
933
956
934
957
// Mutex for the cached blocks root
935
958
std::mutex mCachedBlocksRootMutex ;
959
+
960
+ // Record which sequence is using the block
961
+ std::map<KVCacheBlock::IdType, LlmRequest::RequestIdType> mBlockToSequence ;
962
+ // Record whether a sequence has all blocks held valid.
963
+ // The boolean value is set to true upon first encounter of a new sequence.
964
+ // It may be invalidated to false when a different sequence acquires a block that
965
+ // this sequence is using.
966
+ std::map<LlmRequest::RequestIdType, bool > mIsValidStoreForReuseSequence ;
936
967
};
937
968
938
969
class BlockManager
@@ -1008,7 +1039,7 @@ class BlockManager
1008
1039
1009
1040
// ! \brief Bring offloaded block from secondary to primary memory for window size.
1010
1041
// ! \details Does nothing if block is already in primary memory.
1011
- void onboardBlock (BlockPtr const & offloadBlock, SizeType32 windowSize,
1042
+ void onboardBlock (GenerationRequest& sequence, BlockPtr const & offloadBlock, SizeType32 windowSize,
1012
1043
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const & directory = " " );
1013
1044
1014
1045
// ! \brief Bring block from primary to secondary memory for window size.
@@ -1244,6 +1275,48 @@ class BlockManager
1244
1275
// ! there may be more than one context block that goes OOW.
1245
1276
void adjustBlocksIfNeeded (GenerationRequest& sequence, bool isEnableBlockReuse);
1246
1277
1278
+ // ! \brief Return whether the sequence is already managed by the block manager
1279
+ [[nodiscard]] bool isSequenceHeld (LlmRequest::RequestIdType requestId) const
1280
+ {
1281
+ return mManagedSequences .count (requestId) > 0 ;
1282
+ }
1283
+
1284
+ // ! \brief Add a sequence to the managed sequences
1285
+ // ! \details Take the sequence into account for the manager. Initialize
1286
+ // ! sequence storage validity under all window sizes.
1287
+ void holdSequence (LlmRequest::RequestIdType requestId)
1288
+ {
1289
+ mManagedSequences .insert (requestId);
1290
+ for (auto const & [windowSize, metadata] : mWindowSizeToMetadata )
1291
+ {
1292
+ mWindowBlockManagers .at (windowSize).initializeSequenceStorageValidity (requestId);
1293
+ }
1294
+ }
1295
+
1296
+ // ! \brief Remove a sequence from the managed sequences.
1297
+ // ! \details Remove sequence from the managed sequences and remove sequence
1298
+ // ! storage
1299
+ void releaseSequence (LlmRequest::RequestIdType requestId)
1300
+ {
1301
+ mManagedSequences .erase (requestId);
1302
+ for (auto const & [windowSize, metadata] : mWindowSizeToMetadata )
1303
+ {
1304
+ mWindowBlockManagers .at (windowSize).releaseSequenceStorageValidity (requestId);
1305
+ }
1306
+ }
1307
+
1308
+ // ! \brief Return whether the sequence is still valid for store-for-reuse
1309
+ // ! regarding the specific window size.
1310
+ // ! \details Currently this utility function is only used under
1311
+ // ! kvCacheManagerTest.cpp. Checking for store-for-reuse for each window
1312
+ // ! size is done in an iterating fashion under BlockManager::releaseBlocks.
1313
+ bool isSequenceValidForStoreForReuse (LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
1314
+ {
1315
+ TLLM_CHECK_WITH_INFO (
1316
+ mWindowBlockManagers .count (windowSize) > 0 , " Querying window size is not found under mWindowBlockManager" );
1317
+ return mWindowBlockManagers .at (windowSize).isSequenceValidForStoreForReuse (requestId);
1318
+ }
1319
+
1247
1320
private:
1248
1321
[[nodiscard]] WindowBlockManager const & windowManagerByLayer (SizeType32 layerIdx) const
1249
1322
{
@@ -1278,6 +1351,8 @@ class BlockManager
1278
1351
std::vector<SizeType32> mLayerToWindowSize ;
1279
1352
std::vector<SizeType32> mAbsolutePoolToWindowSize ;
1280
1353
std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex ;
1354
+ // Record what sequences are currently managed by the block manager
1355
+ std::set<LlmRequest::RequestIdType> mManagedSequences ;
1281
1356
};
1282
1357
1283
1358
struct OffsetTableDimensions
0 commit comments