-
Notifications
You must be signed in to change notification settings - Fork 1.8k
[TRTLLM-6371][feat] Restructure C++ KVCacheManager to better handle limited attention layers #7510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 21 commits
9cc92f6
28d6dd1
8a06a28
f30fa42
9d8c420
6ceed6b
64f4504
5eaf9d5
800b05d
21643e3
e0499cf
4840a98
8647828
91445c5
76f317e
ca48af9
3399798
78bcf4d
8836822
86ed0f8
42fe938
cef3334
ad2aa8f
cfe0609
02abe39
f025cfb
735fe3c
a678b91
787b681
50a6f3c
a487b55
78c6253
bb421b9
f3e5c13
d427948
d57c3b4
3f696f6
c8177d3
3802864
e3a1921
24dfa89
61f9702
984a4d2
23c28fb
1014a3e
a8ad970
39e81e1
64ef5f3
fd364d9
3f22026
9367bbe
98c0d29
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -55,15 +55,21 @@ static constexpr SizeType32 kPrimaryLevel = 0; | |||||
| static constexpr SizeType32 kSecondaryLevel = 1; | ||||||
|
|
||||||
| class KVCacheBlock; | ||||||
| class KVCachePromptLookupNode; | ||||||
| class KVCachePromptLookup; | ||||||
| class BlockManager; | ||||||
| class KVCacheManager; | ||||||
| class KVCacheTransferManager; | ||||||
| class WindowBlockManager; | ||||||
| class GenerationRequest; | ||||||
|
|
||||||
| using SizeType32 = tensorrt_llm::runtime::SizeType32; | ||||||
| using TokenIdType = tensorrt_llm::runtime::TokenIdType; | ||||||
| using VecTokens = std::vector<TokenIdType>; | ||||||
| using BeamTokens = std::vector<VecTokens>; | ||||||
| using BlockPtr = std::shared_ptr<KVCacheBlock>; | ||||||
| using LookupNodePtr = std::shared_ptr<KVCachePromptLookupNode>; | ||||||
| using LookupPtr = std::shared_ptr<KVCachePromptLookup>; | ||||||
| using FreeBlocksQueue = std::list<BlockPtr>; | ||||||
| using UniqueToken = tensorrt_llm::runtime::UniqueToken; | ||||||
| using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens; | ||||||
|
|
@@ -96,6 +102,7 @@ struct WindowSizeMetadata | |||||
| // Only needed when chunked context + sliding window attention are used | ||||||
| // together. And it should only be considered when allocating blocks. | ||||||
|
|
||||||
|
|
||||||
thorjohnsen marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| std::string toString() | ||||||
| { | ||||||
| return tensorrt_llm::common::fmtstr( | ||||||
|
|
@@ -169,7 +176,7 @@ struct BlockKeyHasher | |||||
| } | ||||||
| }; | ||||||
|
|
||||||
| using NextBlockMap = std::unordered_map<BlockKey, BlockPtr, BlockKeyHasher>; | ||||||
| using NextNodeMap = std::unordered_map<BlockKey, LookupNodePtr, BlockKeyHasher>; | ||||||
|
|
||||||
| struct KvCacheStats | ||||||
| { | ||||||
|
|
@@ -197,6 +204,114 @@ struct KvCacheStats | |||||
| std::size_t allocatedBytes{}; | ||||||
| }; | ||||||
|
|
||||||
| using LookupResult = std::vector<std::tuple<bool,SizeType32,LookupNodePtr>>; | ||||||
|
|
||||||
| // Vector of LookupResult, one for each BlockKey used during search. | ||||||
| // If no match was found, vector will be empty. | ||||||
| // If an exact match was found, vector will have one item. | ||||||
| // If partial matching is enabled and no exact match was found, | ||||||
| // vector will list all nodes with at least one matching token. | ||||||
| // Partially matching nodes are sorted in descending order of number of matching tokens. | ||||||
| using LookupResults = std::vector<LookupResult>; | ||||||
|
|
||||||
| // Implement an object that represents a given prompt prefix in search structure. | ||||||
| // The node contains pointers to all reusable state for the prompt prefix. | ||||||
| class KVCachePromptLookupNode | ||||||
| { | ||||||
| public: | ||||||
| explicit KVCachePromptLookupNode(BlockKey const& blockKey, bool isFull); | ||||||
|
|
||||||
| void setBlockKey(BlockKey const& blockKey, bool isFull); | ||||||
|
|
||||||
| BlockKey getBlockKey() const; | ||||||
|
|
||||||
| [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const; | ||||||
|
|
||||||
| LookupNodePtr const& getPrevNode() const; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| void setPrevNode(LookupNodePtr prevNode); | ||||||
|
|
||||||
| [[nodiscard]] NextNodeMap getNextNodes() const; | ||||||
|
|
||||||
| void addNextNode(BlockKey const& blockKey, LookupNodePtr block); | ||||||
|
|
||||||
| void removeNextNode(BlockKey const& blockKey); | ||||||
|
|
||||||
| //! \brief Find nodes matching blockKey. If enablePartialReuse is true, a returned node may match only a prefix of | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, these are unrelated, but I will admit they sound (very) related. allowPartial allows blocks that are not completely full to be considered for reuse as long as they are a perfect match for the search blockKey. enablePartialReuse allows blocks where some but not all tokens match the search blockKey to be reused, usually by copying the matching tokens into a new block. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for the explanation. If so, I recommend aligning the comment here to [suggested wording lost in extraction]. The ambiguity between "allow partially filled blocks (to be saved for reuse)" and "enable partial reuse of a saved block" should be clarified somewhere. |
||||||
| //! blockKey. | ||||||
| //! @return vector of tuples [partialMatch, numMatched, node], partialMatch is true if not all the tokens of the node were | ||||||
| //! matched. | ||||||
| [[nodiscard]] LookupResult findMatchingNodes( | ||||||
| BlockKey const& blockKey, bool enablePartialReuse) const; | ||||||
|
|
||||||
| void setBlock(SizeType32 windowSize, BlockPtr block); | ||||||
|
|
||||||
| [[nodiscard]] BlockPtr getBlock(SizeType32 windowSize) const; | ||||||
|
|
||||||
| [[nodiscard]] bool hasBlocks() const; | ||||||
|
|
||||||
| [[nodiscard]] bool isFull() const; | ||||||
|
|
||||||
| [[nodiscard]] bool isLeaf() const; | ||||||
|
|
||||||
| private: | ||||||
| // Key of this node in mNextNodes map of the node pointed to by mPrevNode | ||||||
| BlockKey mBlockKey; | ||||||
| // Flag indicating if block is full | ||||||
| bool mIsFull; | ||||||
| // Previous node in search structure | ||||||
| LookupNodePtr mPrevNode; | ||||||
| // Next node(s) in sequence(s) | ||||||
| NextNodeMap mNextNodes; | ||||||
| // Pointers to blocks holding KV state for this prompt prefix | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reading through this, SizeType32 is a meaningless type-definition in my opinion. I would rather have an extra layer of [inline code lost in extraction] to make it explicit that the object is a mapping of window size to block pointers. |
||||||
| std::unordered_map<SizeType32, BlockPtr> mBlocks; | ||||||
| }; | ||||||
|
|
||||||
| class KVCachePromptLookup | ||||||
| { | ||||||
| public: | ||||||
| explicit KVCachePromptLookup(CacheType cacheType, SizeType32 tokensPerBlock); | ||||||
|
|
||||||
| [[nodiscard]] std::vector<BlockKey> getBlockKeys(LlmRequest const& llmRequest, SizeType32 inputLength, bool allowPartiallyFilledBlock) const; | ||||||
|
|
||||||
| //! \brief Find first new context block for each window block manager. | ||||||
| //! \param llmRequest The new request. | ||||||
| //! \param inputLength Number of useful prompt tokens. If zero, length of prompt minus 1 is used. | ||||||
| //! \param allowPartiallyFilledBlock Allow matching of blocks that are not full. | ||||||
| //! \param windowSizes Window sizes to search. Method will search for a new context block for each window size. | ||||||
thorjohnsen marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| //! \return map of BlockKey vs windowSize. The block key is that of first new context block for that window size. | ||||||
| [[nodiscard]] std::unordered_map<SizeType32,BlockKey> findNewContextBlock(LlmRequest const& llmRequest, SizeType32 inputLength, bool allowPartiallyFilledBlock, std::vector<SizeType32> const& windowSizes) const; | ||||||
|
|
||||||
| //! \brief Find matching nodes for a given prompt prefix | ||||||
| //! \param allowPartiallyFilledBlock Allow last block in prompt to have less than tokensPerBlock tokens. | ||||||
| //! \param enablePartialReuse Allow matching tokens to be copied from block that does not match entire prompt. | ||||||
thorjohnsen marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| [[nodiscard]] LookupResults lookup(LlmRequest const & llmRequest, SizeType32 inputLength, bool allowPartiallyFilledBlock, bool enablePartialReuse, bool createNodes); | ||||||
|
|
||||||
| //! \brief Find matching blocks for a given prompt prefix for all window sizes. | ||||||
| //! \return map of matching blocks vs window size. Matching blocks is a vector of varying size. | ||||||
| std::unordered_map<SizeType32,std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>>> lookupBlocks( | ||||||
| std::map<SizeType32,WindowBlockManager> const& windowBlockManagers, | ||||||
| LlmRequest const& llmRequest, SizeType32 inputLength, | ||||||
| bool allowPartiallyFilledBlock, bool enablePartialReuse); | ||||||
|
|
||||||
| // Debugging functions | ||||||
| // | ||||||
| std::string printNode(LookupResult const& match); | ||||||
| std::string printNodes(LookupResults const& matches); | ||||||
| std::string printMatchedBlock(std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr> const& match); | ||||||
| std::string printMatchedBlocks(std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>> const& matches); | ||||||
| std::string printMatchedBlocks(std::unordered_map<SizeType32,std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>>> const& matches); | ||||||
| std::string printPrompt(LlmRequest const& llmRequest); | ||||||
|
|
||||||
| private: | ||||||
| // Root of search structure | ||||||
| LookupNodePtr mRoot; | ||||||
| // KV cache type (self or cross) | ||||||
| CacheType mCacheType; | ||||||
| // Number of tokens per one block | ||||||
| SizeType32 mTokensPerBlock; | ||||||
| }; | ||||||
|
|
||||||
| // Basic building block of a paged KV cache - a single | ||||||
| // cache block. This class just holds metadata, no pointers | ||||||
| // since it is reused across all layers. | ||||||
|
|
@@ -207,14 +322,12 @@ class KVCacheBlock | |||||
|
|
||||||
| static constexpr IdType kCachedBlocksRootId = -1; | ||||||
|
|
||||||
| explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx); | ||||||
| explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx, SizeType32 windowSize); | ||||||
|
|
||||||
| void startScheduling(); | ||||||
|
|
||||||
| [[nodiscard]] IdType getBlockId() const; | ||||||
|
|
||||||
| [[nodiscard]] NextBlockMap getNextBlocks() const; | ||||||
|
|
||||||
| [[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const; | ||||||
|
|
||||||
| [[nodiscard]] bool isPrimary() const; | ||||||
|
|
@@ -231,40 +344,22 @@ class KVCacheBlock | |||||
|
|
||||||
| [[nodiscard]] bool hasSchedulingRefs() const; | ||||||
|
|
||||||
| // This info is duplicated in KVCacheBlock and KVCachePromptLookupNode | ||||||
| // because it is needed by the former when KVCacheBlock might not be stored | ||||||
| // in lookup structure and therefore cannot get this value from there | ||||||
| void setBlockKey(BlockKey const& blockKey, bool isFull); | ||||||
|
|
||||||
| BlockKey getBlockKey(); | ||||||
|
|
||||||
| BlockKey getBlockKey() const; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
| [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const; | ||||||
|
|
||||||
| BlockPtr const& getPrevBlock() const; | ||||||
|
|
||||||
| void setPrevBlock(BlockPtr prevBlock); | ||||||
|
|
||||||
| BlockPtr const& getPrevBlockInSeq() const; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
|
|
||||||
| void setPrevBlockInSeq(BlockPtr prevBlock); | ||||||
|
|
||||||
| void addNextBlock(BlockKey const& blockKey, BlockPtr block); | ||||||
|
|
||||||
| void removeNextBlock(BlockKey const& blockKey); | ||||||
|
|
||||||
| //! \brief Find block matching blockKey. If allowPartial is true, the returned block may match only a prefix of | ||||||
| //! blockKey. | ||||||
| //! @return tuple of [partialMatch, numMatched, block], partialMatch is true if not all the tokens of the block were | ||||||
| //! matched. | ||||||
| [[nodiscard]] std::tuple<bool, SizeType32, BlockPtr> findMatchingBlock( | ||||||
| BlockKey const& blockKey, bool enablePartialReuse, bool copyOnPartialReuse) const; | ||||||
|
|
||||||
| //! \brief Free block from previous block if present. | ||||||
| void freeLeafBlock(); | ||||||
| BlockPtr getPrevBlock() const; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
|
|
||||||
| [[nodiscard]] bool isFull() const; | ||||||
|
|
||||||
| [[nodiscard]] bool isShared() const; | ||||||
|
|
||||||
| [[nodiscard]] bool isLeaf() const; | ||||||
|
|
||||||
| void setPriority(executor::RetentionPriority priority); | ||||||
|
|
||||||
| [[nodiscard]] executor::RetentionPriority getPriority() const; | ||||||
|
|
@@ -284,6 +379,12 @@ class KVCacheBlock | |||||
|
|
||||||
| size_t getHash() const; | ||||||
|
|
||||||
| // set lookup node using this block | ||||||
| void setLookupNode(LookupNodePtr node, BlockPtr block); | ||||||
|
|
||||||
| // get lookup node using this block. Can be nullptr | ||||||
| [[nodiscard]] LookupNodePtr getLookupNode() const; | ||||||
|
|
||||||
| private: | ||||||
| // Linear ID of block independent of pool | ||||||
| IdType mBlockId; | ||||||
|
|
@@ -301,15 +402,9 @@ class KVCacheBlock | |||||
| // Key of this block in mNextBlocks map in block pointed to by mPrevBlock | ||||||
| BlockKey mBlockKey; | ||||||
|
|
||||||
| // Previous block in reuse tree, or nullptr if not reusing | ||||||
| BlockPtr mPrevBlock; | ||||||
|
|
||||||
| // Previous block in sequence, == nullptr for first block, == mPrevBlock if reusing and not first | ||||||
| BlockPtr mPrevBlockInSeq; | ||||||
|
|
||||||
| // Next block(s) in sequence(s) | ||||||
| NextBlockMap mNextBlocks; | ||||||
|
|
||||||
| // Iterator pointing to this block in mFreeBlocks. | ||||||
| std::optional<FreeBlocksQueue::iterator> mFreeBlockIterator; | ||||||
|
|
||||||
|
|
@@ -324,6 +419,11 @@ class KVCacheBlock | |||||
| std::optional<std::chrono::steady_clock::time_point::duration> mExpirationTime; | ||||||
| // Hash for the event manager | ||||||
| size_t mHash; | ||||||
|
|
||||||
| // Pointer to search tree lookup node using this block | ||||||
| LookupNodePtr mLookupNode; | ||||||
| // Window size using this block (0 if not in use) | ||||||
| SizeType32 mWindowSize; | ||||||
| }; | ||||||
|
|
||||||
| class GenerationRequest | ||||||
|
|
@@ -538,7 +638,7 @@ class WindowBlockManager | |||||
| SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, | ||||||
| bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority, | ||||||
| std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse, | ||||||
| std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager); | ||||||
| std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager, bool isSWA); | ||||||
|
|
||||||
| ~WindowBlockManager(); | ||||||
|
|
||||||
|
|
@@ -550,7 +650,7 @@ class WindowBlockManager | |||||
|
|
||||||
| //! \brief Assign blocks for new sequence. Try to reuse blocks. | ||||||
| void addSequence( | ||||||
| GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest); | ||||||
| GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest, std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>> const& matchedBlocks); | ||||||
|
|
||||||
| //! \brief Assign blocks for new sequence. Does not try to reuse blocks. | ||||||
| void addSequence(GenerationRequest& sequence, SizeType32 numContextBlocks, bool isShareLastContextBlock); | ||||||
|
|
@@ -564,8 +664,6 @@ class WindowBlockManager | |||||
| //! \brief Get the ids of all newly allocated (not reused) blocks for the sequence. | ||||||
| std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const; | ||||||
|
|
||||||
| void storeBlocksForReuse(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest); | ||||||
|
|
||||||
| void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest); | ||||||
|
|
||||||
| //! \brief Release blocks of the sequence. | ||||||
|
|
@@ -708,11 +806,6 @@ class WindowBlockManager | |||||
| //! \details Does nothing if block is already in secondary memory. | ||||||
| void offloadBlock(BlockPtr const& block); | ||||||
|
|
||||||
| //! \brief Find first new block that must be allocated for context phase and return its concatenated token vectors. | ||||||
| //! \details Only full blocks are considered. | ||||||
| [[nodiscard]] std::optional<BlockKey> findNewContextBlock( | ||||||
| VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const; | ||||||
|
|
||||||
| [[nodiscard]] runtime::BufferManager const& getBufferManager() const | ||||||
| { | ||||||
| return mBufferManager; | ||||||
|
|
@@ -726,7 +819,7 @@ class WindowBlockManager | |||||
| //! \brief Store blocks in cached blocks. | ||||||
| //! \param blockKeys Key of each block. | ||||||
| //! \param blockIds Id of each block. | ||||||
| void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds); | ||||||
| void storeBlocks(LookupResults const& lookupNodes, std::vector<KVCacheBlock::IdType> const& blockIds); | ||||||
|
|
||||||
| [[nodiscard]] bool verifyQueueIntegrity(); | ||||||
|
|
||||||
|
|
@@ -748,6 +841,11 @@ class WindowBlockManager | |||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| [[nodiscard]] bool isSWA() const | ||||||
| { | ||||||
| return mIsSWA; | ||||||
| } | ||||||
|
|
||||||
| private: | ||||||
| //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq. | ||||||
| void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx); | ||||||
|
|
@@ -759,22 +857,15 @@ class WindowBlockManager | |||||
| //! \param blockKeys Key of each block. | ||||||
| //! \param sequence Sequence to which blocks are assigned. | ||||||
| //! \return Number of matched tokens from loaded blocks. | ||||||
| SizeType32 loadOrAllocateBlocks(std::vector<BlockKey> const& blockKeys, SizeType32 numContextBlocks, | ||||||
| GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions); | ||||||
|
|
||||||
| //! \brief Free block and all its descendants. This makes block a claimed leaf block. | ||||||
| void freeChildren(BlockPtr const& block, executor::RetentionPriority priority, | ||||||
| std::optional<std::chrono::milliseconds> durationMs); | ||||||
| SizeType32 loadOrAllocateBlocks( | ||||||
| std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>> const& matchedBlocks, SizeType32 numContextBlocks, | ||||||
| GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions); | ||||||
|
|
||||||
| //! \brief Find block least likely to be reused, free it if necessary and return. | ||||||
| [[nodiscard]] BlockPtr getFreeBlock( | ||||||
| executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority, | ||||||
| std::optional<std::chrono::milliseconds> durationMs = std::nullopt); | ||||||
|
|
||||||
| //! \brief Free block from previous block and claim it from free blocks list. | ||||||
| void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt, | ||||||
| std::optional<std::chrono::milliseconds> durationMs = std::nullopt); | ||||||
|
|
||||||
| //! \brief For FP4 quantization. Creates pool objects for FP4 block scalars. | ||||||
| void createBlockScalePools(SizeType32 blockSize); | ||||||
|
|
||||||
|
|
@@ -846,6 +937,9 @@ class WindowBlockManager | |||||
| bool mCopyOnPartialReuse; | ||||||
| // The kv cache connector manager | ||||||
| std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager; | ||||||
|
|
||||||
| // Whether this window block manager is for an SWA layer. Affects evicting policies | ||||||
| bool mIsSWA; | ||||||
| }; | ||||||
|
|
||||||
| class BlockManager | ||||||
|
|
@@ -919,10 +1013,10 @@ class BlockManager | |||||
| //! \details Does nothing if block is already in secondary memory. | ||||||
| void offloadBlock(BlockPtr const& block, SizeType32 windowSize); | ||||||
|
|
||||||
| void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, | ||||||
| void storeBlocks(LookupResults const& lookupNodes, std::vector<KVCacheBlock::IdType> const& blockIds, | ||||||
| SizeType32 windowSize) | ||||||
| { | ||||||
| mWindowBlockManagers.at(windowSize).storeBlocks(blockKeys, blockIds); | ||||||
| mWindowBlockManagers.at(windowSize).storeBlocks(lookupNodes, blockIds); | ||||||
| } | ||||||
|
|
||||||
| [[nodiscard]] bool verifyQueueIntegrity(SizeType32 windowSize); | ||||||
|
|
@@ -1167,6 +1261,9 @@ class BlockManager | |||||
| std::vector<SizeType32> mLayerToWindowSize; | ||||||
| std::vector<SizeType32> mAbsolutePoolToWindowSize; | ||||||
| std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex; | ||||||
|
|
||||||
| bool mEnablePartialReuse; | ||||||
| LookupPtr mLookup; | ||||||
| }; | ||||||
|
|
||||||
| struct OffsetTableDimensions | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.