NVIDIA · thorjohnsen · Sep 3, 2025 · Sep 9, 2025 · Sep 9, 2025 · Sep 9, 2025
diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h
@@ -92,13 +92,8 @@ class LRUEvictionPolicy : public BaseEvictionPolicy
     bool verifyQueueIntegrity() override;
 
 private:
-    // Check if the block should be added to mFreeQueues.
-    bool isReleasedLeafBlock(BlockPtr const& block);
-
     // Queues of available leaf blocks, split by cache level and priority level
     std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
-    // All blocks that have been released, along with the amount of released children
-    std::vector<std::unordered_set<SizeType32>> mReleasedBlocks;
     // Iterators to block entries in mFreeQueues
     std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
     // Amount of free blocks at each cache level

diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -55,15 +55,21 @@ static constexpr SizeType32 kPrimaryLevel = 0;
 static constexpr SizeType32 kSecondaryLevel = 1;
 
 class KVCacheBlock;
+class KVCachePromptLookupNode;
+class KVCachePromptLookup;
 class BlockManager;
 class KVCacheManager;
 class KVCacheTransferManager;
+class WindowBlockManager;
+class GenerationRequest;
 
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
 using TokenIdType = tensorrt_llm::runtime::TokenIdType;
 using VecTokens = std::vector<TokenIdType>;
 using BeamTokens = std::vector<VecTokens>;
 using BlockPtr = std::shared_ptr<KVCacheBlock>;
+using LookupNodePtr = std::shared_ptr<KVCachePromptLookupNode>;
+using LookupPtr = std::shared_ptr<KVCachePromptLookup>;
 using FreeBlocksQueue = std::list<BlockPtr>;
 using UniqueToken = tensorrt_llm::runtime::UniqueToken;
 using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
@@ -96,6 +102,7 @@ struct WindowSizeMetadata
                                          // Only needed when chunked context + sliding window attention are used
                                          // together. And it should only be considered when allocating blocks.
 
+
     std::string toString()
     {
         return tensorrt_llm::common::fmtstr(
@@ -169,7 +176,7 @@ struct BlockKeyHasher
     }
 };
 
-using NextBlockMap = std::unordered_map<BlockKey, BlockPtr, BlockKeyHasher>;
+using NextNodeMap = std::unordered_map<BlockKey, LookupNodePtr, BlockKeyHasher>;
 
 struct KvCacheStats
 {
@@ -197,6 +204,114 @@ struct KvCacheStats
     std::size_t allocatedBytes{};
 };
 
+using LookupResult = std::vector<std::tuple<bool,SizeType32,LookupNodePtr>>;
+
+// LookupResults holds one LookupResult for each BlockKey used during search.
+// Each LookupResult is a vector of [partialMatch, numMatched, node] tuples:
+// it is empty if no match was found and has exactly one item if an exact match was found.
+// If partial matching is enabled and no exact match was found,
+// it lists all nodes with at least one matching token,
+// sorted in descending order of number of matching tokens.
+using LookupResults = std::vector<LookupResult>;
+
+// Implement an object that represents a given prompt prefix in search structure.
+// The node contains pointers to all reusable state for the prompt prefix.
+class KVCachePromptLookupNode
+{
+public:
+    explicit KVCachePromptLookupNode(BlockKey const& blockKey, bool isFull);
+
+    void setBlockKey(BlockKey const& blockKey, bool isFull);
+
+    BlockKey getBlockKey() const;
+
+    [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;
+
+    [[nodiscard]] LookupNodePtr const& getPrevNode() const;
+
+    void setPrevNode(LookupNodePtr prevNode);
+
+    [[nodiscard]] NextNodeMap getNextNodes() const;
+
+    void addNextNode(BlockKey const& blockKey, LookupNodePtr block);
+
+    void removeNextNode(BlockKey const& blockKey);
+
+    //! \brief Find nodes matching blockKey. If enablePartialReuse is true, a returned node may match only a prefix
+    //! of blockKey.
+    //! @return vector of [partialMatch, numMatched, node] tuples, partialMatch is true if not all the tokens of the
+    //! node were matched.
+    [[nodiscard]] LookupResult findMatchingNodes(
+        BlockKey const& blockKey, bool enablePartialReuse) const;
+
+    void setBlock(SizeType32 windowSize, BlockPtr block);
+
+    [[nodiscard]] BlockPtr getBlock(SizeType32 windowSize) const;
+
+    [[nodiscard]] bool hasBlocks() const;
+
+    [[nodiscard]] bool isFull() const;
+
+    [[nodiscard]] bool isLeaf() const;
+
+private:
+    // Key of this node in mNextNodes map in node pointed to by mPrevNode
+    BlockKey mBlockKey;
+    // Flag indicating if block is full
+    bool mIsFull;
+    // Previous node in search structure
+    LookupNodePtr mPrevNode;
+    // Next node(s) in sequence(s)
+    NextNodeMap mNextNodes;
+    // Pointers to blocks holding KV state for this prompt prefix, keyed by window size
+    std::unordered_map<SizeType32, BlockPtr> mBlocks;
+};
+
+class KVCachePromptLookup
+{
+public:
+    explicit KVCachePromptLookup(CacheType cacheType, SizeType32 tokensPerBlock);
+
+    [[nodiscard]] std::vector<BlockKey> getBlockKeys(LlmRequest const& llmRequest, SizeType32 inputLength, bool allowPartiallyFilledBlock) const;
+
+    //! \brief Find first new context block for each window size.
+    //! \param llmRequest The new request.
+    //! \param inputLength Number of useful prompt tokens. If zero, length of prompt minus 1 is used.
+    //! \param allowPartiallyFilledBlock Allow matching of blocks that are not full.
+    //! \param windowSizes Window sizes to search. Method will search for a new context block for each window size.
+    //! \return map from window size to BlockKey. The block key is that of first new context block for that window size.
+    [[nodiscard]] std::unordered_map<SizeType32,BlockKey> findNewContextBlock(LlmRequest const& llmRequest, SizeType32 inputLength, bool allowPartiallyFilledBlock, std::vector<SizeType32> const& windowSizes) const;
+
+    //! \brief Find matching nodes for a given prompt prefix
+    //! \param allowPartiallyFilledBlock Allow last block in prompt to have less than tokensPerBlock tokens.
+    //! \param enablePartialReuse Allow matching tokens to be copied from block that does not match entire prompt.
+    [[nodiscard]] LookupResults lookup(LlmRequest const & llmRequest, SizeType32 inputLength, bool allowPartiallyFilledBlock, bool enablePartialReuse, bool createNodes);
+
+    //! \brief Find matching blocks for a given prompt prefix for all window sizes.
+    //! \return map from window size to matching blocks. Matching blocks is a vector of varying size.
+    std::unordered_map<SizeType32,std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>>> lookupBlocks(
+            std::map<SizeType32,WindowBlockManager> const& windowBlockManagers, 
+            LlmRequest const& llmRequest, SizeType32 inputLength, 
+            bool allowPartiallyFilledBlock, bool enablePartialReuse);
+
+    // Debugging functions
+    //
+    std::string printNode(LookupResult const& match);
+    std::string printNodes(LookupResults const& matches);
+    std::string printMatchedBlock(std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr> const& match);
+    std::string printMatchedBlocks(std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>> const& matches);
+    std::string printMatchedBlocks(std::unordered_map<SizeType32,std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>>> const& matches);
+    std::string printPrompt(LlmRequest const& llmRequest);
+
+private:
+    // Root of search structure
+    LookupNodePtr mRoot;
+    // KV cache type (self or cross)
+    CacheType mCacheType;
+    // Number of tokens per one block
+    SizeType32 mTokensPerBlock;
+};
+
 // Basic building block of a paged KV cache - a single
 // cache block. This class just holds metadata, no pointers
 // since it is reused across all layers.
@@ -207,14 +322,12 @@ class KVCacheBlock
 
     static constexpr IdType kCachedBlocksRootId = -1;
 
-    explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);
+    explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx, SizeType32 windowSize);
 
     void startScheduling();
 
     [[nodiscard]] IdType getBlockId() const;
 
-    [[nodiscard]] NextBlockMap getNextBlocks() const;
-
     [[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const;
 
     [[nodiscard]] bool isPrimary() const;
@@ -231,40 +344,22 @@ class KVCacheBlock
 
     [[nodiscard]] bool hasSchedulingRefs() const;
 
+    // This info is duplicated in KVCacheBlock and KVCachePromptLookupNode
+    // because it is needed by the former when KVCacheBlock might not be stored
+    // in lookup structure and therefore cannot get this value from there
     void setBlockKey(BlockKey const& blockKey, bool isFull);
-
-    BlockKey getBlockKey();
-
+    BlockKey getBlockKey() const;
     [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;
 
-    BlockPtr const& getPrevBlock() const;
-
-    void setPrevBlock(BlockPtr prevBlock);
-
     BlockPtr const& getPrevBlockInSeq() const;
-
     void setPrevBlockInSeq(BlockPtr prevBlock);
 
-    void addNextBlock(BlockKey const& blockKey, BlockPtr block);
-
-    void removeNextBlock(BlockKey const& blockKey);
-
-    //! \brief Find block matching blockKey. If allowPartial is true, the returned block may match only a prefix of
-    //! blockKey.
-    //! @return tuple of [partialMatch, numMatched, block], partialMatch is true if not all the tokens of the block were
-    //! matched.
-    [[nodiscard]] std::tuple<bool, SizeType32, BlockPtr> findMatchingBlock(
-        BlockKey const& blockKey, bool enablePartialReuse, bool copyOnPartialReuse) const;
-
-    //! \brief Free block from previous block if present.
-    void freeLeafBlock();
+    BlockPtr getPrevBlock() const;
 
     [[nodiscard]] bool isFull() const;
 
     [[nodiscard]] bool isShared() const;
 
-    [[nodiscard]] bool isLeaf() const;
-
     void setPriority(executor::RetentionPriority priority);
 
     [[nodiscard]] executor::RetentionPriority getPriority() const;
@@ -284,6 +379,12 @@ class KVCacheBlock
 
     size_t getHash() const;
 
+    // set lookup node using this block
+    void setLookupNode(LookupNodePtr node, BlockPtr block);
+
+    // get lookup node using this block. Can be nullptr
+    [[nodiscard]] LookupNodePtr getLookupNode() const;
+
 private:
     // Linear ID of block independent of pool
     IdType mBlockId;
@@ -301,15 +402,9 @@ class KVCacheBlock
     // Key of this block in mNextBlocks map in block pointed to by mPrevBlock
     BlockKey mBlockKey;
 
-    // Previous block in reuse tree, or nullptr if not reusing
-    BlockPtr mPrevBlock;
-
     // Previous block in sequence, == nullptr for first block, == mPrevBlock if reusing and not first
     BlockPtr mPrevBlockInSeq;
 
-    // Next block(s) in sequence(s)
-    NextBlockMap mNextBlocks;
-
     // Iterator pointing to this block in mFreeBlocks.
     std::optional<FreeBlocksQueue::iterator> mFreeBlockIterator;
 
@@ -324,6 +419,11 @@ class KVCacheBlock
     std::optional<std::chrono::steady_clock::time_point::duration> mExpirationTime;
     // Hash for the event manager
     size_t mHash;
+
+    // Pointer to search tree lookup node using this block
+    LookupNodePtr mLookupNode;
+    // Window size using this block (0 if not in use)
+    SizeType32 mWindowSize;
 };
 
 class GenerationRequest
@@ -538,7 +638,7 @@ class WindowBlockManager
         SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
         bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
         std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
-        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager);
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager, bool isSWA);
 
     ~WindowBlockManager();
 
@@ -550,7 +650,7 @@ class WindowBlockManager
 
     //! \brief Assign blocks for new sequence. Try to reuse blocks.
     void addSequence(
-        GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest);
+        GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest, std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>> const& matchedBlocks);
 
     //! \brief Assign blocks for new sequence. Does not try to reuse blocks.
     void addSequence(GenerationRequest& sequence, SizeType32 numContextBlocks, bool isShareLastContextBlock);
@@ -564,8 +664,6 @@ class WindowBlockManager
     //! \brief Get the ids of all newly allocated (not reused) blocks for the sequence.
     std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const;
 
-    void storeBlocksForReuse(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
-
     void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
 
     //! \brief Release blocks of the sequence.
@@ -708,11 +806,6 @@ class WindowBlockManager
     //! \details Does nothing if block is already in secondary memory.
     void offloadBlock(BlockPtr const& block);
 
-    //! \brief Find first new block that must be allocated for context phase and return it's concatenated token vectors.
-    //! \details Only full blocks are considered.
-    [[nodiscard]] std::optional<BlockKey> findNewContextBlock(
-        VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const;
-
     [[nodiscard]] runtime::BufferManager const& getBufferManager() const
     {
         return mBufferManager;
@@ -726,7 +819,7 @@ class WindowBlockManager
     //! \brief Store blocks in cached blocks.
     //! \param blockKeys Key of each block.
     //! \param blockIds Id of each block.
-    void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);
+    void storeBlocks(LookupResults const& lookupNodes, std::vector<KVCacheBlock::IdType> const& blockIds);
 
     [[nodiscard]] bool verifyQueueIntegrity();
 
@@ -748,6 +841,11 @@ class WindowBlockManager
         return 0;
     }
 
+    [[nodiscard]] bool isSWA() const
+    {
+        return mIsSWA; // whether this window block manager is for an SWA layer
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -759,22 +857,15 @@ class WindowBlockManager
     //! \param blockKeys Key of each block.
     //! \param sequence Sequence to which blocks are assigned.
     //! \return Number of matched tokens from loaded blocks.
-    SizeType32 loadOrAllocateBlocks(std::vector<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
-        GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions);
-
-    //! \brief Free block and all it's descendants. This makes block a claimed leaf block.
-    void freeChildren(BlockPtr const& block, executor::RetentionPriority priority,
-        std::optional<std::chrono::milliseconds> durationMs);
+    SizeType32 loadOrAllocateBlocks(
+            std::vector<std::tuple<bool,SizeType32,BlockPtr,LookupNodePtr>> const& matchedBlocks, SizeType32 numContextBlocks,
+            GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions);
 
     //! \brief Find block least likely to be reused, free it if necessary and return.
     [[nodiscard]] BlockPtr getFreeBlock(
         executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority,
         std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
 
-    //! \brief Free block from previous block and claim it from free blocks list.
-    void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt,
-        std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
-
     //! \brief For FP4 quantization. Creates pool objects for FP4 block scalars.
     void createBlockScalePools(SizeType32 blockSize);
 
@@ -846,6 +937,9 @@ class WindowBlockManager
     bool mCopyOnPartialReuse;
     // The kv cache connector manager
     std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;
+
+    // Whether this window block manager is for an SWA layer. Affects evicting policies
+    bool mIsSWA;
 };
 
 class BlockManager
@@ -919,10 +1013,10 @@ class BlockManager
     //! \details Does nothing if block is already in secondary memory.
     void offloadBlock(BlockPtr const& block, SizeType32 windowSize);
 
-    void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
+    void storeBlocks(LookupResults const& lookupNodes, std::vector<KVCacheBlock::IdType> const& blockIds,
         SizeType32 windowSize)
     {
-        mWindowBlockManagers.at(windowSize).storeBlocks(blockKeys, blockIds);
+        mWindowBlockManagers.at(windowSize).storeBlocks(lookupNodes, blockIds);
     }
 
     [[nodiscard]] bool verifyQueueIntegrity(SizeType32 windowSize);
@@ -1167,6 +1261,9 @@ class BlockManager
     std::vector<SizeType32> mLayerToWindowSize;
     std::vector<SizeType32> mAbsolutePoolToWindowSize;
     std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
+
+    bool mEnablePartialReuse;
+    LookupPtr mLookup;
 };
 
 struct OffsetTableDimensions