
Commit d751a9b

[KV cache manager] Support SWA kv cache reuse
Signed-off-by: eopXD <[email protected]>
1 parent 0202754 commit d751a9b

File tree: 4 files changed, +982 −333 lines

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 54 additions & 38 deletions
@@ -53,6 +53,10 @@ static constexpr SizeType32 kPrimaryLevel = 0;
 
 static constexpr SizeType32 kSecondaryLevel = 1;
 
+// Extra block buffer allocated for SWA to be able to always keep "window size"
+// tokens held in the blocks.
+static constexpr SizeType32 kSWAExtraBlock = 1;
+
 class KVCacheBlock;
 class BlockManager;
 class KVCacheManager;
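
Note (illustration, not part of the diff): a minimal, self-contained sketch of why one extra block is reserved for SWA. With window size W and tokensPerBlock T, the last W tokens fit in W/T blocks only while the window is aligned to a block boundary; once it straddles a boundary they touch W/T + 1 blocks, which is what kSWAExtraBlock accounts for. The helper below is hypothetical and only illustrates the arithmetic.

#include <cassert>

// Hypothetical helper, for illustration only: number of blocks touched by the
// last `windowSize` tokens of a `numTokens`-long sequence.
int blocksNeededForWindow(int windowSize, int tokensPerBlock, int numTokens)
{
    int const firstTokenInWindow = numTokens > windowSize ? numTokens - windowSize : 0;
    int const firstBlock = firstTokenInWindow / tokensPerBlock;
    int const lastBlock = (numTokens - 1) / tokensPerBlock;
    return lastBlock - firstBlock + 1;
}

int main()
{
    // windowSize = 8, tokensPerBlock = 4: an aligned window fits in 8/4 = 2 blocks,
    assert(blocksNeededForWindow(8, 4, 8) == 2);
    // but a misaligned window touches 3 blocks, i.e. 8/4 + kSWAExtraBlock.
    assert(blocksNeededForWindow(8, 4, 10) == 3);
    return 0;
}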
@@ -88,8 +92,8 @@ struct WindowSizeMetadata
     SizeType32 allottedSecondaryBlocks;  // Number of secondary blocks allotted to the windowSize
     SizeType32 absolutePoolsOffset;      // cumulative number of pools up to manager
     SizeType32 numPools;                 // number of managed pools
-    SizeType32 maxTokenNum;              // Maximum token length (including bubble)
-    SizeType32 maxBlocksPerSeq;
+    SizeType32 maxTokensPerSeq;          // Maximum token length per sequence
+    SizeType32 maxBlocksPerSeq;          // Maximum number of blocks per sequence
     SizeType32 maxNumBlocks;             // Number of primary+secondary blocks allotted to the windowSize
     SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence.
                                          // Only needed when chunked context + sliding window attention are used
@@ -99,9 +103,9 @@
     {
         return tensorrt_llm::common::fmtstr(
             "WindowSizeMetadata{ .allottedPrimaryBlocks=%d, .allottedSecondaryBlocks=%d, .absolutePoolsOffset=%d, "
-            ".numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d }",
-            allottedPrimaryBlocks, allottedSecondaryBlocks, absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq,
-            maxNumBlocks, temporaryAttentionWindow);
+            ".numPools=%d, .maxTokensPerSeq=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d }",
+            allottedPrimaryBlocks, allottedSecondaryBlocks, absolutePoolsOffset, numPools, maxTokensPerSeq,
+            maxBlocksPerSeq, maxNumBlocks, temporaryAttentionWindow);
     }
 };
 
@@ -335,14 +339,7 @@ class GenerationRequest
         , mNumTokens(numTokens)
         , mBeamWidth(beamWidth)
         , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
-        // min window size + sink bubble length
-        // Why use the minimum window size:
-        // Chunked Prefill + Reuse calls `setPrepopulatedPromptLen()` which sets
-        // `mContextCurrentPosition` - this cannot be done for some windows sizes and
-        // not for others, the state needs to remain identical for all window sizes. So
-        // we currently resort to strictly disabling the reuse code path for all window
-        // sizes at once or enable it for all window sizes at once.
-        , mCyclicThreshold(windowSizeToMetadata.cbegin()->second.maxTokenNum)
+        , mNumFrontBlocksRemoved(0)
     {
         auto const numWindowSizes = windowSizeToMetadata.size();
         mCacheBlockIds.reserve(numWindowSizes);
@@ -385,6 +382,11 @@
         return mNumTokens;
     }
 
+    [[nodiscard]] SizeType32 getNumFrontBlocksRemoved() const
+    {
+        return mNumFrontBlocksRemoved;
+    }
+
     [[nodiscard]] SizeType32 getBeamWidth() const
     {
         return mBeamWidth;
@@ -418,6 +420,17 @@
         }
     }
 
+    void removeFrontBlock(SizeType32 windowSize)
+    {
+        for (auto& beamBlockIds : mCacheBlockIds.at(windowSize))
+        {
+            // Does not actually remove from mCacheBlockIds like removeLastBlock
+            // Id is set to -1 instead.
+            beamBlockIds[mNumFrontBlocksRemoved] = -1;
+        }
+        ++mNumFrontBlocksRemoved;
+    }
+
     void removeLastBlock(SizeType32 windowSize)
     {
         for (auto& beamBlockIds : mCacheBlockIds.at(windowSize))
@@ -436,14 +449,6 @@
         return mKvCacheRetentionConfig.getDecodeDurationMs();
     }
 
-    // @brief Check whether the sequence uses cyclic KV cache.
-    // @return `true` if we have begun overwriting the beginning of the sequence's KV cache.
-    // @details If `true`, we cannot store the sequence's KV cache for reuse.
-    [[nodiscard]] bool isCyclic() const
-    {
-        return mNumTokens >= mCyclicThreshold;
-    }
-
 private:
     // Request id of the sequence
     LlmRequest::RequestIdType mRequestId;
@@ -457,9 +462,8 @@
     std::unordered_map<SizeType32, runtime::ITensor::SharedPtr> mCacheBlockIndices;
     // The retention priority to assign to decode blocks
     executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
-
-    // Number of tokens at which the KV Cache begins sliding [for the minimum attention window]
-    SizeType32 mCyclicThreshold;
+    // Number of front blocks removed from the sequence
+    SizeType32 mNumFrontBlocksRemoved;
 };
 
 // attach metadata to a pool pointer
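
Note (illustration, not part of the diff): the front-block bookkeeping added above keeps the per-beam block-id vectors at full length and only overwrites detached entries with -1, so the indices of the remaining blocks stay stable for offset caching. A stripped-down sketch of that behavior, using a hypothetical MiniSequence stand-in for GenerationRequest:

#include <cassert>
#include <vector>

struct MiniSequence
{
    std::vector<std::vector<int>> beamBlockIds; // one block-id vector per beam
    int numFrontBlocksRemoved = 0;

    // Mirrors removeFrontBlock(): mark the oldest still-attached block as detached (-1)
    // instead of erasing it, then advance the front counter.
    void removeFrontBlock()
    {
        for (auto& ids : beamBlockIds)
        {
            ids[numFrontBlocksRemoved] = -1;
        }
        ++numFrontBlocksRemoved;
    }
};

int main()
{
    MiniSequence seq;
    seq.beamBlockIds = {{3, 7, 9}}; // single beam holding three blocks
    seq.removeFrontBlock();
    assert(seq.beamBlockIds[0][0] == -1); // oldest block detached in place
    assert(seq.beamBlockIds[0][1] == 7);  // later blocks keep their positions
    assert(seq.numFrontBlocksRemoved == 1);
    return 0;
}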
@@ -560,14 +564,26 @@ class WindowBlockManager
     void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
 
     //! \brief Release blocks of the sequence.
-    void releaseBlocks(GenerationRequest& sequence);
+    //! \details When llmRequest is provided and reuse is enabled, blocks will be stored.
+    void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt);
 
     //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
     void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
 
     //! \brief Release last block in the sequence
     void releaseLastBlock(GenerationRequest& sequence);
 
+    //! \brief Detach block from the sequence
+    void detachBlock(GenerationRequest& sequence, bool isEnableBlockReuse);
+
+    //! \brief Check and add a block to the sequence if needed.
+    //! \details Out-of-window blocks will be detached. If reuse is enabled,
+    //!          the detached block will be stored via offload.
+    void addBlockIfNeeded(GenerationRequest& sequence, bool isEnableBlockReuse);
+
+    //! \brief Cache offsets for new block
+    void cacheNewBlockOffset(GenerationRequest& sequence);
+
     [[nodiscard]] SizeType32 getWindowSize() const noexcept
     {
         return mWindowSize;
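
Note (hedged sketch, not the real implementation): the header only declares detachBlock and addBlockIfNeeded; their bodies live in the .cpp file. Based solely on the \brief/\details comments above, the decode-time flow is roughly the following. All names below (SwaStepHooks and its callbacks) are hypothetical stand-ins, not the WindowBlockManager API.

#include <functional>

// Hypothetical hooks standing in for WindowBlockManager/GenerationRequest state.
struct SwaStepHooks
{
    std::function<bool()> needsNewBlock;         // the sequence has outgrown its last block
    std::function<bool()> frontBlockOutOfWindow; // the oldest block slid fully out of the window
    std::function<void(bool)> detachFrontBlock;  // detach; optionally store (offload) for reuse
    std::function<void()> allocateAndCacheNewBlockOffset;
};

// Rough shape of the flow suggested by the addBlockIfNeeded() doc comment.
void addBlockIfNeededSketch(SwaStepHooks const& hooks, bool isEnableBlockReuse)
{
    if (!hooks.needsNewBlock())
    {
        return;
    }
    if (hooks.frontBlockOutOfWindow())
    {
        // Out-of-window blocks are detached; with reuse enabled the detached
        // block is stored via offload rather than simply freed.
        hooks.detachFrontBlock(isEnableBlockReuse);
    }
    hooks.allocateAndCacheNewBlockOffset();
}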
@@ -578,7 +594,7 @@
         return mLogPrefix;
     }
 
-    [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept;
+    [[nodiscard]] SizeType32 getNumFreeBlocks(SizeType32 cacheLevel = kPrimaryLevel) const noexcept;
 
     [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const
     {
@@ -713,7 +729,8 @@
     //! \brief Store blocks in cached blocks.
     //! \param blockKeys Key of each block.
     //! \param blockIds Id of each block.
-    void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);
+    //! \return Number of actual blocks stored.
+    SizeType32 storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);
 
     void addBlockToHashMap(BlockPtr const& block);
 
@@ -916,19 +933,20 @@ class BlockManager
 
     void startScheduling();
 
-    [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const
+    [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize(
+        SizeType32 cacheLevel = kPrimaryLevel) const
     {
         std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize;
         for (auto const& [windowSize, manager] : mWindowBlockManagers)
         {
-            numFreeBlocksPerWindowSize[windowSize] = manager.getNumFreeBlocks();
+            numFreeBlocksPerWindowSize[windowSize] = manager.getNumFreeBlocks(cacheLevel);
         }
         return numFreeBlocksPerWindowSize;
     }
 
-    [[nodiscard]] SizeType32 getNumFreeBlocks() const
+    [[nodiscard]] SizeType32 getNumFreeBlocks(SizeType32 cacheLevel = kPrimaryLevel) const
     {
-        return sumWindows([](auto const& manager) { return manager.getNumFreeBlocks(); });
+        return sumWindows([cacheLevel](auto const& manager) { return manager.getNumFreeBlocks(cacheLevel); });
     }
 
     [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired, SizeType32 windowSize) const
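
Note (hypothetical call site, not from the diff): with the new cacheLevel parameter, free-block counts can be queried per level using the kPrimaryLevel/kSecondaryLevel constants declared at the top of this header, e.g. to check the secondary (offload) pool separately:

// `blockManager` is a hypothetical BlockManager instance.
auto const freePrimary = blockManager.getNumFreeBlocks();                  // defaults to kPrimaryLevel
auto const freeSecondary = blockManager.getNumFreeBlocks(kSecondaryLevel);
auto const freeSecondaryPerWindow = blockManager.getNumFreeBlocksPerWindowSize(kSecondaryLevel);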
@@ -1102,12 +1120,10 @@
     //! \brief Store newest block for reuse
     void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
 
-    [[nodiscard]] static bool isUseOneMoreBlock(
-        SizeType32 windowSize, std::optional<SizeType32> maxSequenceLength, SizeType32 maxBeamWidth)
+    [[nodiscard]] static bool isUseOneMoreBlock()
     {
-        bool const isCyclicWindowSize = maxSequenceLength.has_value() && maxSequenceLength.value() > windowSize;
-        bool const isBeamSearch = maxBeamWidth > 1;
-        return isCyclicWindowSize && isBeamSearch;
+        //
+        return false;
     }
 
     //! \brief Perform per-request bookkeeping
@@ -1128,8 +1144,8 @@
     //! \brief Cache offsets for blocks initiated from sequence
     void cacheSequenceBlockOffsets(GenerationRequest& sequence, SizeType32 windowSize);
 
-    //! \brief Cache offsets for new block
-    void cacheNewBlockOffset(GenerationRequest& sequence, SizeType32 windowSize);
+    //! \brief Add block to the sequence if needed
+    void addBlockIfNeeded(GenerationRequest& sequence, bool isEnableBlockReuse);
 
 private:
     [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
