[KV cache manager] Simplify block allocation

eopXD · eopXD · commit 0202754b4377 · 2025-08-09T23:36:04.000-07:00
We will be dealing with input with or without chunk prefill.
This will be dealt by the outer control flow, namely the caller
of addSequence. Treat addSequence simple by allocating
the specified inputLength of blocks.

Signed-off-by: eopXD &lt;yuehtingc@nvidia.com&gt;
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -1804,7 +1804,8 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
 
 SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const
 {
-
+    TLLM_CHECK_WITH_INFO(
+        mSinkBlockTokenLength == 0 && mSinkBubbleLength == 0, "streamLLM is not supported at the moment");
     if (isCrossKv())
     {
         if (req.isContextInitState() && req.getContextCurrentPosition() == 0)
@@ -1815,14 +1816,9 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req,
         return 0; // cross KV cache doesn't grow after the initial context phase
     }
 
-    auto const temporaryAttentionWindow = mBlockManager.getWindowSizeMetadata(windowSize).temporaryAttentionWindow;
-
-    SizeType32 const numContextBlocks
-        = (std::min(req.mPromptLen, windowSize + temporaryAttentionWindow) + mSinkBubbleLength) / getTokensPerBlock();
+    SizeType32 const numContextBlocks = req.mPromptLen / getTokensPerBlock();
 
-    SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv(
-        std::min(req.mPromptLen + req.mMaxNewTokens, windowSize + temporaryAttentionWindow) + mSinkBubbleLength,
-        getTokensPerBlock());
+    SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv(req.mPromptLen + req.mMaxNewTokens, getTokensPerBlock());
 
     SizeType32 const numGenBlocksPerBeam = numTotalBlocksPerBeam - numContextBlocks;
 
@@ -1951,7 +1947,7 @@ void KVCacheManager::addSequence(
         auto const temporaryAttentionWindow = metadata.temporaryAttentionWindow;
 
         // Consider the temporaryAttentionWindow when allocating blocks.
-        auto const effectiveInputLength = std::min(inputLength, maxTokenNum + temporaryAttentionWindow);
+        auto const effectiveInputLength = inputLength;
         auto const numContextBlocks = tc::ceilDiv(effectiveInputLength, getTokensPerBlock());
         if (!sequence.isCyclic() && mEnableBlockReuse)
         {