Skip to content

Commit 0202754

Browse files
committed
[KV cache manager] Simplify block allocation
Inputs may arrive with or without chunked prefill. That distinction is handled by the outer control flow, namely the caller of addSequence. Keep addSequence simple by having it allocate blocks for exactly the specified inputLength. Signed-off-by: eopXD <[email protected]>
1 parent e956fbf commit 0202754

File tree

1 file changed

+5
-9
lines changed

1 file changed

+5
-9
lines changed

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1804,7 +1804,8 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
18041804

18051805
SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const
18061806
{
1807-
1807+
TLLM_CHECK_WITH_INFO(
1808+
mSinkBlockTokenLength == 0 && mSinkBubbleLength == 0, "streamLLM is not supported at the moment");
18081809
if (isCrossKv())
18091810
{
18101811
if (req.isContextInitState() && req.getContextCurrentPosition() == 0)
@@ -1815,14 +1816,9 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req,
18151816
return 0; // cross KV cache doesn't grow after the initial context phase
18161817
}
18171818

1818-
auto const temporaryAttentionWindow = mBlockManager.getWindowSizeMetadata(windowSize).temporaryAttentionWindow;
1819-
1820-
SizeType32 const numContextBlocks
1821-
= (std::min(req.mPromptLen, windowSize + temporaryAttentionWindow) + mSinkBubbleLength) / getTokensPerBlock();
1819+
SizeType32 const numContextBlocks = req.mPromptLen / getTokensPerBlock();
18221820

1823-
SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv(
1824-
std::min(req.mPromptLen + req.mMaxNewTokens, windowSize + temporaryAttentionWindow) + mSinkBubbleLength,
1825-
getTokensPerBlock());
1821+
SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv(req.mPromptLen + req.mMaxNewTokens, getTokensPerBlock());
18261822

18271823
SizeType32 const numGenBlocksPerBeam = numTotalBlocksPerBeam - numContextBlocks;
18281824

@@ -1951,7 +1947,7 @@ void KVCacheManager::addSequence(
19511947
auto const temporaryAttentionWindow = metadata.temporaryAttentionWindow;
19521948

19531949
// Consider the temporaryAttentionWindow when allocating blocks.
1954-
auto const effectiveInputLength = std::min(inputLength, maxTokenNum + temporaryAttentionWindow);
1950+
auto const effectiveInputLength = inputLength;
19551951
auto const numContextBlocks = tc::ceilDiv(effectiveInputLength, getTokensPerBlock());
19561952
if (!sequence.isCyclic() && mEnableBlockReuse)
19571953
{

0 commit comments

Comments
 (0)