diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
index a0234cbbe49..a49527a6157 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -536,8 +536,7 @@ class WindowBlockManager
         SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool,
         SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
         bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-        std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
-        bool copyOnPartialReuse);
+        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse);

     ~WindowBlockManager();

@@ -633,11 +632,6 @@ class WindowBlockManager
         return mAllBlocksById.at(blockId);
     }

-    [[nodiscard]] BlockMapIterRange getBlocksByHash(size_t hash) const
-    {
-        return mContextBlocksByHash.equal_range(hash);
-    }
-
     [[nodiscard]] SizeType32 getTokensPerBlock() const noexcept
     {
         return mTokensPerBlock;
     }
@@ -723,10 +717,6 @@ class WindowBlockManager
     //! \param blockIds Id of each block.
     void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);

-    void addBlockToHashMap(BlockPtr const& block);
-
-    void removeBlockFromHashMap(BlockPtr const& block);
-
     [[nodiscard]] bool verifyQueueIntegrity();

     // Only needed when sliding window attention + paged context fmha are used together.
@@ -808,8 +798,6 @@ class WindowBlockManager
     SizeType32 mTokensPerBlock;
     // List of all blocks by idx
     std::vector<BlockPtr> mAllBlocksById;
-    // List of all context blocks by hash
-    BlockMap mContextBlocksByHash;
     // Dummy block acting as root for BlockToken searches
     BlockPtr mCachedBlocksRoot;
     // KV cache type (self or cross)
@@ -841,8 +829,6 @@ class WindowBlockManager
     double mReusedTokens;
     // Total number of input tokens
     double mTotalInputTokens;
-    // Whether or not to maintain a hashmap of blocks.
-    bool mEnableHashKey;
     // Whether blocks that are partially matched should be reused.
     bool mEnablePartialReuse;
     // Whether partially matched blocks that are already in use should be copied and reused.
@@ -863,8 +849,8 @@ class BlockManager
         std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
         SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
-        bool enablePartialReuse = true, bool copyOnPartialReuse = true);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
+        bool copyOnPartialReuse = true);

     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
@@ -1081,11 +1067,6 @@ class BlockManager
         return mWindowBlockManagers.at(windowSize).getBlockById(blockId);
     }

-    [[nodiscard]] WindowBlockManager::BlockMapIterRange getBlocksByHash(size_t hash, SizeType32 windowSize) const
-    {
-        return mWindowBlockManagers.at(windowSize).getBlocksByHash(hash);
-    }
-
     [[nodiscard]] SizeType32 getNumPrimaryBlocks() const
     {
         return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); });
@@ -1096,16 +1077,6 @@ class BlockManager
         return getPool(poolIdx).containsBlockScales;
     }

-    void addBlockToHashMap(BlockPtr const& block, SizeType32 windowSize)
-    {
-        mWindowBlockManagers.at(windowSize).addBlockToHashMap(block);
-    }
-
-    void removeBlockFromHashMap(BlockPtr const& block, SizeType32 windowSize)
-    {
-        mWindowBlockManagers.at(windowSize).removeBlockFromHashMap(block);
-    }
-
     //! \brief Store context blocks
     void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest);

@@ -1385,8 +1356,8 @@ class KVCacheManager : public BaseKVCacheManager
         SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
-        bool enablePartialReuse = true, bool copyOnpartialReuse = true);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
+        bool copyOnpartialReuse = true);

     KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1405,8 +1376,8 @@ class KVCacheManager : public BaseKVCacheManager
         SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
         bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
-        bool enablePartialReuse = true, bool copyOnpartialReuse = true);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
+        bool copyOnpartialReuse = true);

     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1692,8 +1663,6 @@ class KVCacheManager : public BaseKVCacheManager
     std::unordered_map<LlmRequest::RequestIdType, GenerationRequest> mSequences;
     // Whether to cache KV pages for reuse
     bool mEnableBlockReuse;
-    // Whether enable finding blocks by their hash, ignored when reuse enabled
-    bool mEnableHashKey;
     // Mutex to protect access to mSequences
     mutable std::mutex mSequencesMtx;
     // buffers for static tensors, will be created after allocating pools
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
index 4202ba348ac..c032c80757c 100644
--- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -504,8 +504,7 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si
     std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
     SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType,
     std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-    std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
-    bool copyOnPartialReuse)
+    std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
     : mNumLayers{static_cast<SizeType32>(numKvHeadsPerLayer.size())}
     , mTokensPerBlock{tokensPerBlock}
     , mEventManager{std::move(eventManager)}
@@ -530,7 +529,7 @@
         TLLM_CHECK(allottedPrimaryBlocks > 0); // You can't have a model with negative primary blocks...
         mWindowBlockManagers.try_emplace(windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
             sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
-            onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enableHashKey, enablePartialReuse,
+            onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enablePartialReuse,
             copyOnPartialReuse);
     }

@@ -573,8 +572,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
     SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool,
     SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
     bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-    std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
-    bool copyOnPartialReuse)
+    std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
     : mDataType{dtype}
     , mWindowSize{windowSize}
     , mNumPrimaryBlocks{blocksInPrimaryPool}
@@ -596,7 +594,6 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
     , mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%u]", mWindowSize)}
     , mReusedTokens{0.0}
     , mTotalInputTokens{0.0}
-    , mEnableHashKey{enableHashKey}
     , mEnablePartialReuse{enablePartialReuse}
     , mCopyOnPartialReuse{copyOnPartialReuse}
 {
@@ -920,50 +917,6 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const
     mWindowBlockManagers.at(windowSize).setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId);
 }

-void WindowBlockManager::addBlockToHashMap(BlockPtr const& block)
-{
-    if (!mEnableHashKey)
-    {
-        return;
-    }
-    auto range = mContextBlocksByHash.equal_range(block->getHash());
-    for (auto it = range.first; it != range.second; ++it)
-    {
-        if (it->second == block)
-        {
-            // TODO: change to assert when reused block is added only once
-            TLLM_LOG_TRACE(
-                "Block %d by %zx exists", block->getBlockId(), block->getHash(), mContextBlocksByHash.size());
-            return;
-        }
-    }
-    TLLM_LOG_TRACE(
-        "Add block %d by %zx, block n = %zu", block->getBlockId(), block->getHash(), mContextBlocksByHash.size());
-    mContextBlocksByHash.emplace(block->getHash(), std::move(block));
-}
-
-void WindowBlockManager::removeBlockFromHashMap(BlockPtr const& block)
-{
-    if (mContextBlocksByHash.empty() || block->getBlockKey().uniqueTokens.empty())
-    {
-        // Hash key not enabled / Empty block
-        return;
-    }
-    auto range = mContextBlocksByHash.equal_range(block->getHash());
-    TLLM_LOG_TRACE(
-        "Remove block %d by %zx, block n = %zu", block->getBlockId(), block->getHash(), mContextBlocksByHash.size());
-    for (auto it = range.first; it != range.second; ++it)
-    {
-        if (it->second == block)
-        {
-            mContextBlocksByHash.erase(it);
-            return;
-        }
-    }
-    // TODO: should be unreachable
-    TLLM_LOG_DEBUG("Trying to remove block %d by %zx that is not in hash map", block->getBlockId(), block->getHash());
-}
-
 void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize)
 {
     mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock);
 }
@@ -1104,7 +1057,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const&
                     matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs);
                 TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Reused partially filled block %d", mLogPrefix.c_str(),
                     matchingBlockId);
-                addBlockToHashMap(matchingBlock);
             }
             searchRoot = nullptr; // no matching needed for following blocks
         }
@@ -1114,7 +1066,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const&
                 mEvictionPolicy->claimBlock(
                     matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs);
                 TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Matched full block %d", mLogPrefix.c_str(), matchingBlockId);
-                addBlockToHashMap(matchingBlock);
                 searchRoot = matchingBlock;
             }
             onboardBlock(matchingBlock);
@@ -1145,7 +1096,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const&
                 ++blockItr;
             }
             freeBlock->setHash();
-            addBlockToHashMap(freeBlock);
             ++mMissedBlocks;
         }
     }
@@ -1169,7 +1119,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const&
                 ++blockItr;
             }
             freeBlock->setHash();
-            addBlockToHashMap(freeBlock);
             TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d",
                 mLogPrefix.c_str(), beamIdx, freeBlock->getBlockId(), bi);
         }
@@ -1369,9 +1318,7 @@ void WindowBlockManager::storeBlocks(
         if (oldHash != newHash)
         {
             TLLM_LOG_DEBUG("#%d block hash %zx -> %zx", block->getBlockId(), oldHash, newHash);
-            removeBlockFromHashMap(block);
             block->setHash(newHash);
-            addBlockToHashMap(block);
         }
         searchRoot = block;
     }
@@ -1408,7 +1355,6 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp
         if (!block->hasRefs())
         {
             mEvictionPolicy->releaseBlock(block);
-            removeBlockFromHashMap(block);
         }
     }

@@ -1473,7 +1419,6 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
     if (!block->hasRefs())
     {
         mEvictionPolicy->releaseBlock(block, true);
-        removeBlockFromHashMap(block);
     }
     // Remove block from allocated blocks
     allocatedBlocks.pop_back();
@@ -1616,7 +1561,6 @@ void WindowBlockManager::releaseBlocks(GenerationRequest& sequence)
         if (!block->hasRefs())
         {
             mEvictionPolicy->releaseBlock(block);
-            removeBlockFromHashMap(block);
         }
     }
     // Remove stored block ids in sequence
@@ -1654,8 +1598,7 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
     : KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
         maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
         std::make_shared(reinterpret_cast(stream)), maxSequenceLength,
-        enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, false, enablePartialReuse,
-        copyOnPartialReuse)
+        enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, enablePartialReuse, copyOnPartialReuse)
 {
 }

@@ -1682,8 +1625,7 @@ KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer
     SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
     bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
     std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-    std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
-    bool copyOnPartialReuse)
+    std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
     : mMaxBeamWidth(maxBeamWidth)
     , mDataType(dtype)
     , mMaxAttentionWindow(*std::max_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end()))
@@ -1693,10 +1635,9 @@ KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer
     , mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
         std::move(stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype,
         mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager),
-        enableHashKey, enablePartialReuse, copyOnPartialReuse)
+        enablePartialReuse, copyOnPartialReuse)
     // disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case
     , mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse}
-    , mEnableHashKey{enableHashKey}
 {
     TLLM_CHECK_DEBUG(std::find(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end(), mMaxAttentionWindow)
         != maxAttentionWindowVec.end());
@@ -1716,12 +1657,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
     SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
     bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
     std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-    std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
-    bool copyOnPartialReuse)
+    std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
     : KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
         maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
         std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority,
-        std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse)
+        std::move(eventManager), enablePartialReuse, copyOnPartialReuse)
 {
 }

@@ -2085,30 +2025,6 @@ void KVCacheManager::addSequence(
                 llmRequest->mRequestId);
         }
         mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx, windowSize);
-        if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1)
-        {
-            constexpr SizeType32 beamIdx = 0;
-            auto const& blockIds = sequence.getCacheBlockIds(windowSize).at(beamIdx);
-            auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx);
-            auto blockedUniqueTokens = chopVectorIntoBlocks(
-                uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true);
-            auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
-            auto tokensPerBlock = static_cast(getTokensPerBlock());
-            for (size_t i = 0; i < blockIds.size(); i++)
-            {
-                auto const& block = mBlockManager.getBlockById(blockIds[i], windowSize);
-                if (i < blockKeys.size())
-                {
-                    block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock);
-                }
-                else
-                {
-                    block->setBlockKey({}, false);
-                }
-                block->setHash();
-                mBlockManager.addBlockToHashMap(block, windowSize);
-            }
-        }
     }
     cacheBlockOffsets(sequence, windowSize);
 }
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 4a5ddb89286..d42d798f68b 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -693,7 +693,7 @@ std::unique_ptr TrtGptModelInflightBatching::c
         kvCacheConfig.getEventBufferMaxSize() > 0
             ? std::make_unique(kvCacheConfig.getEventBufferMaxSize())
             : nullptr,
-        false, kvCacheConfig.getEnablePartialReuse(), kvCacheConfig.getCopyOnPartialReuse());
+        kvCacheConfig.getEnablePartialReuse(), kvCacheConfig.getCopyOnPartialReuse());

     reshapeKvTensors(kvCacheManager->getOffsetTableDimensions());
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
index bfc62acc3f6..8e58ee77f45 100644
--- a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
+++ b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
@@ -3053,189 +3053,6 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
     assertBlocks(seq3, {4}, {6});
 }

-namespace
-{
-KVCacheManager setupKvCacheManagerForHashTest(bool enableBlockReuse)
-{
-    auto constexpr numLayers = 2;
-    auto constexpr numHeads = 2;
-    auto constexpr sizePerHead = 64;
-    auto constexpr tokensPerBlock = 4;
-    auto constexpr maxNumSequences = 8;
-    auto constexpr maxBeamWidth = 1;
-    auto constexpr sinkTokenLength = 0;
-    auto const stream = std::make_shared<tr::CudaStream>();
-
-    auto constexpr maxBlocksPerSeq = 8;
-    auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
-    auto constexpr maxAttentionWindow = maxNumTokens;
-
-    auto constexpr blocksInPrimaryPool = 16;
-    auto constexpr blocksInSecondaryPool = 0;
-
-    auto constexpr onboardBlocks = true;
-
-    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};
-
-    return KVCacheManager(std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
-        maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
-        nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks,
-        CacheType::kSELF, std::nullopt, nullptr,
-        /*enableHashKey*/ true);
-}
-
-std::vector<size_t> getHashAndRetrieveBlocksByHashTest(
-    BlockManager const& blockManager, std::vector<KVCacheBlock::IdType> const& blockIds, SizeType32 windowSize)
-{
-    std::vector<size_t> blockHashes;
-    for (auto blockId : blockIds)
-    {
-        blockHashes.emplace_back(blockManager.getBlockById(blockId, windowSize)->getHash());
-    }
-    std::vector<BlockPtr> blockPtrs;
-    for (auto hash : blockHashes)
-    {
-        auto range = blockManager.getBlocksByHash(hash, windowSize);
-        BlockPtr const prevBlock = blockPtrs.empty() ? nullptr : blockPtrs.back();
-        BlockPtr thisBlock = nullptr;
-        for (auto it = range.first; it != range.second; ++it)
-        {
-            if (it->second->getPrevBlockInSeq() == prevBlock)
-            {
-                thisBlock = it->second;
-                break;
-            }
-        }
-        EXPECT_NE(thisBlock, nullptr);
-        blockPtrs.emplace_back(thisBlock);
-    }
-    EXPECT_EQ(blockHashes.size(), blockPtrs.size());
-    for (size_t i = 0; i < blockHashes.size(); i++)
-    {
-        EXPECT_EQ(blockManager.getBlockById(blockIds[i], windowSize), blockPtrs[i]);
-    }
-    return blockHashes;
-}
-} // namespace
-
-TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyTest)
-{
-    auto kvCacheManager = setupKvCacheManagerForHashTest(false);
-
-    auto const& blockManager = kvCacheManager.getBlockManager();
-
-    SizeType32 constexpr maxNewTokens = 4;
-
-    // prepare tokens with token[i] = 1000 + i
-    TokenIdType constexpr firstToken = 1000;
-
-    auto constexpr beamWidth = 1;
-    tr::SamplingConfig const samplingConfig{beamWidth};
-    bool constexpr isStreaming{false};
-
-    SizeType32 requestId = 0;
-    int inputLength = 16;
-    auto inputTokens = std::make_shared<VecTokens>(inputLength);
-    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
-    auto llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    auto constexpr beamIdx = 0;
-
-    ///////////////////////////////////////////////////////////////////////////
-    // add a request and then remove it without reuse
-    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
-    GenerationRequest const& seq = kvCacheManager.getSequence(requestId);
-    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
-
-    auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager);
-
-    auto& blockIds = seq.getCacheBlockIds(onlyWindowSize).at(beamIdx);
-    EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2, 3}));
-
-    // get blocks by hash and try to retrieve them by hash
-    auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds, onlyWindowSize);
-
-    EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
-
-    // blocks are all removed
-    for (auto hash : blockHashes)
-    {
-        auto range = blockManager.getBlocksByHash(hash, onlyWindowSize);
-        EXPECT_EQ(range.first, range.second);
-    }
-    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
-}
-
-TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest)
-{
-    auto kvCacheManager = setupKvCacheManagerForHashTest(true);
-
-    auto const& blockManager = kvCacheManager.getBlockManager();
-
-    SizeType32 constexpr maxNewTokens = 4;
-
-    // prepare tokens with token[i] = 1000 + i
-    TokenIdType constexpr firstToken = 1000;
-
-    auto constexpr beamWidth = 1;
-    tr::SamplingConfig const samplingConfig{beamWidth};
-    bool constexpr isStreaming{false};
-
-    SizeType32 requestId = 0;
-    int inputLength = 16;
-    auto inputTokens = std::make_shared<VecTokens>(inputLength);
-    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
-    auto llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    auto constexpr beamIdx = 0;
-
-    ///////////////////////////////////////////////////////////////////////////
-    // add a request and then remove it with reuse
-    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
-    GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId);
-    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
-
-    EXPECT_EQ(blockManager.getNumPools(), 1);
-    auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager);
-
-    auto& blockIds0 = seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx);
-    EXPECT_THAT(blockIds0, ::testing::ElementsAreArray({0, 1, 2, 3}));
-
-    // get blocks by hash and try to retrieve them by hash
-    auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds0, onlyWindowSize);
-
-    EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
-
-    // TODO: Make reused blocks accessible by hash, after sequence removed. Test here.
-
-    ///////////////////////////////////////////////////////////////////////////
-    // add a new request with same prefix
-    requestId = 1;
-    inputLength = 20;
-    inputTokens->resize(inputLength);
-    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
-    llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
-    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
-    GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId);
-    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 15);
-    auto& blockIds1 = seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx);
-    EXPECT_THAT(blockIds1, ::testing::ElementsAreArray({0, 1, 2, 3, 4}));
-
-    std::ignore = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds1, onlyWindowSize);
-
-    // blocks are reused, so reused blocks are still accessible by previous hashes
-    for (size_t i = 0; i < 4; i++)
-    {
-        auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize);
-        EXPECT_NE(range.first, range.second);
-    }
-    // evicted block is not accessible
-    {
-        size_t i = 4;
-        auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize);
-        EXPECT_EQ(range.first, range.second);
-    }
-    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 5);
-}
-
 TEST_F(KVCacheManagerTest, KVCacheManagerEventStream)
 {
     auto constexpr numLayers = 12;