@@ -504,8 +504,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
504504    std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
505505    SizeType32 sinkBubbleLength, bool  onboardBlocks, CacheType cacheType,
506506    std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
507-     std::shared_ptr<KVCacheEventManager> eventManager, bool  enableHashKey, bool  enablePartialReuse,
508-     bool  copyOnPartialReuse)
507+     std::shared_ptr<KVCacheEventManager> eventManager, bool  enablePartialReuse, bool  copyOnPartialReuse)
509508    : mNumLayers {static_cast <SizeType32>(numKvHeadsPerLayer.size ())}
510509    , mTokensPerBlock {tokensPerBlock}
511510    , mEventManager {std::move (eventManager)}
@@ -530,7 +529,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
530529        TLLM_CHECK (allottedPrimaryBlocks > 0 ); //  You can't have a model with negative primary blocks...
531530        mWindowBlockManagers .try_emplace (windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
532531            sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
533-             onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enableHashKey,  enablePartialReuse,
532+             onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enablePartialReuse,
534533            copyOnPartialReuse);
535534    }
536535
@@ -573,8 +572,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
573572    SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
574573    SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool  onboardBlocks, CacheType cacheType,
575574    std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
576-     std::shared_ptr<KVCacheEventManager> eventManager, bool  enableHashKey, bool  enablePartialReuse,
577-     bool  copyOnPartialReuse)
575+     std::shared_ptr<KVCacheEventManager> eventManager, bool  enablePartialReuse, bool  copyOnPartialReuse)
578576    : mDataType {dtype}
579577    , mWindowSize {windowSize}
580578    , mNumPrimaryBlocks {blocksInPrimaryPool}
@@ -596,7 +594,6 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
596594    , mLogPrefix {tensorrt_llm::common::fmtstr (" BlockManager[windowSize=%u]" mWindowSize )}
597595    , mReusedTokens {0.0 }
598596    , mTotalInputTokens {0.0 }
599-     , mEnableHashKey {enableHashKey}
600597    , mEnablePartialReuse {enablePartialReuse}
601598    , mCopyOnPartialReuse {copyOnPartialReuse}
602599{
@@ -920,50 +917,6 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const
920917    mWindowBlockManagers .at (windowSize).setOffsets (offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId);
921918}
922919
923- void  WindowBlockManager::addBlockToHashMap (BlockPtr const & block)
924- {
925-     if  (!mEnableHashKey )
926-     {
927-         return ;
928-     }
929-     auto  range = mContextBlocksByHash .equal_range (block->getHash ());
930-     for  (auto  it = range.first ; it != range.second ; ++it)
931-     {
932-         if  (it->second  == block)
933-         {
934-             //  TODO: change to assert when reused block is added only once
935-             TLLM_LOG_TRACE (
936-                 " Block %d by %zx exists" getBlockId (), block->getHash (), mContextBlocksByHash .size ());
937-             return ;
938-         }
939-     }
940-     TLLM_LOG_TRACE (
941-         " Add block %d by %zx, block n = %zu" getBlockId (), block->getHash (), mContextBlocksByHash .size ());
942-     mContextBlocksByHash .emplace (block->getHash (), std::move (block));
943- }
944- 
945- void  WindowBlockManager::removeBlockFromHashMap (BlockPtr const & block)
946- {
947-     if  (mContextBlocksByHash .empty () || block->getBlockKey ().uniqueTokens .empty ())
948-     {
949-         //  Hash key not enabled / Empty block
950-         return ;
951-     }
952-     auto  range = mContextBlocksByHash .equal_range (block->getHash ());
953-     TLLM_LOG_TRACE (
954-         " Remove block %d by %zx, block n = %zu" getBlockId (), block->getHash (), mContextBlocksByHash .size ());
955-     for  (auto  it = range.first ; it != range.second ; ++it)
956-     {
957-         if  (it->second  == block)
958-         {
959-             mContextBlocksByHash .erase (it);
960-             return ;
961-         }
962-     }
963-     //  TODO: should be unreachable
964-     TLLM_LOG_DEBUG (" Trying to remove block %d by %zx that is not in hash map" getBlockId (), block->getHash ());
965- }
966- 
967920void  BlockManager::onboardBlock (BlockPtr const & offloadBlock, SizeType32 windowSize)
968921{
969922    mWindowBlockManagers .at (windowSize).onboardBlock (offloadBlock);
@@ -1104,7 +1057,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11041057                        matchingBlock, perBlockRetentions[bi].retentionPriority , perBlockRetentions[bi].durationMs );
11051058                    TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Reused partially filled block %d" mLogPrefix .c_str (),
11061059                        matchingBlockId);
1107-                     addBlockToHashMap (matchingBlock);
11081060                }
11091061                searchRoot = nullptr ; //  no matching needed for following blocks
11101062            }
@@ -1114,7 +1066,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11141066                mEvictionPolicy ->claimBlock (
11151067                    matchingBlock, perBlockRetentions[bi].retentionPriority , perBlockRetentions[bi].durationMs );
11161068                TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Matched full block %d" mLogPrefix .c_str (), matchingBlockId);
1117-                 addBlockToHashMap (matchingBlock);
11181069                searchRoot = matchingBlock;
11191070            }
11201071            onboardBlock (matchingBlock);
@@ -1145,7 +1096,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11451096                ++blockItr;
11461097            }
11471098            freeBlock->setHash ();
1148-             addBlockToHashMap (freeBlock);
11491099            ++mMissedBlocks ;
11501100        }
11511101    }
@@ -1169,7 +1119,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11691119                ++blockItr;
11701120            }
11711121            freeBlock->setHash ();
1172-             addBlockToHashMap (freeBlock);
11731122            TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d" 
11741123                mLogPrefix .c_str (), beamIdx, freeBlock->getBlockId (), bi);
11751124        }
@@ -1369,9 +1318,7 @@ void WindowBlockManager::storeBlocks(
13691318            if  (oldHash != newHash)
13701319            {
13711320                TLLM_LOG_DEBUG (" #%d block hash %zx -> %zx" getBlockId (), oldHash, newHash);
1372-                 removeBlockFromHashMap (block);
13731321                block->setHash (newHash);
1374-                 addBlockToHashMap (block);
13751322            }
13761323            searchRoot = block;
13771324        }
@@ -1408,7 +1355,6 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp
14081355        if  (!block->hasRefs ())
14091356        {
14101357            mEvictionPolicy ->releaseBlock (block);
1411-             removeBlockFromHashMap (block);
14121358        }
14131359    }
14141360
@@ -1473,7 +1419,6 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
14731419    if  (!block->hasRefs ())
14741420    {
14751421        mEvictionPolicy ->releaseBlock (block, true );
1476-         removeBlockFromHashMap (block);
14771422    }
14781423    //  Remove block from allocated blocks
14791424    allocatedBlocks.pop_back ();
@@ -1616,7 +1561,6 @@ void WindowBlockManager::releaseBlocks(GenerationRequest& sequence)
16161561        if  (!block->hasRefs ())
16171562        {
16181563            mEvictionPolicy ->releaseBlock (block);
1619-             removeBlockFromHashMap (block);
16201564        }
16211565    }
16221566    //  Remove stored block ids in sequence
@@ -1682,8 +1626,7 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
16821626    SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
16831627    bool  enableBlockReuse, bool  onboardBlocks, CacheType cacheType,
16841628    std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1685-     std::shared_ptr<KVCacheEventManager> eventManager, bool  enableHashKey, bool  enablePartialReuse,
1686-     bool  copyOnPartialReuse)
1629+     std::shared_ptr<KVCacheEventManager> eventManager, bool  enablePartialReuse, bool  copyOnPartialReuse)
16871630    : mMaxBeamWidth (maxBeamWidth)
16881631    , mDataType (dtype)
16891632    , mMaxAttentionWindow (*std::max_element (maxAttentionWindowVec.begin(), maxAttentionWindowVec.end()))
@@ -1693,10 +1636,9 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
16931636    , mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
16941637          std::move (stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype,
16951638          mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager),
1696-           enableHashKey,  enablePartialReuse, copyOnPartialReuse)
1639+           enablePartialReuse, copyOnPartialReuse)
16971640    //  disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case
16981641    , mEnableBlockReuse{mSinkBubbleLength  > 0  ? false  : enableBlockReuse}
1699-     , mEnableHashKey {enableHashKey}
17001642{
17011643    TLLM_CHECK_DEBUG (std::find (maxAttentionWindowVec.begin (), maxAttentionWindowVec.end (), mMaxAttentionWindow )
17021644        != maxAttentionWindowVec.end ());
@@ -1716,12 +1658,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
17161658    SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
17171659    bool  enableBlockReuse, bool  onboardBlocks, CacheType cacheType,
17181660    std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1719-     std::shared_ptr<KVCacheEventManager> eventManager, bool  enableHashKey, bool  enablePartialReuse,
1720-     bool  copyOnPartialReuse)
1661+     std::shared_ptr<KVCacheEventManager> eventManager, bool  enablePartialReuse, bool  copyOnPartialReuse)
17211662    : KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
17221663        maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
17231664        std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority,
1724-         std::move(eventManager), enableHashKey,  enablePartialReuse, copyOnPartialReuse)
1665+         std::move(eventManager), enablePartialReuse, copyOnPartialReuse)
17251666{
17261667}
17271668
@@ -2085,30 +2026,6 @@ void KVCacheManager::addSequence(
20852026                    llmRequest->mRequestId );
20862027            }
20872028            mBlockManager .addSequence (sequence, numContextBlocks, unsharedBlockIdx, windowSize);
2088-             if  (mEnableHashKey  && llmRequest.has_value () && beamWidth == 1 )
2089-             {
2090-                 constexpr  SizeType32 beamIdx = 0 ;
2091-                 auto  const & blockIds = sequence.getCacheBlockIds (windowSize).at (beamIdx);
2092-                 auto  const & uniqueTokens = llmRequest->getUniqueTokens (beamIdx);
2093-                 auto  blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(
2094-                     uniqueTokens, uniqueTokens.size () - 1 , getTokensPerBlock (), true );
2095-                 auto  blockKeys = buildBlockKeys (blockedUniqueTokens, *llmRequest);
2096-                 auto  tokensPerBlock = static_cast <size_t >(getTokensPerBlock ());
2097-                 for  (size_t  i = 0 ; i < blockIds.size (); i++)
2098-                 {
2099-                     auto  const & block = mBlockManager .getBlockById (blockIds[i], windowSize);
2100-                     if  (i < blockKeys.size ())
2101-                     {
2102-                         block->setBlockKey (blockKeys[i], blockKeys[i].uniqueTokens .size () == tokensPerBlock);
2103-                     }
2104-                     else 
2105-                     {
2106-                         block->setBlockKey ({}, false );
2107-                     }
2108-                     block->setHash ();
2109-                     mBlockManager .addBlockToHashMap (block, windowSize);
2110-                 }
2111-             }
21122029        }
21132030        cacheBlockOffsets (sequence, windowSize);
21142031    }
0 commit comments