@@ -75,7 +75,6 @@ BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmReques
 bool CacheFormatter::needSendCache(
     CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx)
 {
-    // int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism;
     auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
     if (targetInfo.mDupHeadFactor <= 1)
     {
@@ -91,12 +90,17 @@ bool CacheFormatter::needSendCache(
         selfTpRankInDpGroup = selfTpRank % selfTPNumInDPGroup;
     }
 
+    // Only TP ranks with rank % dupHeadFactor == 0 need to send the cache.
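+    // Illustrative example (not part of the original change): with selfTPNumInDPGroup = 4 and
+    // mDupHeadFactor = 2, TP ranks 0 and 2 in the DP group send their cache, while ranks 1 and 3
+    // skip sending because they hold duplicate copies of the same KV heads.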
     return selfTpRankInDpGroup % targetInfo.mDupHeadFactor == 0;
 }
 
 void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::CacheState const& selfConfig,
     BaseCacheFormatter::CacheState const& destConfig)
 {
+    // TODO: VSWA does not support an uneven number of layers per PP rank.
+    // If the gen PP size and the context PP size differ, the cache formatter only supports an alternating
+    // window pattern like gpt-oss, where one layer uses SWA and the next uses full attention.
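+    // Illustrative example (not part of the original change): in a gpt-oss style layout the per-layer
+    // attention windows alternate, e.g. layer 0 = SWA, layer 1 = full attention, layer 2 = SWA, and so on,
+    // so every PP rank sees the same mix of window sizes.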
+
     auto numPools = cacheManager->getBlockManager().getNumPools();
     auto layerNum = cacheManager->getBlockManager().getNumLayers();
 
@@ -163,6 +167,7 @@ void CacheFormatter::format(TransferSession& session)
     auto const& destConfig = session.getOtherState().getCacheState().value();
     auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
     auto& bufferManager = session.getBufferManager();
+    // Some TP ranks don't need to send the cache, since their duplicated KV heads are not needed.
     if (!needSendCache(selfConfig, destConfig, selfIdx))
     {
         return;
@@ -214,7 +219,7 @@ void CacheFormatter::format(TransferSession& session)
     int blockNum = 0;
 
     size_t allCacheBlockSize = 0;
-
+    // Gather the cache blocks of the request.
     std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> inputKvCacheBlocks;
     for (auto poolIdx = 0; poolIdx < numPools; poolIdx++)
     {
@@ -224,6 +229,7 @@ void CacheFormatter::format(TransferSession& session)
224229 " window size already exists, which is not supported" );
225230 inputKvCacheBlocks.emplace (window, std::vector<runtime::ITensor::SharedPtr>());
226231 auto maxBlockThisWindow = window / selfConfig.getModelConfig ().mTokensPerBlock ;
+        // Only blocks that fall within the window will be sent.
         SizeType32 blockNumThisWindow = 0;
         for (auto it = blockRange.begin(); it != blockRange.end(); ++it)
         {
@@ -278,6 +284,14 @@ void CacheFormatter::format(TransferSession& session)
         return;
     }
 
+    // Formatter flow:
+    // 1. Gather the cache blocks of the request.
+    // 2. Compute the buffer size for each target.
+    // 3. Prepare the pre-allocated buffer for each target according to that buffer size.
+    // 4. Call splitKVCacheDispatch to split the cache blocks according to the different parallelism and
+    //    gather the cache blocks into the corresponding buffer.
+    // 5. Send the buffer to the corresponding target. Ideally, we send only once (one buffer) per target.
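+    // Illustrative example (not part of the original change): going from context TP = 1 to gen TP = 2,
+    // step 4 splits each block's KV heads into two halves so that each gen rank can be sent one
+    // contiguous buffer in step 5.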
+
     auto cacheBufferId = mCacheTransBufferManager->assignBufferIndexForSend();
     int peerDuplicateHeadFactor = targetInfo.mPeerDupHeadFactor;
     auto targetNum = connections.size();
@@ -286,7 +300,7 @@ void CacheFormatter::format(TransferSession& session)
     int selfAttentionLayerNum
         = selfConfig.getParallelConfig()
               .mAttentionLayerNumPerPP[selfIdx / selfConfig.getParallelConfig().mTensorParallelism];
-
+    // Since the number of layers per PP rank may differ, we need to compute the buffer size for each target.
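+    // Illustrative example (not part of the original change): if one gen PP target covers 20 of this
+    // rank's attention layers and another covers 12, their buffers are sized proportionally rather
+    // than uniformly.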
     auto getBufferSizeForTarget = [&]()
     {
         std::vector<size_t> bufferSizeForTarget(targetNum, 0);
@@ -419,7 +433,7 @@ void CacheFormatter::format(TransferSession& session)
         }
         else
         {
-            // concurrency num
+            // The concurrency count should be <= bufferCoverTargetNum to avoid data races.
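+            // Illustrative example (not part of the original change): with bufferCoverTargetNum = 2 and
+            // 8 connections, at most 2 sends are in flight at a time, so no pre-allocated buffer is
+            // reused while it is still being transferred.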
             auto concurrencyNum
                 = std::min(std::max(static_cast<size_t>(1), bufferCoverTargetNum), connections.size());
 
@@ -505,6 +519,7 @@ void CacheFormatter::unformat(TransferSession& session)
     TLLM_CHECK(!outputBuffersPerWindow.empty());
     if (outputBuffersPerWindow.size() > 1)
     {
+        // We only support a limited set of cases for VSWA.
         if (selfConfig.getParallelConfig().mPipelineParallelism != destConfig.getParallelConfig().mPipelineParallelism)
         {
             checkAlternateWindow(mCacheManager, selfConfig, destConfig);
@@ -603,6 +618,13 @@ void CacheFormatter::unformat(TransferSession& session)
             ctxReqId);
         return;
     }
+    // Unformat flow:
+    // 1. Gather the cache blocks of the request.
+    // 2. Compute the buffer size for each target.
+    // 3. Prepare the pre-allocated buffer for each target according to that buffer size.
+    // 4. Receive the buffer from the corresponding target. Ideally, we receive only once (one buffer)
+    //    per target.
+    // 5. Call concatKvCacheV2Dispatch to concatenate the cache blocks according to the different parallelism.
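+    // Illustrative example (not part of the original change): this mirrors the send path in format().
+    // With context TP = 1 and gen TP = 2, each gen rank receives one buffer and step 5 writes its half
+    // of the KV heads into the local blocks.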
 
     runtime::ITensor::SharedPtr recvBufferTemp;
     std::vector<runtime::ITensor::SharedPtr> recvSplitCaches;
@@ -615,7 +637,7 @@ void CacheFormatter::unformat(TransferSession& session)
     int selfAttentionLayerNum
         = selfConfig.getParallelConfig()
               .mAttentionLayerNumPerPP[selfIdx / selfConfig.getParallelConfig().mTensorParallelism];
-    auto getTargetBufferEleSzie = [&]()
+    auto getTargetBufferEleSize = [&]()
     {
         if (outputBuffersPerWindow.size() > 1)
         {
@@ -627,14 +649,17 @@ void CacheFormatter::unformat(TransferSession& session)
             // TODO: LayerNumbufferTargetNum for VWSA
             return std::make_pair(bufferSizeForTarget, std::vector<SizeType32>(targetNum, 0));
         }
-        size_t valideTpSize = pickUpConnections.size() / targetInfo.mDomainPPSize;
-        TLLM_CHECK_WITH_INFO(cacheBlockSizeSum % valideTpSize == 0,
-            "cacheBlockSizeSum must be divisible by valideTpSize %ld", valideTpSize);
-        TLLM_CHECK_WITH_INFO((cacheBlockSizeSum % (selfAttentionLayerNum * valideTpSize)) == 0,
-            "cacheBlockSizeSum must be divisible by valideTpSize %ld * selfAttentionLayerNum %d", valideTpSize,
+        // For duplicated heads, gen will not receive from TP ranks that hold the duplicate heads, and
+        // will not prepare buffers for them.
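+        // Illustrative example (not part of the original change): with a context-side dupHeadFactor of 2,
+        // only every other context TP rank sends, so pickUpConnections holds half of the context TP ranks
+        // in each PP domain.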
+        size_t validTpSize = pickUpConnections.size() / targetInfo.mDomainPPSize;
+        TLLM_CHECK_WITH_INFO(cacheBlockSizeSum % validTpSize == 0,
+            "cacheBlockSizeSum must be divisible by validTpSize %ld", validTpSize);
+        TLLM_CHECK_WITH_INFO((cacheBlockSizeSum % (selfAttentionLayerNum * validTpSize)) == 0,
+            "cacheBlockSizeSum must be divisible by validTpSize %ld * selfAttentionLayerNum %d", validTpSize,
             selfAttentionLayerNum);
         TLLM_CHECK(targetNum == pickUpConnections.size());
-        size_t baseEleSize = cacheBlockSizeSum / (valideTpSize * selfAttentionLayerNum);
+        // The per-target buffer sizes sum to cacheBlockSizeSum.
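+        // Illustrative example (not part of the original change): cacheBlockSizeSum = 1920 elements,
+        // validTpSize = 2, and selfAttentionLayerNum = 16 give
+        // baseEleSize = 1920 / (2 * 16) = 60 elements per layer per TP slice.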
+        size_t baseEleSize = cacheBlockSizeSum / (validTpSize * selfAttentionLayerNum);
 
         std::vector<size_t> bufferEleSizes(targetNum, 0);
         std::vector<SizeType32> LayerNumbufferTargetNum(targetNum, 0);
@@ -647,7 +672,7 @@ void CacheFormatter::unformat(TransferSession& session)
         }
         return std::make_pair(bufferEleSizes, LayerNumbufferTargetNum);
     };
-    auto [bufferEleSizes, LayerNumbufferTargetNum] = getTargetBufferEleSzie();
+    auto [bufferEleSizes, LayerNumbufferTargetNum] = getTargetBufferEleSize();
 
     size_t remainNoCoverTargetNum = 0;
     size_t bufferCoverTargetNum = 0;