@@ -269,16 +269,29 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
         = [](ModelConfig const& modelConfig, WorldConfig const& worldConfig,
             std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor)
     {
-        auto [numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd] = modelConfig.getNumKvHeadsPerLayerLocalRange(
-            worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank(), isCrossAttention);
-        auto numKvHeadsPerLayer = std::vector<SizeType32>(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd);
-        auto windowSizeLayers
-            = BaseKVCacheManager::groupLayersByWindowSize(maxAttentionWindowVec, modelConfig.getNbLayers());
+        // The number of attention layers on this PP rank.
+        auto const numLocalAttnLayers = modelConfig.getNbAttentionLayers(
+            worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
+        // The number of attention layers on all preceding PP ranks.
+        auto const numLowerRankAttnLayers = modelConfig.countLowerRankLayers(ModelConfig::LayerType::kATTENTION,
+            worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
+        // Use global attention-layer indices to look up window sizes in maxAttentionWindowVec.
+        auto const startAttnLayerId = numLowerRankAttnLayers;
+        auto const endAttnLayerId = numLowerRankAttnLayers + numLocalAttnLayers;
+        auto const numNonUniqueWindowSizes = static_cast<SizeType32>(maxAttentionWindowVec.size());
+        std::map<SizeType32, std::vector<SizeType32>> uniqueWindowSizeToLayers;
+        for (SizeType32 layerIdx = startAttnLayerId; layerIdx < endAttnLayerId; layerIdx++)
+        {
+            // maxAttentionWindowVec may or may not have been stretched to numLayers entries yet.
+            // If it has not, cycle through the window sizes.
+            auto const windowSize = maxAttentionWindowVec.at(layerIdx % numNonUniqueWindowSizes);
+            uniqueWindowSizeToLayers[windowSize].push_back(layerIdx);
+        }
         std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow;
-        for (auto const& [windowSize, managedLayers] : windowSizeLayers)
+        for (auto const& [windowSize, globalLayerIds] : uniqueWindowSizeToLayers)
         {
             auto const cacheSizePerToken = BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize(
-                modelConfig, managedLayers, isCrossAttention, kvFactor);
+                modelConfig, globalLayerIds, isCrossAttention, kvFactor);
             auto const cacheSizeBytesPerToken
                 = cacheSizePerToken * BufferDataType(modelConfig.getKvDataType()).getSize();
             cacheSizeBytesPerTokenPerWindow[windowSize] = cacheSizeBytesPerToken;
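
Note: a minimal standalone sketch of the new grouping step, using made-up values (2 PP ranks, 6 attention layers, and a 2-entry maxAttentionWindowVec that has not been stretched to numLayers yet). It shows how the modulo lookup cycles the window sizes over this rank's global attention-layer indices, which is the behavior the loop above relies on.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

using SizeType32 = std::int32_t;

int main()
{
    // Assumed setup: 6 attention layers split over 2 PP ranks; this is rank 1.
    SizeType32 const numLowerRankAttnLayers = 3; // layers 0..2 live on rank 0
    SizeType32 const numLocalAttnLayers = 3;     // layers 3..5 live here
    // Not yet stretched to numLayers entries, so it gets cycled.
    std::vector<SizeType32> const maxAttentionWindowVec{4096, 512};

    auto const numNonUniqueWindowSizes = static_cast<SizeType32>(maxAttentionWindowVec.size());
    std::map<SizeType32, std::vector<SizeType32>> uniqueWindowSizeToLayers;
    for (SizeType32 layerIdx = numLowerRankAttnLayers;
         layerIdx < numLowerRankAttnLayers + numLocalAttnLayers; layerIdx++)
    {
        auto const windowSize = maxAttentionWindowVec.at(layerIdx % numNonUniqueWindowSizes);
        uniqueWindowSizeToLayers[windowSize].push_back(layerIdx);
    }

    // Prints: 512 -> 3 5, then 4096 -> 4 (global layer ids, grouped by window size).
    for (auto const& [windowSize, layerIds] : uniqueWindowSizeToLayers)
    {
        std::cout << windowSize << " ->";
        for (auto const id : layerIds)
        {
            std::cout << ' ' << id;
        }
        std::cout << '\n';
    }
    return 0;
}
```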
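And a sketch of the per-window bytes-per-token conversion in the second loop. The element-count formula here (kvFactor covering the K and V entries per token per layer) is an assumption for illustration, standing in for calculateCacheSizePerTokenForSingleWindowSize; the numKvHeads, head size, and dtype values are hypothetical.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using SizeType32 = std::int32_t;

// Hypothetical stand-in for the per-window size computation: cache elements
// per token, summed over the layers grouped under one window size.
SizeType32 elementsPerToken(
    std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 kvFactor)
{
    SizeType32 total = 0;
    for (auto const numKvHeads : numKvHeadsPerLayer)
    {
        total += kvFactor * numKvHeads * sizePerHead;
    }
    return total;
}

int main()
{
    // Two layers in this window-size group, 8 KV heads each, head size 128,
    // kvFactor = 2 (one K and one V entry per token), FP16 cache (2 bytes).
    auto const perToken = elementsPerToken({8, 8}, 128, 2);
    std::size_t const dtypeBytes = 2;
    std::cout << "bytes per token: " << perToken * dtypeBytes << '\n'; // 8192
    return 0;
}
```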