
Commit f083d44

formatting
1 parent 1a6e317 commit f083d44

3 files changed: 30 additions, 33 deletions


cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 9 additions & 9 deletions
@@ -83,18 +83,19 @@ namespace tk = tensorrt_llm::kernels;
 namespace tensorrt_llm::batch_manager
 {
 
-std::map<SizeType32, SizeType32> TrtGptModelInflightBatching::calculateCacheSizePerToken(ModelConfig const& modelConfig, WorldConfig const& worldConfig,
-    std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor)
+std::map<SizeType32, SizeType32> TrtGptModelInflightBatching::calculateCacheSizePerToken(ModelConfig const& modelConfig,
+    WorldConfig const& worldConfig, std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention,
+    SizeType32 kvFactor)
 {
     // These are the number of attention layers on this PP rank.
-    const auto numLocalAttnLayers = modelConfig.getNbAttentionLayers(
-        worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
+    auto const numLocalAttnLayers
+        = modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
     // These are the number of attention layers on all previous PP ranks.
-    const auto numLowerRankAttnLayers = modelConfig.countLowerRankLayers(ModelConfig::LayerType::kATTENTION,
+    auto const numLowerRankAttnLayers = modelConfig.countLowerRankLayers(ModelConfig::LayerType::kATTENTION,
         worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
     // Use global ranks of attention layers to lookup from maxAttentionWindowVec.
-    const auto startAttnLayerId = numLowerRankAttnLayers;
-    const auto endAttnLayerId = numLowerRankAttnLayers + numLocalAttnLayers;
+    auto const startAttnLayerId = numLowerRankAttnLayers;
+    auto const endAttnLayerId = numLowerRankAttnLayers + numLocalAttnLayers;
     auto const numNonUniqueWindowSizes = static_cast<SizeType32>(maxAttentionWindowVec.size());
     std::map<SizeType32, std::vector<SizeType32>> uniqueWindowSizeToLayers;
     for (SizeType32 layerIdx = startAttnLayerId; layerIdx < endAttnLayerId; layerIdx++)
@@ -109,8 +110,7 @@ std::map<SizeType32, SizeType32> TrtGptModelInflightBatching::calculateCacheSize
     {
         auto const cacheSizePerToken = BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize(
             modelConfig, globalLayerIds, isCrossAttention, kvFactor);
-        auto const cacheSizeBytesPerToken
-            = cacheSizePerToken * BufferDataType(modelConfig.getKvDataType()).getSize();
+        auto const cacheSizeBytesPerToken = cacheSizePerToken * BufferDataType(modelConfig.getKvDataType()).getSize();
         cacheSizeBytesPerTokenPerWindow[windowSize] = cacheSizeBytesPerToken;
     }
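
For readers skimming the diff: the function above builds a map from attention-window size to KV-cache bytes per token for the attention layers owned by the local pipeline-parallel rank. The snippet below is a simplified, standalone sketch of that grouping logic, not the TensorRT-LLM implementation; the cyclic layer-to-window assignment, the flat hiddenSizePerLayer parameter, and bytesPerElement are illustrative assumptions (the real code derives these from ModelConfig, WorldConfig, and the KV dtype):

    // Simplified, standalone sketch of the grouping logic in the hunk above.
    // NOT the TensorRT-LLM implementation: the real function reads layer counts,
    // per-layer sizes, and the KV dtype from ModelConfig/WorldConfig; here they
    // are plain integer parameters, and the cyclic layer-to-window assignment is
    // an assumption inferred from the test expectations further down.
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>

    using SizeType32 = std::int32_t;

    std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow(
        std::vector<SizeType32> const& maxAttentionWindowVec, SizeType32 numLocalAttnLayers, SizeType32 kvFactor,
        SizeType32 hiddenSizePerLayer, SizeType32 bytesPerElement)
    {
        // Group local attention layers by the window size assigned to them.
        std::map<SizeType32, std::vector<SizeType32>> uniqueWindowSizeToLayers;
        auto const numNonUniqueWindowSizes = static_cast<SizeType32>(maxAttentionWindowVec.size());
        for (SizeType32 layerIdx = 0; layerIdx < numLocalAttnLayers; layerIdx++)
        {
            auto const windowSize = maxAttentionWindowVec[layerIdx % numNonUniqueWindowSizes];
            uniqueWindowSizeToLayers[windowSize].push_back(layerIdx);
        }
        // Per window size: bytes per token = layers in group * K/V factor * hidden size * element size.
        std::map<SizeType32, SizeType32> result;
        for (auto const& [windowSize, layerIds] : uniqueWindowSizeToLayers)
        {
            auto const numLayers = static_cast<SizeType32>(layerIds.size());
            result[windowSize] = numLayers * kvFactor * hiddenSizePerLayer * bytesPerElement;
        }
        return result;
    }

    int main()
    {
        // Mirrors test case 2 in the test diff below: windows {128, 256}, 5 attention layers.
        for (auto const& [windowSize, bytesPerToken] : cacheSizeBytesPerTokenPerWindow({128, 256}, 5, 2, 512, 4))
        {
            std::cout << windowSize << " -> " << bytesPerToken << " bytes/token\n";
        }
        return 0;
    }

With those inputs it prints 128 -> 12288 and 256 -> 8192 bytes per token, matching the 3-layer and 2-layer window groups the unit test expects.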

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h

Lines changed: 3 additions & 3 deletions
@@ -148,9 +148,9 @@ class TrtGptModelInflightBatching : public TrtGptModel
 
     ~TrtGptModelInflightBatching() override;
 
-
-    [[nodiscard]] static std::map<SizeType32, SizeType32> calculateCacheSizePerToken(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
-        std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor);
+    [[nodiscard]] static std::map<SizeType32, SizeType32> calculateCacheSizePerToken(
+        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
+        std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor);
 
     void terminateRequest(LlmRequestPtr const& llmRequest, bool pause = false) override;
 

cpp/tests/unit_tests/executor/executorTestSmall.cpp

Lines changed: 18 additions & 21 deletions
@@ -204,19 +204,14 @@ INSTANTIATE_TEST_SUITE_P(Float, DecoderFloatTest, paramGenerator,
 
 // Helper function to test calculateCacheSizePerToken with given parameters.
 std::map<runtime::SizeType32, runtime::SizeType32> calculateCacheSizePerTokenHelper(
-    std::vector<runtime::SizeType32> const& maxAttentionWindowVec,
-    runtime::SizeType32 kvFactor = 2,
-    runtime::SizeType32 vocabSize = 32,
-    runtime::SizeType32 nbLayers = 4,
-    runtime::SizeType32 nbAttentionLayers = 4,
-    runtime::SizeType32 nbRnnLayers = 0,
-    runtime::SizeType32 nbHeads = 8,
-    runtime::SizeType32 hiddenSize = 512,
+    std::vector<runtime::SizeType32> const& maxAttentionWindowVec, runtime::SizeType32 kvFactor = 2,
+    runtime::SizeType32 vocabSize = 32, runtime::SizeType32 nbLayers = 4, runtime::SizeType32 nbAttentionLayers = 4,
+    runtime::SizeType32 nbRnnLayers = 0, runtime::SizeType32 nbHeads = 8, runtime::SizeType32 hiddenSize = 512,
     bool isCrossAttention = false)
 {
     // Create minimal ModelConfig for testing.
-    auto modelConfig = runtime::ModelConfig(vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers,
-        nbHeads, hiddenSize, nvinfer1::DataType::kFLOAT);
+    auto modelConfig = runtime::ModelConfig(
+        vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, nvinfer1::DataType::kFLOAT);
     modelConfig.useGptAttentionPlugin(true);
     modelConfig.setModelVariant(runtime::ModelConfig::ModelVariant::kGpt);
     modelConfig.setKVCacheType(runtime::ModelConfig::KVCacheType::kPAGED);
@@ -242,8 +237,8 @@ TEST(TrtInflightBatchingTest, CalculateCacheSizePerTokenForDisagg)
         constexpr runtime::SizeType32 nbAttentionLayers = 5;
         constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
         constexpr runtime::SizeType32 nbRnnLayers = 0;
-        auto result = calculateCacheSizePerTokenHelper(
-            maxAttentionWindowVec, kvFactor, vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
         EXPECT_EQ(result.size(), 1);
         EXPECT_EQ(result.at(128), nbAttentionLayers * kvFactor * hiddenSize * numBytesPerFloatElement);
     }
@@ -254,22 +249,23 @@ TEST(TrtInflightBatchingTest, CalculateCacheSizePerTokenForDisagg)
         constexpr runtime::SizeType32 nbAttentionLayers = 5;
         constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
         constexpr runtime::SizeType32 nbRnnLayers = 0;
-        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
         EXPECT_EQ(result.size(), 2);
-        const auto nbAttentionLayersIn128Window = 3;
-        const auto nbAttentionLayersIn256Window = 2;
+        auto const nbAttentionLayersIn128Window = 3;
+        auto const nbAttentionLayersIn256Window = 2;
         EXPECT_EQ(result.at(128), nbAttentionLayersIn128Window * kvFactor * hiddenSize * numBytesPerFloatElement);
         EXPECT_EQ(result.at(256), nbAttentionLayersIn256Window * kvFactor * hiddenSize * numBytesPerFloatElement);
     }
 
     // Test case 3: Single attention window size - attention and rnn layers.
     {
-        std::vector<runtime::SizeType32> maxAttentionWindowVec = {128}; 
+        std::vector<runtime::SizeType32> maxAttentionWindowVec = {128};
         constexpr runtime::SizeType32 nbAttentionLayers = 3;
         constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
         constexpr runtime::SizeType32 nbRnnLayers = 2;
-        auto result = calculateCacheSizePerTokenHelper(
-            maxAttentionWindowVec, kvFactor, vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
         EXPECT_EQ(result.size(), 1);
         EXPECT_EQ(result.at(128), nbAttentionLayers * kvFactor * hiddenSize * numBytesPerFloatElement);
     }
@@ -280,10 +276,11 @@ TEST(TrtInflightBatchingTest, CalculateCacheSizePerTokenForDisagg)
         constexpr runtime::SizeType32 nbAttentionLayers = 3;
         constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
         constexpr runtime::SizeType32 nbRnnLayers = 2;
-        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
         EXPECT_EQ(result.size(), 2);
-        const auto nbAttentionLayersIn128Window = 2;
-        const auto nbAttentionLayersIn256Window = 1;
+        auto const nbAttentionLayersIn128Window = 2;
+        auto const nbAttentionLayersIn256Window = 1;
         EXPECT_EQ(result.at(128), nbAttentionLayersIn128Window * kvFactor * hiddenSize * numBytesPerFloatElement);
         EXPECT_EQ(result.at(256), nbAttentionLayersIn256Window * kvFactor * hiddenSize * numBytesPerFloatElement);
     }
