
Commit f1a4443

Funatiq authored and evezhier committed
[None] [refactor] Minor cleanup and improvements (NVIDIA#7619)
Signed-off-by: Robin Kobus <[email protected]>
1 parent 2569453 commit f1a4443

9 files changed: +38 -49 lines changed


cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h

Lines changed: 4 additions & 13 deletions
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,19 +20,15 @@
 #include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/common/algorithm.h"
 #include "tensorrt_llm/common/optionalRef.h"
-#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 namespace tensorrt_llm::runtime
 {
-class DecodingInput;
-class DecodingOutput;
-class GptDecoderBatched;
 class SamplingConfig;
-class SpeculativeDecodingMode;
 
 namespace decoder
 {
@@ -56,10 +52,6 @@ class CreateNewDecoderRequests : Algorithm
     using CudaStream = tensorrt_llm::runtime::CudaStream;
     using TensorPtr = runtime::ITensor::SharedPtr;
     using SharedConstPtr = runtime::ITensor::SharedConstPtr;
-    using DecodingInput = runtime::DecodingInput;
-    using DecodingOutput = runtime::DecodingOutput;
-    using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
-    using GptDecoderBatched = runtime::GptDecoderBatched;
     template <typename T>
     using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
 
@@ -70,16 +62,15 @@ class CreateNewDecoderRequests : Algorithm
     {
     }
 
-    std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
+    [[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
         std::vector<executor::LookaheadDecodingConfig>>
     operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
         nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
         CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
         SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;
 
-    [[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
-        std::vector<executor::LookaheadDecodingConfig>>
+    [[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
     createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
         executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
         nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
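
Note on the [[nodiscard]] additions above: the attribute turns a silently dropped return value into a compiler warning. A minimal sketch (not from this PR; the names are illustrative):

    #include <tuple>
    #include <vector>

    [[nodiscard]] std::tuple<int, std::vector<float>> makeBatch()
    {
        return {42, {1.0F, 2.0F}};
    }

    void caller()
    {
        makeBatch();                              // warning: ignoring return value declared 'nodiscard'
        auto const [count, logits] = makeBatch(); // fine: the result is consumed
        static_cast<void>(count);
        static_cast<void>(logits);
    }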

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 10 additions & 9 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include <cassert>
 #include <chrono>
 #include <cstdint>
+#include <cstring>
+#include <list>
 #include <memory>
 #include <optional>
 #include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
     /// used in layer-wise transmission
     kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
     kGENERATION_IN_PROGRESS = 13,           ///< Generation phase is in progress
-    kGENERATION_TO_COMPLETE = 14,           ///< Generation phase is to be completed
 
     // schedulable states ends
+    kGENERATION_TO_COMPLETE = 14,           ///< Generation phase is to be completed
     kGENERATION_COMPLETE = 20,              ///< Generation phase completed
     kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
     /// after computation finished
@@ -1075,7 +1077,6 @@ class GenericLlmRequest
         TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
             "Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
             promptLen, mRequestId);
-        TLLM_CHECK(prepopulatedPromptLen < promptLen);
 
         auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
         auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1116,9 +1117,9 @@ class GenericLlmRequest
         mDraftLogits = draftLogits;
     }
 
-    [[nodiscard]] SizeType32 getNumDraftTokens() const
+    [[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
     {
-        return hasDraftTokens() ? mDraftTokens->size() : 0;
+        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
     }
 
     void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1379,17 +1380,17 @@ class GenericLlmRequest
         mGenerationLogitsFragments.push_back(genLogits);
     }
 
-    SizeType32 getGenerationLogitsFragmentsSize()
+    [[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
    {
-        return mGenerationLogitsFragments.size();
+        return static_cast<SizeType32>(mGenerationLogitsFragments.size());
    }
 
-    void clearGenerationLogitsFragments()
+    void clearGenerationLogitsFragments() noexcept
    {
        mGenerationLogitsFragments.clear();
    }
 
-    bool hasAdditionalOutputs()
+    [[nodiscard]] bool hasAdditionalOutputs() const noexcept
    {
        return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
    }
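
The getNumDraftTokens() change above adds an explicit narrowing conversion: container sizes are std::size_t, while the accessor returns a 32-bit SizeType32. A minimal sketch, assuming SizeType32 is a 32-bit signed integer as in the runtime headers:

    #include <cstdint>
    #include <vector>

    using SizeType32 = std::int32_t;

    // The static_cast documents the size_t -> SizeType32 narrowing and silences
    // conversion warnings; noexcept records that the query cannot throw.
    [[nodiscard]] SizeType32 numDraftTokens(std::vector<std::int32_t> const& draftTokens) noexcept
    {
        return static_cast<SizeType32>(draftTokens.size());
    }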

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 2 additions & 1 deletion
@@ -1478,7 +1478,8 @@ class CacheTransceiverConfig
 class ExecutorConfig
 {
 public:
-    static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
+    static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
+        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();
 
     static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;
 
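
The new kDefaultMaxSeqIdleMicroseconds expression spells out the intent (3 minutes) instead of a magic number, and it still evaluates to the same value. A quick illustrative check:

    #include <chrono>
    #include <cstdint>

    static constexpr std::uint64_t kDefaultMaxSeqIdleMicroseconds
        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

    // 3 min * 60 s/min * 1'000'000 us/s = 180'000'000 us, matching the old literal.
    static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000);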

cpp/include/tensorrt_llm/runtime/lookaheadModule.h

Lines changed: 1 addition & 3 deletions
@@ -19,7 +19,6 @@
 #include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/speculativeDecodingModule.h"
-#include <memory>
 
 namespace tensorrt_llm::runtime
 {
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
 public:
     explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
         : SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
-        , mExecutionConfig()
     {
     }
 
@@ -43,7 +41,7 @@ class LookaheadModule : public SpeculativeDecodingModule
         mExecutionConfig = config;
     }
 
-    executor::LookaheadDecodingConfig const getExecutionConfig() const
+    [[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
     {
         return mExecutionConfig;
     }
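
The getExecutionConfig() change above returns the stored config by const reference rather than by value; the old signature's top-level const on the returned copy had no practical effect and forced a copy on every call. A minimal sketch with an illustrative Config type:

    struct Config
    {
        int windowSize{};
        int ngramSize{};
    };

    class Module
    {
    public:
        // Returns a reference to the member; no copy is made at the call site.
        [[nodiscard]] Config const& getConfig() const noexcept
        {
            return mConfig;
        }

    private:
        Config mConfig{};
    };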

cpp/include/tensorrt_llm/runtime/modelConfig.h

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@
 #include "tensorrt_llm/runtime/lookaheadModule.h"
 #include "tensorrt_llm/runtime/loraModule.h"
 #include "tensorrt_llm/runtime/speculativeDecodingMode.h"
+#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
 
 #include <NvInferRuntime.h>
 #include <array>

cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp

Lines changed: 0 additions & 1 deletion
@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;
 
 namespace tc = tensorrt_llm::common;
 namespace te = tensorrt_llm::executor;
-namespace tk = tensorrt_llm::kernels;
 namespace tr = tensorrt_llm::runtime;
 
 namespace tensorrt_llm::batch_manager

cpp/tensorrt_llm/runtime/bufferView.h

Lines changed: 2 additions & 2 deletions
@@ -39,8 +39,8 @@ class BufferView : virtual public IBuffer
 
         if (offset + size > mBuffer->getSize())
         {
-            throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
-                + std::to_string(mBuffer->getSize()));
+            throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+                + std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
         }
     }
 
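
With the bufferView.h change above, the out_of_range message reports both operands instead of only their sum, e.g. "offset 8 + size 8 exceeds buffer size 12" rather than "slice 16 exceeds buffer size 12". A standalone sketch of the check (illustrative function name):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    void checkSliceBounds(std::size_t offset, std::size_t size, std::size_t bufferSize)
    {
        if (offset + size > bufferSize)
        {
            throw std::out_of_range(std::string("offset ") + std::to_string(offset) + " + size "
                + std::to_string(size) + " exceeds buffer size " + std::to_string(bufferSize));
        }
    }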

cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp

Lines changed: 17 additions & 19 deletions
@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
 }
 
 // Pick a different endId at random from one of the expected tokens
-std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
-    std::vector<SizeType32> const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
+std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, std::vector<SizeType32> const& givenInputLengths,
+    SizeType32 const maxNewTokens, bool replaceLogits)
 {
     auto const nbGivenInputs = testData.nbGivenInputs;
     auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelT
     return endIds;
 }
 
-TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
-    BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
-    bool const replaceLogits, BufferManager& manager)
+TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
+    ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
+    BufferManager& manager)
 {
     auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
     auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelTy
 
     if (useRandomEndId)
     {
-        testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
+        testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
     }
     else
     {
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelTy
 }
 
 std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> loadTestData(ModelSpec const& modelSpec,
-    TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
-    ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
-    BufferManager& manager)
+    ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
+    SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
 {
     // Map between beam width, and expected results for that beam width
     std::unordered_map<SizeType32, TestData> beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
         EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
         beamWidths.push_back(beamWidth);
 
-        auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
-            useRandomEndId, replaceLogits, manager);
+        auto testData = loadTestData(
+            modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
         beamWidthTestData.emplace(beamWidth, std::move(testData));
     }
 
@@ -435,9 +434,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
 RequestList runGptModelInference(std::shared_ptr<TrtGptModel>& trtGptModel, std::vector<SizeType32> const& beamWidths,
     std::unordered_map<SizeType32, TestData> const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
     SizeType32 maxInputLength, SizeType32 padId, std::vector<SizeType32> const& givenInputLengths,
-    TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
-    TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
-    bool enableBlockReuse)
+    TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
+    bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
 {
     // Fill the requests using givenInput
     // requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
 
     auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
     // Load expected outputs for each beam width value
-    auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
-        *givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
+    auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
+        maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
 
     int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
     auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
     // Prepopulate KV cache for speculative decoding test
     bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
     auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
-        nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
-        maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
+        nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
+        prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
 
     if (prepopulateKVCache)
     {
         // Call the 2nd time with prefilled KV cache
         finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
-            nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
+            nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
             maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
     }
 
cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
     EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
     EXPECT_FALSE(llmReq.mSeqSlot);
     // No speculative decoding config, draft tokens should be empty
-    EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
+    EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
     EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
     EXPECT_FALSE(llmReq.getBadWordsList().has_value());
     EXPECT_FALSE(llmReq.getStopWordsList().has_value());
