diff --git a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
index 0978905b5e2..bc619a34bc0 100644
--- a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
+++ b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@
 #include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/common/algorithm.h"
 #include "tensorrt_llm/common/optionalRef.h"
-#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
@@ -28,11 +28,7 @@
 namespace tensorrt_llm::runtime
 {
-class DecodingInput;
-class DecodingOutput;
-class GptDecoderBatched;
 class SamplingConfig;
-class SpeculativeDecodingMode;
 
 namespace decoder
 {
@@ -56,10 +52,6 @@ class CreateNewDecoderRequests : Algorithm
     using CudaStream = tensorrt_llm::runtime::CudaStream;
     using TensorPtr = runtime::ITensor::SharedPtr;
     using SharedConstPtr = runtime::ITensor::SharedConstPtr;
-    using DecodingInput = runtime::DecodingInput;
-    using DecodingOutput = runtime::DecodingOutput;
-    using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
-    using GptDecoderBatched = runtime::GptDecoderBatched;
 
     template <typename T>
     using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
@@ -70,7 +62,7 @@
     {
     }
 
-    std::tuple, std::vector,
+    [[nodiscard]] std::tuple, std::vector,
         std::vector> operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
         CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
         SizeType32 beamWidth, OptionalRef medusaBuffers) const;
@@ -78,8 +70,7 @@
 
-    [[nodiscard]] std::tuple,
-        std::vector>
+    [[nodiscard]] std::tuple, std::vector>
     createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
         executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
         nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
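The header change above marks the tuple-returning operator() as [[nodiscard]]. As a rough illustration of what that buys callers, here is a minimal, self-contained sketch with hypothetical names (not the real batch_manager types): discarding the result now draws a compiler warning, while structured bindings consume it cleanly.

#include <tuple>
#include <vector>

// Illustrative sketch only; makeRequests() stands in for the [[nodiscard]] operator().
[[nodiscard]] std::tuple<std::vector<int>, std::vector<float>> makeRequests()
{
    return {{1, 2, 3}, {0.5f, 0.25f}};
}

void caller()
{
    auto [ids, configs] = makeRequests(); // consumed via structured bindings: no warning
    (void) ids;
    (void) configs;
    // makeRequests();                    // ignoring the result would now trigger a -Wunused-result style warning
}
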
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index 275bc75721a..97db8e15339 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
                                             /// used in layer-wise transmission
     kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
     kGENERATION_IN_PROGRESS = 13,           ///< Generation phase is in progress
-    kGENERATION_TO_COMPLETE = 14,           ///< Generation phase is to be completed
     // schedulable states ends
+    kGENERATION_TO_COMPLETE = 14,           ///< Generation phase is to be completed
     kGENERATION_COMPLETE = 20,              ///< Generation phase completed
     kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
                                             /// after computation finished
@@ -1074,7 +1076,6 @@ class GenericLlmRequest
         TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
             "Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
             promptLen, mRequestId);
-        TLLM_CHECK(prepopulatedPromptLen < promptLen);
 
         auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
         auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1115,9 +1116,9 @@ class GenericLlmRequest
         mDraftLogits = draftLogits;
     }
 
-    [[nodiscard]] SizeType32 getNumDraftTokens() const
+    [[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
     {
-        return hasDraftTokens() ? mDraftTokens->size() : 0;
+        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
     }
 
     void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1378,17 +1379,17 @@ class GenericLlmRequest
         mGenerationLogitsFragments.push_back(genLogits);
     }
 
-    SizeType32 getGenerationLogitsFragmentsSize()
+    [[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
    {
-        return mGenerationLogitsFragments.size();
+        return static_cast<SizeType32>(mGenerationLogitsFragments.size());
     }
 
-    void clearGenerationLogitsFragments()
+    void clearGenerationLogitsFragments() noexcept
     {
         mGenerationLogitsFragments.clear();
     }
 
-    bool hasAdditionalOutputs()
+    [[nodiscard]] bool hasAdditionalOutputs() const noexcept
     {
         return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
     }
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
index 9dda07d19c6..b5769177b28 100644
--- a/cpp/include/tensorrt_llm/executor/executor.h
+++ b/cpp/include/tensorrt_llm/executor/executor.h
@@ -1478,7 +1478,8 @@ class CacheTransceiverConfig
 class ExecutorConfig
 {
 public:
-    static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
+    static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
+        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();
 
     static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;
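The executor.h hunk above replaces the literal 180000000 with a std::chrono expression. A quick stand-alone check (my own sketch, not part of the patch) that the two spellings are equivalent, i.e. that 3 minutes is 180,000,000 microseconds:

#include <chrono>
#include <cstdint>

// Sketch: mirrors the new ExecutorConfig constant and verifies it against the old literal.
constexpr std::uint64_t kDefaultMaxSeqIdleMicroseconds
    = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000ULL, "3 min == 180,000,000 us");
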
diff --git a/cpp/include/tensorrt_llm/runtime/lookaheadModule.h b/cpp/include/tensorrt_llm/runtime/lookaheadModule.h
index 0f1f18f6a1f..dd8c8c7b7b3 100644
--- a/cpp/include/tensorrt_llm/runtime/lookaheadModule.h
+++ b/cpp/include/tensorrt_llm/runtime/lookaheadModule.h
@@ -19,7 +19,6 @@
 #include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/speculativeDecodingModule.h"
-#include
 
 namespace tensorrt_llm::runtime
 {
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
 public:
     explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
         : SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
-        , mExecutionConfig()
     {
     }
@@ -43,7 +41,7 @@
         mExecutionConfig = config;
     }
 
-    executor::LookaheadDecodingConfig const getExecutionConfig() const
+    [[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
     {
         return mExecutionConfig;
     }
diff --git a/cpp/include/tensorrt_llm/runtime/modelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h
index b1858573e68..14a1f982dc5 100644
--- a/cpp/include/tensorrt_llm/runtime/modelConfig.h
+++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -21,6 +21,7 @@
 #include "tensorrt_llm/runtime/lookaheadModule.h"
 #include "tensorrt_llm/runtime/loraModule.h"
 #include "tensorrt_llm/runtime/speculativeDecodingMode.h"
+#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
 #include
 #include
diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
index 3335d69a015..5c5d3e11a01 100644
--- a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
+++ b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;
 
 namespace tc = tensorrt_llm::common;
 namespace te = tensorrt_llm::executor;
-namespace tk = tensorrt_llm::kernels;
 namespace tr = tensorrt_llm::runtime;
 
 namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/runtime/bufferView.h b/cpp/tensorrt_llm/runtime/bufferView.h
index e22208cfcfc..236b89d7d45 100644
--- a/cpp/tensorrt_llm/runtime/bufferView.h
+++ b/cpp/tensorrt_llm/runtime/bufferView.h
@@ -39,8 +39,8 @@ class BufferView : virtual public IBuffer
 
         if (offset + size > mBuffer->getSize())
         {
-            throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
-                + std::to_string(mBuffer->getSize()));
+            throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+                + std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
         }
     }
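The bufferView.h hunk only rewords the out_of_range message so that offset and size are reported separately instead of as their sum. A minimal sketch of the same guard, with a plain bufferSize parameter standing in for mBuffer->getSize():

#include <cstddef>
#include <stdexcept>
#include <string>

// Sketch of the bounds check; not the real BufferView constructor.
void checkSlice(std::size_t offset, std::size_t size, std::size_t bufferSize)
{
    if (offset + size > bufferSize)
    {
        throw std::out_of_range(std::string("offset ") + std::to_string(offset) + " + size " + std::to_string(size)
            + " exceeds buffer size " + std::to_string(bufferSize));
    }
}
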
diff --git a/cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp b/cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp
index e826e81ecf1..38f1cccf936 100644
--- a/cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp
+++ b/cpp/tests/e2e_tests/batch_manager/trtGptModelRealDecoderTest.cpp
@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
 }
 
 // Pick a different endId at random from one of the expected tokens
-std::vector pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
-    std::vector const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
+std::vector pickRandomEndIds(TestData const& testData, std::vector const& givenInputLengths,
+    SizeType32 const maxNewTokens, bool replaceLogits)
 {
     auto const nbGivenInputs = testData.nbGivenInputs;
     auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector pickRandomEndIds(TestData const& testData, TrtGptModelT
     return endIds;
 }
 
-TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
-    BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
-    bool const replaceLogits, BufferManager& manager)
+TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
+    ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
+    BufferManager& manager)
 {
     auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
     auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy
 
     if (useRandomEndId)
     {
-        testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
+        testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
     }
     else
     {
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelTy
 }
 
 std::tuple, std::unordered_map> loadTestData(ModelSpec const& modelSpec,
-    TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
-    ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
-    BufferManager& manager)
+    ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
+    SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
 {
     // Map between beam width, and expected results for that beam width
     std::unordered_map beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple, std::unordered_map> lo
         EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
         beamWidths.push_back(beamWidth);
 
-        auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
-            useRandomEndId, replaceLogits, manager);
+        auto testData = loadTestData(
+            modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
         beamWidthTestData.emplace(beamWidth, std::move(testData));
     }
 
@@ -435,9 +434,8 @@ std::tuple, std::unordered_map> lo
 RequestList runGptModelInference(std::shared_ptr& trtGptModel, std::vector const& beamWidths,
     std::unordered_map const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
     SizeType32 maxInputLength, SizeType32 padId, std::vector const& givenInputLengths,
-    TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
-    TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
-    bool enableBlockReuse)
+    TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
+    bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
 {
     // Fill the requests using givenInput
     // requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
     auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
 
     // Load expected outputs for each beam width value
-    auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
-        *givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
+    auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
+        maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
 
     int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
     auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
     // Prepopulate KV cache for speculative decoding test
     bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
     auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
-        nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
-        maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
+        nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
+        prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
 
     if (prepopulateKVCache)
     {
         // Call the 2nd time with prefilled KV cache
         finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
-            nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
+            nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
             maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
     }
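The test refactor above drops the unused TrtGptModelType argument from pickRandomEndIds, loadTestData, and runGptModelInference and threads that change through every call site. The pattern, sketched with hypothetical names:

#include <string>
#include <vector>

// Before: loadData(spec, modelType, ids) took a modelType it never read.
// After: the parameter is gone and callers pass one argument fewer.
std::vector<int> loadData(std::string const& spec, std::vector<int> const& ids)
{
    (void) spec; // placeholder body; the real helper loads expected results based on the spec
    return ids;
}

void callSite()
{
    auto const data = loadData("gpt2", {1, 2, 3});
    (void) data;
}
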
diff --git a/cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp b/cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp
index d08d2f5bc47..634ee31ad39 100644
--- a/cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp
+++ b/cpp/tests/unit_tests/batch_manager/llmRequestTest.cpp
@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
     EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
     EXPECT_FALSE(llmReq.mSeqSlot);
     // No speculative decoding config, draft tokens should be empty
-    EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
+    EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
     EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
     EXPECT_FALSE(llmReq.getBadWordsList().has_value());
     EXPECT_FALSE(llmReq.getStopWordsList().has_value());
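The unit test now asserts through getNumDraftTokens() instead of dereferencing getDraftTokens() and comparing a size_t against a signed literal, matching the header change that makes the accessor noexcept and SizeType32-typed. A stripped-down sketch of that accessor pattern (SizeType32 here is a local alias, not the real runtime typedef):

#include <cstdint>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using VecTokens = std::vector<std::int32_t>;

// Sketch of the accessor pair; the real GenericLlmRequest holds far more state than this.
class RequestSketch
{
public:
    [[nodiscard]] bool hasDraftTokens() const noexcept
    {
        return mDraftTokens && !mDraftTokens->empty();
    }

    [[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
    {
        // The explicit cast keeps the return type signed and avoids narrowing warnings.
        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
    }

private:
    std::shared_ptr<VecTokens> mDraftTokens = std::make_shared<VecTokens>();
};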