Changes from all commits
26 commits
de99e23
[#5860][feat] Add ModelOPT INT4 awq fake quant support in AutoDeploy …
Fridah-nv Oct 1, 2025
e107749
[None][fix] fix patchelf version issue (#8112)
bo-nv Oct 1, 2025
1ad7bc4
[None][feat] Draft: Save state first pass (#7012)
IzzyPutterman Oct 1, 2025
bd3d0ad
[TRTLLM-7733][feat] Executor changes to support helix parallelism (#7…
brb-nv Oct 2, 2025
726ac07
[https://nvbugs/5549081][fix] Fix device id assignment for some visio…
chang-l Oct 2, 2025
32c7f8c
[#7588][feat] lock gpu clocks in test_perf.py to reliably detect perf…
MrGeva Oct 2, 2025
fc7f78c
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
293637e
[https://nvbugs/5556020][chore] waive test_eagle3 (#8119)
hchings Oct 2, 2025
34d158b
[TRTLLM-6589][feat] Support CUDA graph for DeepEP (#7514)
yifeizhang-c Oct 2, 2025
6568e56
[TRTLLM-7775][feat] Integrate tinygemm2 for gpt-oss (#7916)
dongfengy Oct 2, 2025
fefa7d8
[None][feat] Support for cancelling requests with disaggregation (#8114)
pcastonguay Oct 2, 2025
ab433b7
[None][fix] Fix access to new tokens in sampler. (#7958)
dcampora Oct 2, 2025
08a4791
[None][chore] Adding install_tensorrt.sh script to pip wheel (#8116)
pcastonguay Oct 2, 2025
4136942
[#7588][fix] fixed the kv cache size parsing in test_perf.py AD backe…
MrGeva Oct 2, 2025
a5b59fd
[TRTLLM-6342][bug] Patched incorrect starcoder tp config (#8118)
greg-kwasniewski1 Oct 2, 2025
01423ac
[None][feat] perf_metrics endpoint functionality improvement (#8005)
nv-yilinf Oct 3, 2025
9b3d7cc
[None][feat] Update TRT-LLM Gen MoE kernels (#7970)
nekorobov Oct 3, 2025
ba3dbb6
[https://nvbugs/5548098][fix] Fix flakey unit test for dynamic spec d…
hchings Oct 3, 2025
e2f69c5
[None] [refactor] Minor cleanup and improvements (#7619)
Funatiq Oct 3, 2025
5faa5e9
[None][feat] AutoDeploy: dive deeper into token generation bugs + ena…
lucaslie Oct 3, 2025
9db4366
[None][fix] Fix Qwen3 FP8 per-tensor when requesting TRTLLM-GEN MoE b…
achartier Oct 3, 2025
d821524
[None][feat] AutoDeploy add autotuning when capturing cudagraphs (#8120)
suyoggupta Oct 3, 2025
7bc2d9e
[https://nvbugs/5537878][fix] Reserve an extra slot for padded batch …
ziyixiong-nv Oct 3, 2025
aaf2c3c
[None][feat] AutoDeploy: compiler backends based on nn.Module (#8126)
lucaslie Oct 3, 2025
ca82911
[None][fix] Fix MTP 2-model (#8115)
mikeiovine Oct 3, 2025
0aed9b8
kwargs-first pipeline
lucaslie Sep 26, 2025
4 changes: 4 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -71,6 +71,8 @@ class BaseCacheTransceiver
virtual void checkGenTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) = 0;

[[nodiscard]] virtual bool checkGenTransferComplete() const = 0;

virtual bool cancelRequest(LlmRequest* llmRequest) = 0;
};

class CacheTransceiver : public BaseCacheTransceiver
@@ -111,6 +113,8 @@ class CacheTransceiver : public BaseCacheTransceiver

[[nodiscard]] bool checkGenTransferComplete() const override;

virtual bool cancelRequest(LlmRequest* llmRequest) override;

private:
void initializeCommState();

17 changes: 4 additions & 13 deletions cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,19 +20,15 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;

namespace decoder
{
@@ -56,10 +52,6 @@ class CreateNewDecoderRequests : Algorithm
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;

@@ -70,16 +62,15 @@
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
67 changes: 47 additions & 20 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed

// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
@@ -101,6 +103,7 @@ class GenericLlmRequest
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
using MillisecondsType = std::chrono::milliseconds;
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
using Duration = std::chrono::time_point<std::chrono::steady_clock>::duration;
using CacheSaltIDType = runtime::CacheSaltIDType;

GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
@@ -1074,7 +1077,6 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);

auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1115,9 +1117,9 @@ class GenericLlmRequest
mDraftLogits = draftLogits;
}

[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}

void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1255,7 +1257,7 @@ class GenericLlmRequest
{
if (mPerfMetrics.timingMetrics.firstScheduledTime == executor::RequestPerfMetrics::TimePoint{})
{
mPerfMetrics.timingMetrics.firstScheduledTime = std::chrono::steady_clock::now();
mPerfMetrics.timingMetrics.firstScheduledTime = getSteadyClockNow();
}
}

@@ -1378,17 +1380,17 @@ class GenericLlmRequest
mGenerationLogitsFragments.push_back(genLogits);
}

SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}

void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}

bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}
@@ -1689,22 +1691,22 @@ class GenericLlmRequest
mDecodingIter = iter;
}

void setKvCacheTransferStart(std::chrono::time_point<std::chrono::steady_clock> const& time)
void setKvCacheTransferStart(TimePoint const& time)
{
mPerfMetrics.timingMetrics.kvCacheTransferStart = time;
mPerfMetrics.timingMetrics.kvCacheTransferStart = maybeToGlobalSteadyClock(time);
}

void setKvCacheTransferEnd(std::chrono::time_point<std::chrono::steady_clock> const& time)
void setKvCacheTransferEnd(TimePoint const& time)
{
mPerfMetrics.timingMetrics.kvCacheTransferEnd = time;
mPerfMetrics.timingMetrics.kvCacheTransferEnd = maybeToGlobalSteadyClock(time);
}

std::chrono::time_point<std::chrono::steady_clock> getKvCacheTransferStart()
TimePoint getKvCacheTransferStart()
{
return mPerfMetrics.timingMetrics.kvCacheTransferStart;
}

std::chrono::time_point<std::chrono::steady_clock> getKvCacheTransferEnd()
TimePoint getKvCacheTransferEnd()
{
return mPerfMetrics.timingMetrics.kvCacheTransferEnd;
}
@@ -1788,7 +1790,7 @@ class GenericLlmRequest
if (finishReason == executor::FinishReason::kTIMED_OUT)
{
TLLM_LOG_DEBUG("Request %ld finished by timeout after %f sec", mRequestId,
std::chrono::duration<float>(std::chrono::steady_clock::now() - mStartTime).count());
std::chrono::duration<float>(getSteadyClockNow() - mStartTime).count());
}
if (finishReason == executor::FinishReason::kCANCELLED)
{
@@ -1826,10 +1828,9 @@ class GenericLlmRequest

void updatePerfMetrics(executor::IterationType iter)
{
auto const currentTokenTime = std::chrono::steady_clock::now();

if (!mPerfMetrics.firstIter)
{
auto const currentTokenTime = getSteadyClockNow();
mPerfMetrics.firstIter = iter;
mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;
}
@@ -1838,6 +1839,7 @@

if (isFinished())
{
auto const currentTokenTime = getSteadyClockNow();
mPerfMetrics.lastIter = iter;
mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;
}
@@ -1863,6 +1865,15 @@
return mUseDraftModel;
}

// If mGlobalSteadyClockOffset is set, return a global steady clock time point, otherwise return local steady clock
// time point
[[nodiscard]] TimePoint getSteadyClockNow() const
{
const TimePoint time_point = std::chrono::steady_clock::now();

return maybeToGlobalSteadyClock(time_point);
}

RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
@@ -1882,6 +1893,9 @@
// current position of the prompt tuning table (only used in chunked prefill mode)
SizeType32 mPtableCurrentPosition{0};

// The offset between local steady clock and global steady clock (at rank 0)
inline static std::optional<Duration> mGlobalSteadyClockOffset{std::nullopt};

protected:
bool mIsStreaming;

@@ -2137,7 +2151,8 @@

if (mReturnPerfMetrics)
{
mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(std::chrono::steady_clock::now());
// arrivalTime is assumed to be recorded at the rank 0, so no need to convert it to global clock
mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(getSteadyClockNow());
}
mStartTime = std::chrono::steady_clock::now();
}
@@ -2167,6 +2182,18 @@

return tensor;
}

TimePoint maybeToGlobalSteadyClock(TimePoint const& time_point) const
{
if (mGlobalSteadyClockOffset.has_value())
{
return time_point + *mGlobalSteadyClockOffset;
}
else
{
return time_point;
}
}
};

class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
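A note on the timing changes above: the new mGlobalSteadyClockOffset lets per-rank steady-clock timestamps be expressed on rank 0's clock, so perf metrics collected across ranks in a disaggregated setup line up. The sketch below is illustrative only; it assumes the offset is obtained by comparing rank 0's clock reading with the local one, and the helper name computeGlobalClockOffset is hypothetical, not part of this diff.

// Illustrative sketch (assumption: rank0Now is rank 0's steady_clock reading
// delivered to this rank through some synchronization channel).
#include <chrono>
#include <optional>

using TimePoint = std::chrono::steady_clock::time_point;
using Duration = TimePoint::duration;

// Hypothetical helper: the offset that, when added to a local time point,
// yields the corresponding time on rank 0's steady clock.
std::optional<Duration> computeGlobalClockOffset(TimePoint rank0Now, TimePoint localNow)
{
    return rank0Now - localNow;
}

// With such an offset installed in mGlobalSteadyClockOffset,
// maybeToGlobalSteadyClock(localNow) would evaluate to rank0Now.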
3 changes: 2 additions & 1 deletion cpp/include/tensorrt_llm/executor/executor.h
@@ -1478,7 +1478,8 @@ class CacheTransceiverConfig
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;

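For reference, std::chrono::minutes(3) converts to 3 x 60 x 1,000,000 = 180,000,000 microseconds, so the rewritten constant evaluates to the same default idle timeout as the previous literal 180000000; only the intent is now explicit.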
4 changes: 1 addition & 3 deletions cpp/include/tensorrt_llm/runtime/lookaheadModule.h
@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>

namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}

@@ -43,7 +41,7 @@
mExecutionConfig = config;
}

executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <NvInferRuntime.h>
#include <array>
8 changes: 4 additions & 4 deletions cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -390,7 +390,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
TLLM_CHECK(connections.size() > (processIdx / peerDuplicateHeadFactor));
TLLM_CHECK(outputSplitCaches.size() > (processIdx / peerDuplicateHeadFactor));
auto startTime = std::chrono::steady_clock::now();
auto startTime = llmRequest.getSteadyClockNow();

size_t ppDomainSize = targetInfo.mDomainPPSize;
size_t bufferTpRank = (processIdx / ppDomainSize) / peerDuplicateHeadFactor;
@@ -437,7 +437,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
}
}

auto endTime = std::chrono::steady_clock::now();
auto endTime = llmRequest.getSteadyClockNow();
double delay = 0.0;
if (recordDelay)
{
@@ -753,7 +753,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
TLLM_CHECK(pickUpConnections.size() > processIdx);
TLLM_CHECK(recvSplitCaches.size() > processIdx);
auto startTime = std::chrono::steady_clock::now();
auto startTime = llmRequest.getSteadyClockNow();
size_t size = 0;

if (processIdx >= remainNoCoverTargetNum)
@@ -794,7 +794,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
}
}

auto endTime = std::chrono::steady_clock::now();
auto endTime = llmRequest.getSteadyClockNow();
double delay = 0.0;
if (recordDelay)
{
13 changes: 13 additions & 0 deletions cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
@@ -567,4 +567,17 @@ bool CacheTransceiver::checkGenTransferComplete() const
return mRequesterFutures.empty();
}

bool CacheTransceiver::cancelRequest(LlmRequest* llmRequest)
{
if (llmRequest->isContextOnlyRequest())
{
return mCacheSender->cancelRequest(*llmRequest);
}
else if (llmRequest->isGenerationOnlyRequest())
{
return mCacheReceiver->cancelRequest(*llmRequest);
}
return false;
}

} // namespace tensorrt_llm::batch_manager
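To make the new cancellation path concrete, here is a minimal usage sketch, assuming a caller that already holds a BaseCacheTransceiver and the request being cancelled; the surrounding function maybeCancelKvTransfer is hypothetical and not part of this PR.

// Hypothetical caller: route a cancelled request to the transceiver so any
// in-flight KV cache transfer is torn down. cancelRequest() returns false for
// requests that are neither context-only nor generation-only.
void maybeCancelKvTransfer(BaseCacheTransceiver& transceiver, LlmRequest& llmRequest)
{
    bool const cancelled = transceiver.cancelRequest(&llmRequest);
    if (!cancelled)
    {
        TLLM_LOG_DEBUG("Request %lu had no disaggregated KV transfer to cancel", llmRequest.mRequestId);
    }
}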
@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;

namespace tensorrt_llm::batch_manager