Changes from all commits
26 commits
de99e23
[#5860][feat] Add ModelOPT INT4 awq fake quant support in AutoDeploy …
Fridah-nv Oct 1, 2025
e107749
[None][fix] fix patchelf version issue (#8112)
bo-nv Oct 1, 2025
1ad7bc4
[None][feat] Draft: Save state first pass (#7012)
IzzyPutterman Oct 1, 2025
bd3d0ad
[TRTLLM-7733][feat] Executor changes to support helix parallelism (#7…
brb-nv Oct 2, 2025
726ac07
[https://nvbugs/5549081][fix] Fix device id assignment for some visio…
chang-l Oct 2, 2025
32c7f8c
[#7588][feat] lock gpu clocks in test_perf.py to reliably detect perf…
MrGeva Oct 2, 2025
fc7f78c
[TRTLLM-8269][test] do not explicitly pass temperature=0 to select gr…
ixlmar Oct 2, 2025
293637e
[https://nvbugs/5556020][chore] waive test_eagle3 (#8119)
hchings Oct 2, 2025
34d158b
[TRTLLM-6589][feat] Support CUDA graph for DeepEP (#7514)
yifeizhang-c Oct 2, 2025
6568e56
[TRTLLM-7775][feat] Integrate tinygemm2 for gpt-oss (#7916)
dongfengy Oct 2, 2025
fefa7d8
[None][feat] Support for cancelling requests with disaggregation (#8114)
pcastonguay Oct 2, 2025
ab433b7
[None][fix] Fix access to new tokens in sampler. (#7958)
dcampora Oct 2, 2025
08a4791
[None][chore] Adding install_tensorrt.sh script to pip wheel (#8116)
pcastonguay Oct 2, 2025
4136942
[#7588][fix] fixed the kv cache size parsing in test_perf.py AD backe…
MrGeva Oct 2, 2025
a5b59fd
[TRTLLM-6342][bug] Patched incorrect starcoder tp config (#8118)
greg-kwasniewski1 Oct 2, 2025
01423ac
[None][feat] perf_metrics endpoint functionality improvement (#8005)
nv-yilinf Oct 3, 2025
9b3d7cc
[None][feat] Update TRT-LLM Gen MoE kernels (#7970)
nekorobov Oct 3, 2025
ba3dbb6
[https://nvbugs/5548098][fix] Fix flakey unit test for dynamic spec d…
hchings Oct 3, 2025
e2f69c5
[None] [refactor] Minor cleanup and improvements (#7619)
Funatiq Oct 3, 2025
5faa5e9
[None][feat] AutoDeploy: dive deeper into token generation bugs + ena…
lucaslie Oct 3, 2025
9db4366
[None][fix] Fix Qwen3 FP8 per-tensor when requesting TRTLLM-GEN MoE b…
achartier Oct 3, 2025
d821524
[None][feat] AutoDeploy add autotuning when capturing cudagraphs (#8120)
suyoggupta Oct 3, 2025
7bc2d9e
[https://nvbugs/5537878][fix] Reserve an extra slot for padded batch …
ziyixiong-nv Oct 3, 2025
aaf2c3c
[None][feat] AutoDeploy: compiler backends based on nn.Module (#8126)
lucaslie Oct 3, 2025
ca82911
[None][fix] Fix MTP 2-model (#8115)
mikeiovine Oct 3, 2025
0aed9b8
kwargs-first pipeline
lucaslie Sep 26, 2025
4 changes: 4 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -71,6 +71,8 @@ class BaseCacheTransceiver
virtual void checkGenTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) = 0;

[[nodiscard]] virtual bool checkGenTransferComplete() const = 0;

virtual bool cancelRequest(LlmRequest* llmRequest) = 0;
};

class CacheTransceiver : public BaseCacheTransceiver
@@ -111,6 +113,8 @@ class CacheTransceiver : public BaseCacheTransceiver

[[nodiscard]] bool checkGenTransferComplete() const override;

virtual bool cancelRequest(LlmRequest* llmRequest) override;

private:
void initializeCommState();

17 changes: 4 additions & 13 deletions cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,19 +20,15 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;

namespace decoder
{
@@ -56,10 +52,6 @@ class CreateNewDecoderRequests : Algorithm
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;

@@ -70,16 +62,15 @@
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
67 changes: 47 additions & 20 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed

// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
@@ -101,6 +103,7 @@ class GenericLlmRequest
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
using MillisecondsType = std::chrono::milliseconds;
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
using Duration = std::chrono::time_point<std::chrono::steady_clock>::duration;
using CacheSaltIDType = runtime::CacheSaltIDType;

GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
@@ -1074,7 +1077,6 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);

auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1115,9 +1117,9 @@ class GenericLlmRequest
mDraftLogits = draftLogits;
}

[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}

void discardDraftTokens(SizeType32 numTokensToDiscard)
@@ -1255,7 +1257,7 @@ class GenericLlmRequest
{
if (mPerfMetrics.timingMetrics.firstScheduledTime == executor::RequestPerfMetrics::TimePoint{})
{
mPerfMetrics.timingMetrics.firstScheduledTime = std::chrono::steady_clock::now();
mPerfMetrics.timingMetrics.firstScheduledTime = getSteadyClockNow();
}
}

@@ -1378,17 +1380,17 @@ class GenericLlmRequest
mGenerationLogitsFragments.push_back(genLogits);
}

SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}

void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}

bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}
@@ -1689,22 +1691,22 @@ class GenericLlmRequest
mDecodingIter = iter;
}

void setKvCacheTransferStart(std::chrono::time_point<std::chrono::steady_clock> const& time)
void setKvCacheTransferStart(TimePoint const& time)
{
mPerfMetrics.timingMetrics.kvCacheTransferStart = time;
mPerfMetrics.timingMetrics.kvCacheTransferStart = maybeToGlobalSteadyClock(time);
}

void setKvCacheTransferEnd(std::chrono::time_point<std::chrono::steady_clock> const& time)
void setKvCacheTransferEnd(TimePoint const& time)
{
mPerfMetrics.timingMetrics.kvCacheTransferEnd = time;
mPerfMetrics.timingMetrics.kvCacheTransferEnd = maybeToGlobalSteadyClock(time);
}

std::chrono::time_point<std::chrono::steady_clock> getKvCacheTransferStart()
TimePoint getKvCacheTransferStart()
{
return mPerfMetrics.timingMetrics.kvCacheTransferStart;
}

std::chrono::time_point<std::chrono::steady_clock> getKvCacheTransferEnd()
TimePoint getKvCacheTransferEnd()
{
return mPerfMetrics.timingMetrics.kvCacheTransferEnd;
}
@@ -1788,7 +1790,7 @@ class GenericLlmRequest
if (finishReason == executor::FinishReason::kTIMED_OUT)
{
TLLM_LOG_DEBUG("Request %ld finished by timeout after %f sec", mRequestId,
std::chrono::duration<float>(std::chrono::steady_clock::now() - mStartTime).count());
std::chrono::duration<float>(getSteadyClockNow() - mStartTime).count());
}
if (finishReason == executor::FinishReason::kCANCELLED)
{
@@ -1826,10 +1828,9 @@ class GenericLlmRequest

void updatePerfMetrics(executor::IterationType iter)
{
auto const currentTokenTime = std::chrono::steady_clock::now();

if (!mPerfMetrics.firstIter)
{
auto const currentTokenTime = getSteadyClockNow();
mPerfMetrics.firstIter = iter;
mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;
}
@@ -1838,6 +1839,7 @@

if (isFinished())
{
auto const currentTokenTime = getSteadyClockNow();
mPerfMetrics.lastIter = iter;
mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;
}
@@ -1863,6 +1865,15 @@
return mUseDraftModel;
}

// If mGlobalSteadyClockOffset is set, return a global steady clock time point, otherwise return local steady clock
// time point
[[nodiscard]] TimePoint getSteadyClockNow() const
{
const TimePoint time_point = std::chrono::steady_clock::now();

return maybeToGlobalSteadyClock(time_point);
}

RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
@@ -1882,6 +1893,9 @@
// current position of the prompt tuning table (only used in chunked prefill mode)
SizeType32 mPtableCurrentPosition{0};

// The offset between local steady clock and global steady clock (at rank 0)
inline static std::optional<Duration> mGlobalSteadyClockOffset{std::nullopt};

protected:
bool mIsStreaming;

@@ -2137,7 +2151,8 @@

if (mReturnPerfMetrics)
{
mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(std::chrono::steady_clock::now());
// arrivalTime is assumed to be recorded at the rank 0, so no need to convert it to global clock
mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(getSteadyClockNow());
}
mStartTime = std::chrono::steady_clock::now();
}
@@ -2167,6 +2182,18 @@

return tensor;
}

TimePoint maybeToGlobalSteadyClock(TimePoint const& time_point) const
{
if (mGlobalSteadyClockOffset.has_value())
{
return time_point + *mGlobalSteadyClockOffset;
}
else
{
return time_point;
}
}
};

class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
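A note on the timing changes above: the new mGlobalSteadyClockOffset lets per-rank steady-clock timestamps be expressed on rank 0's clock, so perf metrics collected across ranks in a disaggregated setup line up. The sketch below is illustrative only; it assumes the offset is obtained by comparing rank 0's clock reading with the local one, and the helper name computeGlobalClockOffset is hypothetical, not part of this diff.

// Illustrative sketch (assumption: rank0Now is rank 0's steady_clock reading
// delivered to this rank through some synchronization channel).
#include <chrono>
#include <optional>

using TimePoint = std::chrono::steady_clock::time_point;
using Duration = TimePoint::duration;

// Hypothetical helper: the offset that, when added to a local time point,
// yields the corresponding time on rank 0's steady clock.
std::optional<Duration> computeGlobalClockOffset(TimePoint rank0Now, TimePoint localNow)
{
    return rank0Now - localNow;
}

// With such an offset installed in mGlobalSteadyClockOffset,
// maybeToGlobalSteadyClock(localNow) would evaluate to rank0Now.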
3 changes: 2 additions & 1 deletion cpp/include/tensorrt_llm/executor/executor.h
@@ -1478,7 +1478,8 @@ class CacheTransceiverConfig
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;

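For reference, std::chrono::minutes(3) converts to 3 x 60 x 1,000,000 = 180,000,000 microseconds, so the rewritten constant evaluates to the same default idle timeout as the previous literal 180000000; only the intent is now explicit.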
4 changes: 1 addition & 3 deletions cpp/include/tensorrt_llm/runtime/lookaheadModule.h
@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>

namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}

@@ -43,7 +41,7 @@
mExecutionConfig = config;
}

executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
1 change: 1 addition & 0 deletions cpp/include/tensorrt_llm/runtime/modelConfig.h
@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <NvInferRuntime.h>
#include <array>
8 changes: 4 additions & 4 deletions cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -390,7 +390,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
TLLM_CHECK(connections.size() > (processIdx / peerDuplicateHeadFactor));
TLLM_CHECK(outputSplitCaches.size() > (processIdx / peerDuplicateHeadFactor));
auto startTime = std::chrono::steady_clock::now();
auto startTime = llmRequest.getSteadyClockNow();

size_t ppDomainSize = targetInfo.mDomainPPSize;
size_t bufferTpRank = (processIdx / ppDomainSize) / peerDuplicateHeadFactor;
@@ -437,7 +437,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
}
}

auto endTime = std::chrono::steady_clock::now();
auto endTime = llmRequest.getSteadyClockNow();
double delay = 0.0;
if (recordDelay)
{
@@ -753,7 +753,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
TLLM_CHECK(pickUpConnections.size() > processIdx);
TLLM_CHECK(recvSplitCaches.size() > processIdx);
auto startTime = std::chrono::steady_clock::now();
auto startTime = llmRequest.getSteadyClockNow();
size_t size = 0;

if (processIdx >= remainNoCoverTargetNum)
@@ -794,7 +794,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
}
}

auto endTime = std::chrono::steady_clock::now();
auto endTime = llmRequest.getSteadyClockNow();
double delay = 0.0;
if (recordDelay)
{
13 changes: 13 additions & 0 deletions cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
@@ -567,4 +567,17 @@ bool CacheTransceiver::checkGenTransferComplete() const
return mRequesterFutures.empty();
}

bool CacheTransceiver::cancelRequest(LlmRequest* llmRequest)
{
if (llmRequest->isContextOnlyRequest())
{
return mCacheSender->cancelRequest(*llmRequest);
}
else if (llmRequest->isGenerationOnlyRequest())
{
return mCacheReceiver->cancelRequest(*llmRequest);
}
return false;
}

} // namespace tensorrt_llm::batch_manager
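To make the new cancellation path concrete, here is a minimal usage sketch, assuming a caller that already holds a BaseCacheTransceiver and the request being cancelled; the surrounding function maybeCancelKvTransfer is hypothetical and not part of this PR.

// Hypothetical caller: route a cancelled request to the transceiver so any
// in-flight KV cache transfer is torn down. cancelRequest() returns false for
// requests that are neither context-only nor generation-only.
void maybeCancelKvTransfer(BaseCacheTransceiver& transceiver, LlmRequest& llmRequest)
{
    bool const cancelled = transceiver.cancelRequest(&llmRequest);
    if (!cancelled)
    {
        TLLM_LOG_DEBUG("Request %lu had no disaggregated KV transfer to cancel", llmRequest.mRequestId);
    }
}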
@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;

namespace tensorrt_llm::batch_manager