Address comments

nv-yilinf · nv-yilinf · commit 188924be272b · 2025-09-26T17:27:40.000-07:00
Signed-off-by: Yilin Fan <206948969+nv-yilinf@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -140,8 +140,7 @@ class GenericLlmRequest
         std::optional<SizeType32> languageAdapterUid = std::nullopt,
         std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
         std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
-        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt,
-        std::optional<Duration> globalSteadyClockOffset = std::nullopt)
+        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt)
         : mRequestId(requestId)
         , mPromptLen(inputTokens->size())
         , mMaxNewTokens(maxNewTokens)
@@ -199,7 +198,6 @@ class GenericLlmRequest
         , mLanguageAdapterUid(languageAdapterUid)
         , mAllottedTimeMs(allottedTimeMs)
         , mCacheSaltID(cacheSaltID)
-        , mGlobalSteadyClockOffset(globalSteadyClockOffset)
     {
         if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
         {
@@ -227,8 +225,7 @@ class GenericLlmRequest
         executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
         std::optional<SizeType32> languageAdapterUid = std::nullopt,
         std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
-        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,
-        std::optional<Duration> globalSteadyClockOffset = std::nullopt)
+        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
         : mRequestId(requestId)
         , mPromptLen(inputTokens.size())
         , mMaxNewTokens(maxNewTokens)
@@ -269,7 +266,6 @@ class GenericLlmRequest
         , mNumReturnSequences(numReturnSequences)
         , mLanguageAdapterUid(languageAdapterUid)
         , mCacheSaltID(cacheSaltID)
-        , mGlobalSteadyClockOffset(globalSteadyClockOffset)
     {
         if (mEncoderTokens.has_value())
         {
@@ -1897,6 +1893,9 @@ class GenericLlmRequest
     // current position of the prompt tuning table (only used in chunked prefill mode)
     SizeType32 mPtableCurrentPosition{0};
 
+    // The offset between local steady clock and global steady clock (at rank 0)
+    inline static std::optional<Duration> mGlobalSteadyClockOffset{std::nullopt};
+
 protected:
     bool mIsStreaming;
 
@@ -2059,9 +2058,6 @@ class GenericLlmRequest
     // Cache salt id for each request.
     std::optional<CacheSaltIDType> mCacheSaltID{std::nullopt};
 
-    // The offset between local steady clock and global steady clock (at rank 0)
-    std::optional<Duration> mGlobalSteadyClockOffset;
-
 private:
     void initialize(
         VecTokens const& inputTokens, bool outputLogProbs, std::optional<TimePoint> arrivalTime = std::nullopt)
@@ -2158,6 +2154,7 @@ class GenericLlmRequest
 
         if (mReturnPerfMetrics)
         {
+            // arrivalTime is assumed to be recorded at rank 0, so it does not need to be converted to the global clock
             mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(getSteadyClockNow());
         }
         mStartTime = getSteadyClockNow();
@@ -2265,8 +2262,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<SizeType32> languageAdapterUid = std::nullopt,
         std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
         std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
-        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt,
-        std::optional<Duration> globalSteadyClockOffset = std::nullopt)
+        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt)
         : Base(requestId, maxNewTokens, std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),
             samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
             std::move(stopWordsList),
@@ -2297,7 +2293,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
                                : std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt),
             numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
             std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID,
-            arrivalTime, globalSteadyClockOffset)
+            arrivalTime)
     {
     }
 
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -291,8 +291,7 @@ void initBindings(nb::module_& m)
                 std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
                 std::optional<executor::ContextPhaseParams> context_phase_params,
                 std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id,
-                std::optional<tb::LlmRequest::TimePoint> arrival_time,
-                std::optional<tb::LlmRequest::TimePoint::duration> global_steady_clock_offset)
+                std::optional<tb::LlmRequest::TimePoint> arrival_time)
             {
                 auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
                 {
@@ -333,7 +332,7 @@ void initBindings(nb::module_& m)
                     encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
                     num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
                     guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id,
-                    arrival_time, global_steady_clock_offset};
+                    arrival_time};
             },
             nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
             nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
@@ -359,7 +358,7 @@ void initBindings(nb::module_& m)
             nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
             nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
             nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt,
-            nb::arg("arrival_time") = std::nullopt, nb::arg("global_steady_clock_offset") = std::nullopt)
+            nb::arg("arrival_time") = std::nullopt)
         .def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, nb::arg("vocab_size"))
         .def(nb::init<tb::LlmRequest const&>())
         .def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),
@@ -383,7 +382,8 @@ void initBindings(nb::module_& m)
         .def("finish_by_reason", &tb::LlmRequest::finishByReason, nb::arg("finish_reason"))
         .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
         .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, nb::arg("iter_counter"))
-        .def("remove_lora_tensors", &tb::LlmRequest::removeLoraTensors);
+        .def("remove_lora_tensors", &tb::LlmRequest::removeLoraTensors)
+        .def_rw_static("global_steady_clock_offset", &tb::LlmRequest::mGlobalSteadyClockOffset);
 
     nb::class_<tb::SequenceSlotManager>(m, "SequenceSlotManager")
         .def(nb::init<tb::SequenceSlotManager::SlotIdType, uint64_t>(), nb::arg("max_num_slots"),
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
@@ -127,7 +127,6 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
         mAllottedTimeMs,                                           //
         mContextPhaseParams,                                       //
         mCacheSaltID,                                              //
-        mPerfMetrics.timingMetrics.arrivalTime,                    //
-        mGlobalSteadyClockOffset                                   //
+        mPerfMetrics.timingMetrics.arrivalTime                     //
     );
 }
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
@@ -86,8 +86,7 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
         std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
         std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
         std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,
-        std::optional<TimePoint> arrivalTime = std::nullopt,
-        std::optional<TimePoint::duration> globalSteadyClockOffset = std::nullopt)
+        std::optional<TimePoint> arrivalTime = std::nullopt)
         : Base(requestId,                                                                                       //
             maxNewTokens,                                                                                       //
             std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),                                 //
@@ -150,8 +149,7 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
             allottedTimeMs,                                                                                      //
             contextPhaseParams,                                                                                  //
             cacheSaltID,                                                                                         //
-            arrivalTime,                                                                                         //
-            globalSteadyClockOffset                                                                              //
+            arrivalTime                                                                                          //
         )
     {
     }
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -296,8 +296,7 @@ void initBindings(pybind11::module_& m)
                      std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
                      std::optional<executor::ContextPhaseParams> context_phase_params,
                      std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id,
-                     std::optional<tb::LlmRequest::TimePoint> arrival_time,
-                     std::optional<TimePoint::duration> globalSteadyClockOffset = std::nullopt)
+                     std::optional<tb::LlmRequest::TimePoint> arrival_time)
                  {
                      auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
                      {
@@ -338,7 +337,7 @@ void initBindings(pybind11::module_& m)
                          encoder_input_features_tensor_ptr, encoder_output_length, cross_attention_mask_tensor_ptr,
                          llm_request_type, input_token_extra_ids, num_return_sequences, eagle_config,
                          skip_cross_attn_blocks_tensor_ptr, return_perf_metrics, guided_decoding_params,
-                         language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id, arrival_time, global_steady_clock_offset};
+                         language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id, arrival_time};
                  }),
             py::arg("request_id"), py::arg("max_new_tokens"), py::arg("input_tokens"), py::arg("sampling_config"),
             py::arg("is_streaming"), py::arg("end_id") = std::nullopt, py::arg("pad_id") = std::nullopt,
@@ -365,7 +364,7 @@ void initBindings(pybind11::module_& m)
             py::arg("return_perf_metrics") = false, py::arg("guided_decoding_params") = std::nullopt,
             py::arg("language_adapter_uid") = std::nullopt, py::arg("allotted_time_ms") = std::nullopt,
             py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt,
-            nb::arg("arrival_time") = std::nullopt, nb::arg("global_steady_clock_offset") = std::nullopt)
+            py::arg("arrival_time") = std::nullopt)
         .def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, py::arg("vocab_size"))
         .def(py::init<tb::LlmRequest const&>())
         .def("validate", &tb::LlmRequest::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
@@ -389,7 +388,8 @@ void initBindings(pybind11::module_& m)
         .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason"))
         .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
         .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, py::arg("iter_counter"))
-        .def("remove_lora_tensors", &tb::LlmRequest::removeLoraTensors);
+        .def("remove_lora_tensors", &tb::LlmRequest::removeLoraTensors)
+        .def_readwrite_static("global_steady_clock_offset", &tb::LlmRequest::mGlobalSteadyClockOffset);
 
     py::classh<tb::SequenceSlotManager>(m, "SequenceSlotManager")
         .def(py::init<tb::SequenceSlotManager::SlotIdType, uint64_t>(), py::arg("max_num_slots"),
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
@@ -126,7 +126,6 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
         mAllottedTimeMs,                                           //
         mContextPhaseParams,                                       //
         mCacheSaltID,                                              //
-        mPerfMetrics.timingMetrics.arrivalTime,                    //
-        mGlobalSteadyClockOffset                                   //
+        mPerfMetrics.timingMetrics.arrivalTime                     //
     );
 }
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
@@ -86,8 +86,7 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
         std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
         std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
         std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,
-        std::optional<TimePoint> arrivalTime = std::nullopt,
-        std::optional<TimePoint::duration> globalSteadyClockOffset = std::nullopt)
+        std::optional<TimePoint> arrivalTime = std::nullopt)
         : Base(requestId,                                                                                       //
             maxNewTokens,                                                                                       //
             std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),                                 //
diff --git a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
@@ -44,8 +44,7 @@ class ExecutorRequestQueue:
     def __init__(self, dist: Distributed, enable_attention_dp: bool,
                  max_batch_size: int, max_beam_width: int,
                  max_num_active_requests: int, enable_iter_perf_stats: bool,
-                 batch_wait_timeout_ms: float, is_disaggregated: bool,
-                 global_steady_clock_offset: float):
+                 batch_wait_timeout_ms: float, is_disaggregated: bool):
         self.dist = dist
         self.request_queue: queue.Queue[RequestQueueItem] = queue.Queue()
         self.waiting_queue: deque[RequestQueueItem] = deque()
@@ -61,7 +60,6 @@ def __init__(self, dist: Distributed, enable_attention_dp: bool,
         self.start_times = {}
         self.active = True
         self.batch_wait_timeout_ms = batch_wait_timeout_ms
-        self.global_steady_clock_offset = global_steady_clock_offset
 
         # State tracking
         self.num_fetch_requests = 0
@@ -613,9 +611,6 @@ def _merge_requests(
         else:
             req_with_children = []
             for req_item in new_requests:
-                if self.global_steady_clock_offset:
-                    req_item.request.py_global_steady_clock_offset = self.global_steady_clock_offset
-
                 req = executor_request_to_llm_request(
                     req_item.id, req_item.request, req_item.child_req_ids,
                     self._should_exclude_last_generation_logits())
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -586,7 +586,6 @@ def executor_request_to_llm_request(
         context_phase_params=executor_request.context_phase_params,
         cache_salt_id=executor_request.cache_salt_id,
         arrival_time=getattr(executor_request, "py_arrival_time", None),
-        global_steady_clock_offset=getattr(executor_request, "py_global_steady_clock_offset", None),
         py_multimodal_data=getattr(executor_request, "py_multimodal_data",
                                    None))
     if child_req_ids:
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -12,6 +12,8 @@
 
 import torch
 
+from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
+
 try:
     from cuda.bindings import runtime as cudart
 except ImportError:
@@ -165,8 +167,6 @@ def __init__(self,
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
         self.global_rank = global_mpi_rank()
-        self.dist = dist
-        self.global_steady_clock_offset = self._get_global_steady_clock_offset()
 
         self.peft_cache_config = peft_cache_config
 
@@ -185,6 +185,7 @@ def __init__(self,
         self.draft_model_engine = getattr(self.drafter, "draft_model_engine",
                                           None)
         self.guided_decoder = guided_decoder
+        self.dist = dist
         self.disable_overlap_scheduler = disable_overlap_scheduler
 
         # enqueue and _fetch_new_requests used data
@@ -253,6 +254,7 @@ def __init__(self,
         self.batch_wait_iters_count = 0
 
         # request fetcher initialization
+        self._set_global_steady_clock_offset()
         self.executor_request_queue = ExecutorRequestQueue(
             dist=self.dist,
             enable_attention_dp=self.enable_attention_dp,
@@ -262,7 +264,6 @@ def __init__(self,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
             batch_wait_timeout_ms=self.batch_wait_timeout_ms,
             is_disaggregated=kv_cache_transceiver is not None,
-            global_steady_clock_offset=self.global_steady_clock_offset,
         )
         self.executor_request_queue.set_exclude_last_generation_logits(
             self.disable_overlap_scheduler, self.dist.pp_size)
@@ -365,20 +366,24 @@ def start_worker(self):
                 self.worker_thread.start()
                 self.worker_started = True
 
-    def _get_global_steady_clock_offset(self):
+    def _set_global_steady_clock_offset(self):
         assert self.global_rank >= 0, "rank should be >= 0"
 
         # Sync all ranks
         self.dist.barrier()
         # Immediately take the local steady clock timestamp
-        local_timestamp = time.monotonic()
+        local_timestamp = get_steady_clock_now_in_seconds()
         all_rank_timestamps = self.dist.allgather(local_timestamp)
         if self.global_rank == 0:
             logger.info(
                 f"global_steady_clock_offset at each rank: {[local_timestamp - ts for ts in all_rank_timestamps]}"
             )
         # Compute the steady clock offset between rank 0 and current rank
-        return all_rank_timestamps[0] - local_timestamp
+        global_steady_clock_offset = all_rank_timestamps[0] - local_timestamp
+        LlmRequest.global_steady_clock_offset = global_steady_clock_offset
+        logger.info(
+            f"Setting global_steady_clock_offset: {global_steady_clock_offset} seconds for rank {self.global_rank}"
+        )
 
     def __enter__(self):
         return self
diff --git a/tensorrt_llm/executor/postproc_worker.py b/tensorrt_llm/executor/postproc_worker.py
diff --git a/tensorrt_llm/serve/openai_disagg_server.py b/tensorrt_llm/serve/openai_disagg_server.py
diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py
diff --git a/tensorrt_llm/serve/responses_utils.py b/tensorrt_llm/serve/responses_utils.py

Original file line number	Diff line number	Diff line change
`@@ -127,7 +127,6 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const`
`127`	`127`	`mAllottedTimeMs, //`
`128`	`128`	`mContextPhaseParams, //`
`129`	`129`	`mCacheSaltID, //`
`130`		`- mPerfMetrics.timingMetrics.arrivalTime, //`
`131`		`- mGlobalSteadyClockOffset //`
	`130`	`+ mPerfMetrics.timingMetrics.arrivalTime //`
`132`	`131`	`);`
`133`	`132`	`}`
Original file line number	Diff line number	Diff line change
`@@ -86,8 +86,7 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>`
`86`	`86`	`std::optional<MillisecondsType> allottedTimeMs = std::nullopt,`
`87`	`87`	`std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,`
`88`	`88`	`std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,`
`89`		`- std::optional<TimePoint> arrivalTime = std::nullopt,`
`90`		`- std::optional<TimePoint::duration> globalSteadyClockOffset = std::nullopt)`
	`89`	`+ std::optional<TimePoint> arrivalTime = std::nullopt)`
`91`	`90`	`: Base(requestId, //`
`92`	`91`	`maxNewTokens, //`
`93`	`92`	`std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)), //`
`@@ -150,8 +149,7 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>`
`150`	`149`	`allottedTimeMs, //`
`151`	`150`	`contextPhaseParams, //`
`152`	`151`	`cacheSaltID, //`
`153`		`- arrivalTime, //`
`154`		`- globalSteadyClockOffset //`
	`152`	`+ arrivalTime //`
`155`	`153`	`)`
`156`	`154`	`{`
`157`	`155`	`}`
Original file line number	Diff line number	Diff line change
`@@ -126,7 +126,6 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const`
`126`	`126`	`mAllottedTimeMs, //`
`127`	`127`	`mContextPhaseParams, //`
`128`	`128`	`mCacheSaltID, //`
`129`		`- mPerfMetrics.timingMetrics.arrivalTime, //`
`130`		`- mGlobalSteadyClockOffset //`
	`129`	`+ mPerfMetrics.timingMetrics.arrivalTime //`
`131`	`130`	`);`
`132`	`131`	`}`