Skip to content

Commit 24fc1f9

Browse files
authored
[None][fix] Use the arrival time from the LLM API when creating LlmRequest in the PyTorch workflow (#7553)
Signed-off-by: zhengd-nv <[email protected]>
1 parent e080294 commit 24fc1f9

File tree

15 files changed

+56
-108
lines changed

15 files changed

+56
-108
lines changed

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 12 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ class GenericLlmRequest
100100
RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional<RequestIdType>)>;
101101
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
102102
using MillisecondsType = std::chrono::milliseconds;
103+
using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
103104
using CacheSaltIDType = runtime::CacheSaltIDType;
104105

105106
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
@@ -138,7 +139,7 @@ class GenericLlmRequest
138139
std::optional<SizeType32> languageAdapterUid = std::nullopt,
139140
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
140141
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
141-
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
142+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt)
142143
: mRequestId(requestId)
143144
, mPromptLen(inputTokens->size())
144145
, mMaxNewTokens(maxNewTokens)
@@ -202,7 +203,7 @@ class GenericLlmRequest
202203
mState = LlmRequestState::kENCODER_INIT;
203204
}
204205

205-
initialize(*inputTokens, returnLogProbs);
206+
initialize(*inputTokens, returnLogProbs, arrivalTime);
206207
}
207208

208209
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
@@ -2054,7 +2055,8 @@ class GenericLlmRequest
20542055
std::optional<CacheSaltIDType> mCacheSaltID{std::nullopt};
20552056

20562057
private:
2057-
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
2058+
void initialize(
2059+
VecTokens const& inputTokens, bool outputLogProbs, std::optional<TimePoint> arrivalTime = std::nullopt)
20582060
{
20592061
if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
20602062
{
@@ -2148,7 +2150,7 @@ class GenericLlmRequest
21482150

21492151
if (mReturnPerfMetrics)
21502152
{
2151-
mPerfMetrics.timingMetrics.arrivalTime = std::chrono::steady_clock::now();
2153+
mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(std::chrono::steady_clock::now());
21522154
}
21532155
mStartTime = std::chrono::steady_clock::now();
21542156
}
@@ -2197,61 +2199,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
21972199
using TokenExtraIdType = Base::TokenExtraIdType;
21982200
using VecTokenExtraIds = Base::VecTokenExtraIds;
21992201

2200-
// 49 parameters, 49 parameters in Base class constructor
2201-
LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
2202-
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
2203-
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
2204-
std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
2205-
std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
2206-
std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
2207-
std::optional<SizeType32> promptVocabSize = std::nullopt,
2208-
std::optional<std::shared_ptr<std::vector<std::vector<SizeType32>>>> multimodalHashes = std::nullopt,
2209-
std::optional<std::shared_ptr<std::vector<SizeType32>>> multimodalPositions = std::nullopt,
2210-
std::optional<std::shared_ptr<std::vector<SizeType32>>> multimodalLengths = std::nullopt,
2211-
std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
2212-
std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
2213-
std::optional<SizeType32> mropePositionDeltas = std::nullopt,
2214-
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
2215-
std::optional<TensorPtr> loraConfig = std::nullopt,
2216-
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
2217-
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
2218-
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
2219-
std::optional<std::shared_ptr<VecTokens>> const& draftTokens = std::nullopt,
2220-
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
2221-
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
2222-
bool applyLogitsPostProcessorBatched = false,
2223-
std::optional<std::shared_ptr<VecTokens>> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
2224-
std::optional<RequestIdType> clientId = std::nullopt,
2225-
executor::PriorityType priority = executor::Request::kDefaultPriority,
2226-
std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
2227-
std::optional<SizeType32> encoderOutputLength = std::nullopt,
2228-
std::optional<TensorPtr> crossAttentionMask = std::nullopt,
2229-
LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
2230-
std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt,
2231-
SizeType32 numReturnSequences = 1, std::optional<executor::EagleConfig> eagleConfig = std::nullopt,
2232-
std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false,
2233-
std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
2234-
std::optional<SizeType32> languageAdapterUid = std::nullopt,
2235-
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
2236-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
2237-
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
2238-
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
2239-
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
2240-
std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalHashes),
2241-
std::move(multimodalPositions), std::move(multimodalLengths), std::move(multimodalEmbedding),
2242-
std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
2243-
std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
2244-
returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
2245-
excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
2246-
std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
2247-
std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
2248-
std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
2249-
returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams,
2250-
cacheSaltID)
2251-
{
2252-
}
2253-
2254-
// 49 parameters, 49 parameters in Base class constructor
2202+
// inherit constructors
2203+
using Base::Base;
2204+
22552205
LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
22562206
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
22572207
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
@@ -2286,7 +2236,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
22862236
std::optional<SizeType32> languageAdapterUid = std::nullopt,
22872237
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
22882238
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
2289-
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
2239+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt, std::optional<TimePoint> arrivalTime = std::nullopt)
22902240
: Base(requestId, maxNewTokens, std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),
22912241
samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
22922242
std::move(stopWordsList),
@@ -2316,37 +2266,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
23162266
inputTokenExtraIds ? std::make_optional(std::make_shared<VecTokenExtraIds>(std::move(*inputTokenExtraIds)))
23172267
: std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt),
23182268
numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
2319-
std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID)
2320-
{
2321-
}
2322-
2323-
// 32 parameters, 32 parameters in Base class constructor
2324-
LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
2325-
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
2326-
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
2327-
std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
2328-
std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
2329-
std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
2330-
std::optional<SizeType32> promptVocabSize = std::nullopt,
2331-
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
2332-
std::optional<TensorPtr> loraConfig = std::nullopt,
2333-
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
2334-
bool returnContextLogits = false, bool returnGenerationLogits = false,
2335-
std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
2336-
bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
2337-
bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
2338-
bool returnEncoderOutput = false, std::optional<RequestIdType> clientId = std::nullopt,
2339-
executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
2340-
std::optional<SizeType32> languageAdapterUid = std::nullopt,
2341-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
2342-
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
2343-
: Base(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId, padId,
2344-
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
2345-
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
2346-
lookaheadConfig, returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens),
2347-
std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
2348-
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
2349-
numReturnSequences, languageAdapterUid, contextPhaseParams, cacheSaltID)
2269+
std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID,
2270+
arrivalTime)
23502271
{
23512272
}
23522273

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
#include <ATen/ATen.h>
3434
#include <nanobind/nanobind.h>
35+
#include <nanobind/stl/chrono.h>
3536
#include <nanobind/stl/optional.h>
3637
#include <nanobind/stl/shared_ptr.h>
3738
#include <nanobind/stl/tuple.h>
@@ -289,7 +290,8 @@ void initBindings(nb::module_& m)
289290
std::optional<tb::LlmRequest::SizeType32> language_adapter_uid,
290291
std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
291292
std::optional<executor::ContextPhaseParams> context_phase_params,
292-
std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id)
293+
std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id,
294+
std::optional<tb::LlmRequest::TimePoint> arrival_time)
293295
{
294296
auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
295297
{
@@ -329,8 +331,8 @@ void initBindings(nb::module_& m)
329331
encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr,
330332
encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
331333
num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
332-
guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params,
333-
cache_salt_id};
334+
guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id,
335+
arrival_time};
334336
},
335337
nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
336338
nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
@@ -355,7 +357,8 @@ void initBindings(nb::module_& m)
355357
nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt,
356358
nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
357359
nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
358-
nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt)
360+
nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt,
361+
nb::arg("arrival_time") = std::nullopt)
359362
.def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, nb::arg("vocab_size"))
360363
.def(nb::init<tb::LlmRequest const&>())
361364
.def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),

cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
126126
mLanguageAdapterUid, //
127127
mAllottedTimeMs, //
128128
mContextPhaseParams, //
129-
mCacheSaltID //
129+
mCacheSaltID, //
130+
mPerfMetrics.timingMetrics.arrivalTime //
130131
);
131132
}

cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
8585
std::optional<SizeType32> languageAdapterUid = std::nullopt,
8686
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
8787
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
88-
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
88+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,
89+
std::optional<TimePoint> arrivalTime = std::nullopt)
8990
: Base(requestId, //
9091
maxNewTokens, //
9192
std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)), //
@@ -147,7 +148,8 @@ class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
147148
languageAdapterUid, //
148149
allottedTimeMs, //
149150
contextPhaseParams, //
150-
cacheSaltID //
151+
cacheSaltID, //
152+
arrivalTime //
151153
)
152154
{
153155
}

cpp/tensorrt_llm/nanobind/bindings.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <nanobind/nanobind.h>
2020
#include <nanobind/operators.h>
2121
#include <nanobind/stl/bind_vector.h>
22+
#include <nanobind/stl/chrono.h>
2223
#include <nanobind/stl/filesystem.h>
2324
#include <nanobind/stl/optional.h>
2425
#include <nanobind/stl/shared_ptr.h>
@@ -511,4 +512,6 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
511512
m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference);
512513
m.def("ipc_nvls_free", &tr::ipcNvlsFree);
513514
m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
515+
516+
m.def("steady_clock_now", []() { return std::chrono::steady_clock::now(); });
514517
}

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,8 @@ void initBindings(pybind11::module_& m)
295295
std::optional<tb::LlmRequest::SizeType32> language_adapter_uid,
296296
std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
297297
std::optional<executor::ContextPhaseParams> context_phase_params,
298-
std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id)
298+
std::optional<tb::LlmRequest::CacheSaltIDType> cache_salt_id,
299+
std::optional<tb::LlmRequest::TimePoint> arrival_time)
299300
{
300301
auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
301302
{
@@ -336,7 +337,7 @@ void initBindings(pybind11::module_& m)
336337
encoder_input_features_tensor_ptr, encoder_output_length, cross_attention_mask_tensor_ptr,
337338
llm_request_type, input_token_extra_ids, num_return_sequences, eagle_config,
338339
skip_cross_attn_blocks_tensor_ptr, return_perf_metrics, guided_decoding_params,
339-
language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id};
340+
language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id, arrival_time};
340341
}),
341342
py::arg("request_id"), py::arg("max_new_tokens"), py::arg("input_tokens"), py::arg("sampling_config"),
342343
py::arg("is_streaming"), py::arg("end_id") = std::nullopt, py::arg("pad_id") = std::nullopt,
@@ -362,7 +363,8 @@ void initBindings(pybind11::module_& m)
362363
py::arg("eagle_config") = std::nullopt, py::arg("skip_cross_attn_blocks") = std::nullopt,
363364
py::arg("return_perf_metrics") = false, py::arg("guided_decoding_params") = std::nullopt,
364365
py::arg("language_adapter_uid") = std::nullopt, py::arg("allotted_time_ms") = std::nullopt,
365-
py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt)
366+
py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt,
367+
py::arg("arrival_time") = std::nullopt)
366368
.def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, py::arg("vocab_size"))
367369
.def(py::init<tb::LlmRequest const&>())
368370
.def("validate", &tb::LlmRequest::validate, py::arg("max_input_len"), py::arg("max_seq_len"),

cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
125125
mLanguageAdapterUid, //
126126
mAllottedTimeMs, //
127127
mContextPhaseParams, //
128-
mCacheSaltID //
128+
mCacheSaltID, //
129+
mPerfMetrics.timingMetrics.arrivalTime //
129130
);
130131
}

0 commit comments

Comments (0)