@@ -100,6 +100,7 @@ class GenericLlmRequest
100100 RequestIdType, TensorPtr&, BeamTokens const &, TStream const &, std::optional<RequestIdType>)>;
101101 using RequestPtr = std::shared_ptr<GenericLlmRequest>;
102102 using MillisecondsType = std::chrono::milliseconds;
103+ using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
103104 using CacheSaltIDType = runtime::CacheSaltIDType;
104105
105106 GenericLlmRequest (RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const & inputTokens,
@@ -138,7 +139,7 @@ class GenericLlmRequest
138139 std::optional<SizeType32> languageAdapterUid = std::nullopt ,
139140 std::optional<MillisecondsType> allottedTimeMs = std::nullopt ,
140141 std::optional<executor::ContextPhaseParams> const & contextPhaseParams = std::nullopt ,
141- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt )
142+ std::optional<CacheSaltIDType> cacheSaltID = std::nullopt , std::optional<TimePoint> arrivalTime = std::nullopt )
142143 : mRequestId (requestId)
143144 , mPromptLen (inputTokens->size ())
144145 , mMaxNewTokens(maxNewTokens)
@@ -202,7 +203,7 @@ class GenericLlmRequest
202203 mState = LlmRequestState::kENCODER_INIT ;
203204 }
204205
205- initialize (*inputTokens, returnLogProbs);
206+ initialize (*inputTokens, returnLogProbs, arrivalTime );
206207 }
207208
208209 GenericLlmRequest (RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const & inputTokens,
@@ -2054,7 +2055,8 @@ class GenericLlmRequest
20542055 std::optional<CacheSaltIDType> mCacheSaltID {std::nullopt };
20552056
20562057private:
2057- void initialize (VecTokens const & inputTokens, bool outputLogProbs)
2058+ void initialize (
2059+ VecTokens const & inputTokens, bool outputLogProbs, std::optional<TimePoint> arrivalTime = std::nullopt )
20582060 {
20592061 if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
20602062 {
@@ -2148,7 +2150,7 @@ class GenericLlmRequest
21482150
21492151 if (mReturnPerfMetrics )
21502152 {
2151- mPerfMetrics .timingMetrics .arrivalTime = std::chrono::steady_clock::now ();
2153+ mPerfMetrics .timingMetrics .arrivalTime = arrivalTime.value_or (std::chrono::steady_clock::now ());
21522154 }
21532155 mStartTime = std::chrono::steady_clock::now ();
21542156 }
@@ -2197,61 +2199,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
21972199 using TokenExtraIdType = Base::TokenExtraIdType;
21982200 using VecTokenExtraIds = Base::VecTokenExtraIds;
21992201
2200- // 49 parameters, 49 parameters in Base class constructor
2201- LlmRequest (RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
2202- runtime::SamplingConfig const & samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt ,
2203- std::optional<SizeType32> padId = std::nullopt , std::optional<TensorPtr> embeddingBias = std::nullopt ,
2204- std::optional<TensorPtr> badWordsList = std::nullopt , std::optional<TensorPtr> stopWordsList = std::nullopt ,
2205- std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt ,
2206- std::optional<TensorPtr> promptEmbeddingTable = std::nullopt ,
2207- std::optional<SizeType32> promptVocabSize = std::nullopt ,
2208- std::optional<std::shared_ptr<std::vector<std::vector<SizeType32>>>> multimodalHashes = std::nullopt ,
2209- std::optional<std::shared_ptr<std::vector<SizeType32>>> multimodalPositions = std::nullopt ,
2210- std::optional<std::shared_ptr<std::vector<SizeType32>>> multimodalLengths = std::nullopt ,
2211- std::optional<TensorPtr> multimodalEmbedding = std::nullopt ,
2212- std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt ,
2213- std::optional<SizeType32> mropePositionDeltas = std::nullopt ,
2214- std::optional<LoraTaskIdType> loraTaskId = std::nullopt , std::optional<TensorPtr> loraWeights = std::nullopt ,
2215- std::optional<TensorPtr> loraConfig = std::nullopt ,
2216- std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt ,
2217- std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt ,
2218- bool returnLogProbs = false , bool returnContextLogits = false , bool returnGenerationLogits = false ,
2219- std::optional<std::shared_ptr<VecTokens>> const & draftTokens = std::nullopt ,
2220- std::optional<TensorPtr> draftLogits = std::nullopt , bool excludeInputFromOutput = false ,
2221- std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt ,
2222- bool applyLogitsPostProcessorBatched = false ,
2223- std::optional<std::shared_ptr<VecTokens>> encoderInputTokens = std::nullopt , bool returnEncoderOutput = false ,
2224- std::optional<RequestIdType> clientId = std::nullopt ,
2225- executor::PriorityType priority = executor::Request::kDefaultPriority ,
2226- std::optional<TensorPtr> encoderInputFeatures = std::nullopt ,
2227- std::optional<SizeType32> encoderOutputLength = std::nullopt ,
2228- std::optional<TensorPtr> crossAttentionMask = std::nullopt ,
2229- LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
2230- std::optional<std::shared_ptr<VecTokenExtraIds>> inputTokenExtraIds = std::nullopt ,
2231- SizeType32 numReturnSequences = 1 , std::optional<executor::EagleConfig> eagleConfig = std::nullopt ,
2232- std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt , bool returnPerfMetrics = false ,
2233- std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt ,
2234- std::optional<SizeType32> languageAdapterUid = std::nullopt ,
2235- std::optional<MillisecondsType> allottedTimeMs = std::nullopt ,
2236- std::optional<executor::ContextPhaseParams> const & contextPhaseParams = std::nullopt ,
2237- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt )
2238- : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
2239- std::move (embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
2240- std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalHashes),
2241- std::move(multimodalPositions), std::move(multimodalLengths), std::move(multimodalEmbedding),
2242- std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
2243- std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
2244- returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
2245- excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
2246- std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
2247- std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
2248- std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
2249- returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams,
2250- cacheSaltID)
2251- {
2252- }
2253-
2254- // 49 parameters, 49 parameters in Base class constructor
2202+ // inherit constructors
2203+ using Base::Base;
2204+
22552205 LlmRequest (RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
22562206 runtime::SamplingConfig const & samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt ,
22572207 std::optional<SizeType32> padId = std::nullopt , std::optional<TensorPtr> embeddingBias = std::nullopt ,
@@ -2286,7 +2236,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
22862236 std::optional<SizeType32> languageAdapterUid = std::nullopt ,
22872237 std::optional<MillisecondsType> allottedTimeMs = std::nullopt ,
22882238 std::optional<executor::ContextPhaseParams> const & contextPhaseParams = std::nullopt ,
2289- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt )
2239+ std::optional<CacheSaltIDType> cacheSaltID = std::nullopt , std::optional<TimePoint> arrivalTime = std::nullopt )
22902240 : Base(requestId, maxNewTokens, std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),
22912241 samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
22922242 std::move (stopWordsList),
@@ -2316,37 +2266,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
23162266 inputTokenExtraIds ? std::make_optional(std::make_shared<VecTokenExtraIds>(std::move(*inputTokenExtraIds)))
23172267 : std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt ),
23182268 numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
2319- std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID)
2320- {
2321- }
2322-
2323- // 32 parameters, 32 parameters in Base class constructor
2324- LlmRequest (RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const & inputTokens,
2325- runtime::SamplingConfig const & samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt ,
2326- std::optional<SizeType32> padId = std::nullopt , std::optional<TensorPtr> embeddingBias = std::nullopt ,
2327- std::optional<TensorPtr> badWordsList = std::nullopt , std::optional<TensorPtr> stopWordsList = std::nullopt ,
2328- std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt ,
2329- std::optional<TensorPtr> promptEmbeddingTable = std::nullopt ,
2330- std::optional<SizeType32> promptVocabSize = std::nullopt ,
2331- std::optional<LoraTaskIdType> loraTaskId = std::nullopt , std::optional<TensorPtr> loraWeights = std::nullopt ,
2332- std::optional<TensorPtr> loraConfig = std::nullopt ,
2333- std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt , bool returnLogProbs = false ,
2334- bool returnContextLogits = false , bool returnGenerationLogits = false ,
2335- std::optional<VecTokens> draftTokens = std::nullopt , std::optional<TensorPtr> draftLogits = std::nullopt ,
2336- bool excludeInputFromOutput = false , std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt ,
2337- bool applyLogitsPostProcessorBatched = false , std::optional<VecTokens> encoderInputTokens = std::nullopt ,
2338- bool returnEncoderOutput = false , std::optional<RequestIdType> clientId = std::nullopt ,
2339- executor::PriorityType priority = executor::Request::kDefaultPriority , SizeType32 numReturnSequences = 1 ,
2340- std::optional<SizeType32> languageAdapterUid = std::nullopt ,
2341- std::optional<executor::ContextPhaseParams> const & contextPhaseParams = std::nullopt ,
2342- std::optional<CacheSaltIDType> cacheSaltID = std::nullopt )
2343- : Base(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId, padId,
2344- std::move (embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
2345- std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
2346- lookaheadConfig, returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens),
2347- std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
2348- applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
2349- numReturnSequences, languageAdapterUid, contextPhaseParams, cacheSaltID)
2269+ std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID,
2270+ arrivalTime)
23502271 {
23512272 }
23522273
0 commit comments