Skip to content

Commit 23500b5

Browse files
authored
[TRTLLM-7398][feat] Support KV cache salting for secure KV cache reuse (#7106)
Signed-off-by: Chang Liu (Enterprise Products) <[email protected]> Signed-off-by: Chang Liu <[email protected]>
1 parent 12ecb86 commit 23500b5

File tree

32 files changed

+626
-65
lines changed

32 files changed

+626
-65
lines changed

benchmarks/cpp/disaggServerBenchmark.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,8 @@ texec::Request makeExecutorContextRequest(Sample const& sample, SizeType32 const
542542
std::nullopt, // kvCacheRetentionConfig
543543
std::nullopt, // logitsPostProcessorName
544544
std::nullopt, // logitsPostProcessor
545-
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
545+
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt,
546+
std::nullopt); // cacheSaltID
546547
request.setRequestType(tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_ONLY);
547548
return request;
548549
}

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -837,7 +837,8 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
837837
std::nullopt, // kvCacheRetentionConfig
838838
std::nullopt, // logitsPostProcessorName
839839
std::nullopt, // logitsPostProcessor
840-
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
840+
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt,
841+
std::nullopt); // cacheSaltID
841842
}
842843

843844
void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngineDir,

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ using UniqueToken = tensorrt_llm::runtime::UniqueToken;
6969
using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
7070
using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
7171
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
72+
using CacheSaltIDType = tensorrt_llm::runtime::CacheSaltIDType;
7273

7374
// Type alias for multimodal hash key (hash array + start offset)
7475
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
@@ -115,6 +116,7 @@ struct BlockKey
115116
// Extra keys for multimodal data (similar to VLLM's approach)
116117
// Each extra key is a pair of (mm_hash, start_offset_in_block)
117118
std::vector<MmKey> extraKeys;
119+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt;
118120

119121
BlockKey() = default;
120122

@@ -129,24 +131,25 @@ struct BlockKey
129131
}
130132

131133
explicit BlockKey(bool usesExtraIds, std::optional<LoraTaskIdType> loraTaskId, VecUniqueTokens uniqueTokens,
132-
std::vector<MmKey> extraKeys = {})
134+
std::vector<MmKey> extraKeys = {}, std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
133135
: usesExtraIds{usesExtraIds}
134136
, loraTaskId{loraTaskId}
135137
, uniqueTokens{std::move(uniqueTokens)}
136138
, extraKeys{std::move(extraKeys)}
139+
, cacheSaltID{cacheSaltID}
137140
{
138141
}
139142

140143
bool operator==(BlockKey const& other) const noexcept
141144
{
142145
return (usesExtraIds == other.usesExtraIds && loraTaskId == other.loraTaskId
143-
&& uniqueTokens == other.uniqueTokens && extraKeys == other.extraKeys);
146+
&& uniqueTokens == other.uniqueTokens && extraKeys == other.extraKeys && cacheSaltID == other.cacheSaltID);
144147
}
145148

146149
int partialMatch(BlockKey const& other) const noexcept
147150
{
148151
SizeType32 numMatched{0};
149-
if (loraTaskId == other.loraTaskId && extraKeys == other.extraKeys)
152+
if (loraTaskId == other.loraTaskId && extraKeys == other.extraKeys && cacheSaltID == other.cacheSaltID)
150153
{
151154
auto [matchEnd, otherMatchEnd] = std::mismatch(
152155
uniqueTokens.begin(), uniqueTokens.end(), other.uniqueTokens.begin(), other.uniqueTokens.end());

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ class GenericLlmRequest
100100
RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional<RequestIdType>)>;
101101
using RequestPtr = std::shared_ptr<GenericLlmRequest>;
102102
using MillisecondsType = std::chrono::milliseconds;
103+
using CacheSaltIDType = runtime::CacheSaltIDType;
103104

104-
// 49 parameters, 56 items in initialization list
105105
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> const& inputTokens,
106106
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
107107
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
@@ -137,7 +137,8 @@ class GenericLlmRequest
137137
std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
138138
std::optional<SizeType32> languageAdapterUid = std::nullopt,
139139
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
140-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
140+
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
141+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
141142
: mRequestId(requestId)
142143
, mPromptLen(inputTokens->size())
143144
, mMaxNewTokens(maxNewTokens)
@@ -194,6 +195,7 @@ class GenericLlmRequest
194195
, mGuidedDecodingParams(std::move(guidedDecodingParams))
195196
, mLanguageAdapterUid(languageAdapterUid)
196197
, mAllottedTimeMs(allottedTimeMs)
198+
, mCacheSaltID(cacheSaltID)
197199
{
198200
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
199201
{
@@ -203,7 +205,6 @@ class GenericLlmRequest
203205
initialize(*inputTokens, returnLogProbs);
204206
}
205207

206-
// 32 parameters, 39 items in initialization list
207208
GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
208209
runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
209210
std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
@@ -221,7 +222,8 @@ class GenericLlmRequest
221222
bool returnEncoderOutput = false, std::optional<RequestIdType> clientId = std::nullopt,
222223
executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
223224
std::optional<SizeType32> languageAdapterUid = std::nullopt,
224-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
225+
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
226+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
225227
: mRequestId(requestId)
226228
, mPromptLen(inputTokens.size())
227229
, mMaxNewTokens(maxNewTokens)
@@ -261,6 +263,7 @@ class GenericLlmRequest
261263
, mContextPhaseParams(contextPhaseParams)
262264
, mNumReturnSequences(numReturnSequences)
263265
, mLanguageAdapterUid(languageAdapterUid)
266+
, mCacheSaltID(cacheSaltID)
264267
{
265268
if (mEncoderTokens.has_value())
266269
{
@@ -269,7 +272,6 @@ class GenericLlmRequest
269272
initialize(inputTokens, returnLogProbs);
270273
}
271274

272-
// 29 items in initialization list
273275
GenericLlmRequest(RequestIdType requestId, executor::Request const& req)
274276
: mRequestId(requestId)
275277
, mPromptLen(req.getInputTokenIds().size())
@@ -300,6 +302,7 @@ class GenericLlmRequest
300302
, mGuidedDecodingParams(req.getGuidedDecodingParams())
301303
, mLanguageAdapterUid(req.getLanguageAdapterUid())
302304
, mAllottedTimeMs(req.getAllottedTimeMs())
305+
, mCacheSaltID(req.getCacheSaltID())
303306
{
304307
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
305308
{
@@ -1764,6 +1767,11 @@ class GenericLlmRequest
17641767
return mLanguageAdapterUid;
17651768
}
17661769

1770+
[[nodiscard]] std::optional<CacheSaltIDType> getCacheSaltID() const
1771+
{
1772+
return mCacheSaltID;
1773+
}
1774+
17671775
std::vector<SizeType32> getLanguageAdapterRouting(
17681776
SizeType32 const reqNumLanguages, SizeType32 const inputLength) const
17691777
{
@@ -2042,6 +2050,9 @@ class GenericLlmRequest
20422050

20432051
bool mUseDraftModel{false};
20442052

2053+
// Cache salt id for each request.
2054+
std::optional<CacheSaltIDType> mCacheSaltID{std::nullopt};
2055+
20452056
private:
20462057
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
20472058
{
@@ -2222,7 +2233,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
22222233
std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
22232234
std::optional<SizeType32> languageAdapterUid = std::nullopt,
22242235
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
2225-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
2236+
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
2237+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
22262238
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
22272239
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
22282240
std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalHashes),
@@ -2234,7 +2246,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
22342246
std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
22352247
std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
22362248
std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
2237-
returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams)
2249+
returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams,
2250+
cacheSaltID)
22382251
{
22392252
}
22402253

@@ -2272,7 +2285,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
22722285
std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
22732286
std::optional<SizeType32> languageAdapterUid = std::nullopt,
22742287
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
2275-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
2288+
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
2289+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
22762290
: Base(requestId, maxNewTokens, std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),
22772291
samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
22782292
std::move(stopWordsList),
@@ -2302,7 +2316,7 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
23022316
inputTokenExtraIds ? std::make_optional(std::make_shared<VecTokenExtraIds>(std::move(*inputTokenExtraIds)))
23032317
: std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt),
23042318
numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
2305-
std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams)
2319+
std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID)
23062320
{
23072321
}
23082322

@@ -2324,14 +2338,15 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
23242338
bool returnEncoderOutput = false, std::optional<RequestIdType> clientId = std::nullopt,
23252339
executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
23262340
std::optional<SizeType32> languageAdapterUid = std::nullopt,
2327-
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
2341+
std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt,
2342+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
23282343
: Base(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId, padId,
23292344
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
23302345
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
23312346
lookaheadConfig, returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens),
23322347
std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
23332348
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
2334-
numReturnSequences, languageAdapterUid, contextPhaseParams)
2349+
numReturnSequences, languageAdapterUid, contextPhaseParams, cacheSaltID)
23352350
{
23362351
}
23372352

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,7 @@ class Request
670670
/// @param allottedTimeMs The allotted time in milliseconds after which the request is cancelled with a timedOut
671671
/// finish reason. The request may exceed this time slightly, but at most by 1 forward pass (in pipeline parallelism
672672
/// that may involve multiple micro-batches). A request can be timed-out before ever being scheduled.
673-
// 34 parameters
673+
/// @param cacheSaltID Salt ID for KV cache blocks, restricting KV cache reuse to requests that share the same cache salt ID.
674674
Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false,
675675
SamplingConfig const& samplingConfig = SamplingConfig(), OutputConfig const& outputConfig = OutputConfig(),
676676
std::optional<SizeType32> const& endId = std::nullopt, std::optional<SizeType32> const& padId = std::nullopt,
@@ -697,7 +697,8 @@ class Request
697697
std::optional<EagleConfig> eagleConfig = std::nullopt, std::optional<Tensor> skipCrossAttnBlocks = std::nullopt,
698698
std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt,
699699
std::optional<SizeType32> languageAdapterUid = std::nullopt,
700-
std::optional<MillisecondsType> allottedTimeMs = std::nullopt);
700+
std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
701+
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt);
701702

702703
/// @brief This logits postprocessor name will dispatch to the batched logits postprocessor
703704
static auto constexpr kBatchedPostProcessorName = "batched";
@@ -745,6 +746,7 @@ class Request
745746
[[nodiscard]] std::optional<GuidedDecodingParams> getGuidedDecodingParams() const;
746747
[[nodiscard]] std::optional<SizeType32> getLanguageAdapterUid() const;
747748
[[nodiscard]] std::optional<MillisecondsType> getAllottedTimeMs() const;
749+
[[nodiscard]] std::optional<CacheSaltIDType> getCacheSaltID() const;
748750
[[nodiscard]] std::optional<std::vector<std::string>> getAdditionalOutputNames() const;
749751

750752
void setStreaming(bool streaming);
@@ -780,6 +782,7 @@ class Request
780782
void setGuidedDecodingParams(GuidedDecodingParams const& guidedDecodingParams);
781783
void setLanguageAdapterUid(SizeType32 languageAdapterUid);
782784
void setAllottedTimeMs(MillisecondsType allottedTimeMs);
785+
void setCacheSaltID(CacheSaltIDType cacheSaltID);
783786

784787
private:
785788
friend class Serialization;

cpp/include/tensorrt_llm/executor/types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ using RandomSeedType = std::uint64_t;
5858
using VecLogProbs = std::vector<FloatType>;
5959
using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>;
6060
using MillisecondsType = std::chrono::milliseconds;
61+
using CacheSaltIDType = std::uint64_t;
6162
using LogitsPostProcessor
6263
= std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr const&, std::optional<IdType>)>;
6364
using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>;

cpp/include/tensorrt_llm/runtime/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ using TokenIdType = std::int32_t;
4444
using LoraTaskIdType = std::uint64_t;
4545
using TokenExtraIdType = std::uint64_t;
4646
using VecTokenExtraIds = std::vector<TokenExtraIdType>;
47+
using CacheSaltIDType = std::uint64_t;
4748

4849
struct UniqueToken
4950
{

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ std::vector<MmKey> generateBlockHashExtraKeys(
131131
// Check if this multimodal content overlaps with the current block
132132
if (endTokenIdx > startPos && startTokenIdx < startPos + length)
133133
{
134-
SizeType32 mmStartInBlock = (startPos >= startTokenIdx) ? 0 : startTokenIdx - startPos;
134+
uint64_t mmStartInBlock = (startPos >= startTokenIdx) ? 0 : static_cast<uint64_t>(startTokenIdx - startPos);
135135
extraKeys.emplace_back(mmHashArray, mmStartInBlock);
136136
}
137137
}
@@ -151,7 +151,7 @@ std::vector<BlockKey> buildBlockKeys(
151151
currentTokenIdx += uniqueTokens.size();
152152

153153
blockKeys.emplace_back(llmRequest.getInputTokensExtraIds().has_value(), llmRequest.getLoraTaskId(),
154-
std::move(uniqueTokens), std::move(extraKeys));
154+
std::move(uniqueTokens), std::move(extraKeys), llmRequest.getCacheSaltID());
155155
}
156156
return blockKeys;
157157
}
@@ -167,6 +167,16 @@ size_t BlockKeyHasher::hash(BlockKey const& blockKey, std::size_t parentHash) no
167167
// Constants provide very good distribution - each input bit affects each output bit with ~50% probability.
168168
size_t seed = blockKey.uniqueTokens.size() ^ parentHash * UINT64_C(0xbf58476d1ce4e5b9);
169169

170+
if (parentHash == 0 && blockKey.cacheSaltID)
171+
{
172+
// Only hashing the cache salt ID for the first block in the sequence
173+
uint64_t c = blockKey.cacheSaltID.value();
174+
c = (c ^ (c >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
175+
c = (c ^ (c >> 27)) * UINT64_C(0x94d049bb133111eb);
176+
c = c ^ (c >> 31);
177+
seed ^= c + 0x9e3779b9 + (seed << 6) + (seed >> 2);
178+
}
179+
170180
for (auto const& uniqueToken : blockKey.uniqueTokens)
171181
{
172182
uint32_t a = static_cast<uint32_t>(uniqueToken.tokenId);

cpp/tensorrt_llm/executor/request.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525

2626
namespace tensorrt_llm::executor
2727
{
28-
// 36 parameters
2928
Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming, SamplingConfig const& samplingConfig,
3029
OutputConfig const& outputConfig, std::optional<SizeType32> const& endId, std::optional<SizeType32> const& padId,
3130
std::optional<std::vector<SizeType32>> positionIds, std::optional<std::list<VecTokens>> badWords,
@@ -41,7 +40,7 @@ Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming,
4140
std::optional<SizeType32> encoderOutputLength, std::optional<Tensor> crossAttentionMask,
4241
SizeType32 numReturnSequences, std::optional<EagleConfig> eagleConfig, std::optional<Tensor> skipCrossAttnBlocks,
4342
std::optional<GuidedDecodingParams> guidedDecodingParams, std::optional<SizeType32> languageAdapterUid,
44-
std::optional<MillisecondsType> allottedTimeMs)
43+
std::optional<MillisecondsType> allottedTimeMs, std::optional<CacheSaltIDType> cacheSaltID)
4544
: mImpl(std::make_unique<Impl>(std::move(inputTokenIds), maxTokens, streaming, samplingConfig, outputConfig, endId,
4645
padId, std::move(positionIds), std::move(badWords), std::move(stopWords), std::move(embeddingBias),
4746
std::move(externalDraftTokensConfig), std::move(pTuningConfig), std::move(multimodalInput),
@@ -50,7 +49,7 @@ Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming,
5049
std::move(encoderInputTokenIds), clientId, returnAllGeneratedTokens, priority, type,
5150
std::move(contextPhaseParams), std::move(encoderInputFeatures), encoderOutputLength, crossAttentionMask,
5251
numReturnSequences, eagleConfig, skipCrossAttnBlocks, std::move(guidedDecodingParams), languageAdapterUid,
53-
allottedTimeMs))
52+
allottedTimeMs, cacheSaltID))
5453
{
5554
}
5655

@@ -249,6 +248,11 @@ std::optional<SizeType32> Request::getLanguageAdapterUid() const
249248
return mImpl->getLanguageAdapterUid();
250249
}
251250

251+
std::optional<CacheSaltIDType> Request::getCacheSaltID() const
252+
{
253+
return mImpl->getCacheSaltID();
254+
}
255+
252256
void Request::setStreaming(bool streaming)
253257
{
254258
mImpl->setStreaming(streaming);
@@ -413,4 +417,9 @@ void Request::setLanguageAdapterUid(SizeType32 languageAdapterUid)
413417
{
414418
return mImpl->setLanguageAdapterUid(languageAdapterUid);
415419
}
420+
421+
void Request::setCacheSaltID(CacheSaltIDType cacheSaltID)
422+
{
423+
return mImpl->setCacheSaltID(cacheSaltID);
424+
}
416425
} // namespace tensorrt_llm::executor

0 commit comments

Comments
 (0)