Skip to content

Commit 4e5e264

Browse files
committed
--wip--
1 parent d177cdf commit 4e5e264

File tree

4 files changed

+23
-6
lines changed

4 files changed

+23
-6
lines changed

lib/llm/src/kv_router.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -337,12 +337,14 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
337337
match self.inner.client.instance_source.as_ref() {
338338
InstanceSource::Static => self.inner.r#static(request).await,
339339
InstanceSource::Dynamic(_) => {
340-
// Extract context ID for request tracking
341-
let context_id = request.context().id().to_string();
342-
let (instance_id, overlap_amount) = self
343-
.chooser
344-
.find_best_match(&context_id, &request.token_ids)
345-
.await?;
340+
let (instance_id, overlap_amount) = if let Some(id) = request.backend_instance_id {
341+
// If the request carries a backend_instance_id, route to it directly and report an overlap of 0 (the chooser is not consulted)
342+
(id, 0)
343+
} else {
344+
// Otherwise, find the best match
345+
self.chooser.find_best_match(&request.token_ids).await?
346+
};
347+
346348
let query_instance_id = request.has_annotation("query_instance_id");
347349
// Extract context information before moving the request
348350
let stream_context = request.context().clone();

lib/llm/src/preprocessor.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@ impl OpenAIPreprocessor {
254254
builder.annotations(request.annotations().unwrap_or_default());
255255
builder.mdc_sum(Some(self.mdcsum.clone()));
256256
builder.estimated_prefix_hit_num_blocks(None);
257+
// Extract backend_instance_id from nvext if present
258+
if let Some(nvext) = request.nvext() {
259+
builder.backend_instance_id(nvext.backend_instance_id);
260+
}
257261

258262
Ok((builder.build()?, annotations))
259263
}

lib/llm/src/protocols/common/preprocessor.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ pub struct PreprocessedRequest {
5050
/// Estimated number of prefix hit tokens (only used in kv aware routing)
5151
#[builder(default)]
5252
pub estimated_prefix_hit_num_blocks: Option<u32>,
53+
54+
/// Targeted backend instance ID for the request
55+
#[builder(default)]
56+
pub backend_instance_id: Option<i64>,
5357
}
5458

5559
impl PreprocessedRequest {

lib/llm/src/protocols/openai/nvext.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,13 @@ pub struct NvExt {
8787
#[serde(default, skip_serializing_if = "Option::is_none")]
8888
#[builder(default, setter(strip_option))]
8989
pub guided_decoding_backend: Option<String>,
90+
91+
/// Targeted backend instance ID for the request
92+
/// If set, the request will be routed to the backend instance with the given ID.
93+
/// If not set, the request will be routed to the best matching instance.
94+
#[builder(default, setter(strip_option))]
95+
#[serde(default, skip_serializing_if = "Option::is_none")]
96+
pub backend_instance_id: Option<i64>,
9097
}
9198

9299
impl Default for NvExt {

0 commit comments

Comments
 (0)