From 030b063549d39cf09e93fd833a6e16fdb911bd0c Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Fri, 25 Jul 2025 09:54:39 -0700 Subject: [PATCH 1/5] --wip-- --- lib/llm/src/kv_router.rs | 14 ++++++++------ lib/llm/src/preprocessor.rs | 4 ++++ lib/llm/src/protocols/common/preprocessor.rs | 4 ++++ lib/llm/src/protocols/openai/nvext.rs | 7 +++++++ 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs index 99b9a9432d..0a7bb749a7 100644 --- a/lib/llm/src/kv_router.rs +++ b/lib/llm/src/kv_router.rs @@ -311,12 +311,14 @@ impl AsyncEngine, ManyOut self.inner.r#static(request).await, InstanceSource::Dynamic(_) => { - // Extract context ID for request tracking - let context_id = request.context().id().to_string(); - let (instance_id, overlap_amount) = self - .chooser - .find_best_match(&context_id, &request.token_ids) - .await?; + let (instance_id, overlap_amount) = if let Some(id) = request.backend_instance_id { + // If instance_id is set, use it + (id, 0) + } else { + // Otherwise, find the best match + self.chooser.find_best_match(&request.token_ids).await? + }; + let query_instance_id = request.has_annotation("query_instance_id"); // Extract context information before moving the request let stream_context = request.context().clone(); diff --git a/lib/llm/src/preprocessor.rs b/lib/llm/src/preprocessor.rs index eec8367806..1c577c18f0 100644 --- a/lib/llm/src/preprocessor.rs +++ b/lib/llm/src/preprocessor.rs @@ -251,6 +251,10 @@ impl OpenAIPreprocessor { builder.annotations(request.annotations().unwrap_or_default()); builder.mdc_sum(Some(self.mdcsum.clone())); builder.estimated_prefix_hit_num_blocks(None); + // Extract backend_instance_id from nvext if present + if let Some(nvext) = request.nvext() { + builder.backend_instance_id(nvext.backend_instance_id); + } Ok((builder.build()?, annotations)) } diff --git a/lib/llm/src/protocols/common/preprocessor.rs b/lib/llm/src/protocols/common/preprocessor.rs index 90f7845a2a..d1396411fe 100644 --- a/lib/llm/src/protocols/common/preprocessor.rs +++ b/lib/llm/src/protocols/common/preprocessor.rs @@ -55,6 +55,10 @@ pub struct PreprocessedRequest { /// Estimated number of prefix hit tokens (only used in kv aware routing) #[builder(default)] pub estimated_prefix_hit_num_blocks: Option, + + /// Targeted backend instance ID for the request + #[builder(default)] + pub backend_instance_id: Option, } impl PreprocessedRequest { diff --git a/lib/llm/src/protocols/openai/nvext.rs b/lib/llm/src/protocols/openai/nvext.rs index c11d6066dd..bce782b340 100644 --- a/lib/llm/src/protocols/openai/nvext.rs +++ b/lib/llm/src/protocols/openai/nvext.rs @@ -61,6 +61,13 @@ pub struct NvExt { #[serde(default, skip_serializing_if = "Option::is_none")] #[builder(default, setter(strip_option))] pub annotations: Option>, + + /// Targeted backend instance ID for the request + /// If set, the request will be routed to backend instance with the given ID. + /// If not set, the request will be routed to the best matching instance. + #[builder(default, setter(strip_option))] + #[serde(default, skip_serializing_if = "Option::is_none")] + pub backend_instance_id: Option, } impl Default for NvExt { From dfb0306c8ba41ce6eb0154941c3e2f8fef63aa4e Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 14 Aug 2025 16:28:50 -0700 Subject: [PATCH 2/5] fix call .find_best_match(&request.context_id --- lib/llm/src/kv_router.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs index 1e602e2d66..9eda895a3a 100644 --- a/lib/llm/src/kv_router.rs +++ b/lib/llm/src/kv_router.rs @@ -342,7 +342,9 @@ impl AsyncEngine, ManyOut Date: Thu, 14 Aug 2025 16:38:19 -0700 Subject: [PATCH 3/5] update lib/llm/src/kv_router.rs --- lib/llm/src/kv_router.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs index 9eda895a3a..7b87d08e55 100644 --- a/lib/llm/src/kv_router.rs +++ b/lib/llm/src/kv_router.rs @@ -337,13 +337,14 @@ impl AsyncEngine, ManyOut self.inner.r#static(request).await, InstanceSource::Dynamic(_) => { + let context_id = request.context().id().to_string(); let (instance_id, overlap_amount) = if let Some(id) = request.backend_instance_id { // If instance_id is set, use it (id, 0) } else { // Otherwise, find the best match self.chooser - .find_best_match(&request.context_id, &request.token_ids) + .find_best_match(&context_id, &request.token_ids) .await? }; From f5c3e534482dfce6c80d3cc0f6774fc28cd99c0a Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 14 Aug 2025 16:40:35 -0700 Subject: [PATCH 4/5] add comment --- lib/llm/src/kv_router.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs index 7b87d08e55..eb2d965aee 100644 --- a/lib/llm/src/kv_router.rs +++ b/lib/llm/src/kv_router.rs @@ -337,6 +337,7 @@ impl AsyncEngine, ManyOut self.inner.r#static(request).await, InstanceSource::Dynamic(_) => { + // Extract context ID for request tracking let context_id = request.context().id().to_string(); let (instance_id, overlap_amount) = if let Some(id) = request.backend_instance_id { // If instance_id is set, use it From 256a44be7d111284a5d38e2a33ee18a07a8dd86e Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Fri, 15 Aug 2025 10:18:27 -0700 Subject: [PATCH 5/5] add backend_instance_id: None, to 2 calls --- lib/llm/src/migration.rs | 1 + lib/llm/src/mocker/engine.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/lib/llm/src/migration.rs b/lib/llm/src/migration.rs index 588d7f3daa..cb70550960 100644 --- a/lib/llm/src/migration.rs +++ b/lib/llm/src/migration.rs @@ -188,6 +188,7 @@ mod tests { mdc_sum: None, annotations: vec![], estimated_prefix_hit_num_blocks: None, + backend_instance_id: None, } } diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index c7467daa86..6464a23a4b 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -647,6 +647,7 @@ mod integration_tests { mdc_sum: None, annotations: vec![format!("dp_rank:{dp_rank}")], estimated_prefix_hit_num_blocks: None, + backend_instance_id: None, }; let requests = vec![