--wip--

biswapanda · biswapanda · commit 4e5e26489a61 · 2025-08-17T21:40:06.000-07:00
diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs
@@ -337,12 +337,14 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
         match self.inner.client.instance_source.as_ref() {
             InstanceSource::Static => self.inner.r#static(request).await,
             InstanceSource::Dynamic(_) => {
-                // Extract context ID for request tracking
-                let context_id = request.context().id().to_string();
-                let (instance_id, overlap_amount) = self
-                    .chooser
-                    .find_best_match(&context_id, &request.token_ids)
-                    .await?;
+                let (instance_id, overlap_amount) = if let Some(id) = request.backend_instance_id {
+                    // `backend_instance_id` is set: skip matching and report zero overlap
+                    (id, 0)
+                } else {
+                    // Otherwise, find the best match
+                    self.chooser.find_best_match(&request.token_ids).await?
+                };
+
                 let query_instance_id = request.has_annotation("query_instance_id");
                 // Extract context information before moving the request
                 let stream_context = request.context().clone();
diff --git a/lib/llm/src/preprocessor.rs b/lib/llm/src/preprocessor.rs
@@ -254,6 +254,10 @@ impl OpenAIPreprocessor {
         builder.annotations(request.annotations().unwrap_or_default());
         builder.mdc_sum(Some(self.mdcsum.clone()));
         builder.estimated_prefix_hit_num_blocks(None);
+        // Extract backend_instance_id from nvext if present
+        if let Some(nvext) = request.nvext() {
+            builder.backend_instance_id(nvext.backend_instance_id);
+        }
 
         Ok((builder.build()?, annotations))
     }
diff --git a/lib/llm/src/protocols/common/preprocessor.rs b/lib/llm/src/protocols/common/preprocessor.rs
@@ -50,6 +50,10 @@ pub struct PreprocessedRequest {
     /// Estimated number of prefix hit tokens (only used in kv aware routing)
     #[builder(default)]
     pub estimated_prefix_hit_num_blocks: Option<u32>,
+
+    /// Targeted backend instance ID for the request
+    #[builder(default)]
+    pub backend_instance_id: Option<i64>,
 }
 
 impl PreprocessedRequest {
diff --git a/lib/llm/src/protocols/openai/nvext.rs b/lib/llm/src/protocols/openai/nvext.rs
@@ -87,6 +87,13 @@ pub struct NvExt {
     #[serde(default, skip_serializing_if = "Option::is_none")]
     #[builder(default, setter(strip_option))]
     pub guided_decoding_backend: Option<String>,
+
+    /// Targeted backend instance ID for the request
+    /// If set, the request will be routed to the backend instance with the given ID.
+    /// If not set, the request will be routed to the best-matching instance.
+    #[builder(default, setter(strip_option))]
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub backend_instance_id: Option<i64>,
 }
 
 impl Default for NvExt {