From 094a0bf0be030143748d2cf5cf879f05d05e3fb5 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Mon, 15 Sep 2025 18:01:53 +0000 Subject: [PATCH 01/11] [bugfix] fix kvaware routing Signed-off-by: Rui Zhang --- .github/workflows/router-e2e-test.yml | 2 +- docker/Dockerfile | 9 +++-- helm/templates/deployment-router.yaml | 2 + helm/templates/deployment-vllm-multi.yaml | 2 + pyproject.toml | 1 + src/vllm_router/app.py | 5 +++ src/vllm_router/routers/routing_logic.py | 45 +++++++++++++++++++---- 7 files changed, 54 insertions(+), 12 deletions(-) diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml index 6430547d8..eec857cc1 100644 --- a/.github/workflows/router-e2e-test.yml +++ b/.github/workflows/router-e2e-test.yml @@ -124,7 +124,7 @@ jobs: echo "🔨 Building router docker image" cd ${{ github.workspace }} eval "$(minikube docker-env)" - docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware . + docker build -t git-act-router -f docker/Dockerfile . - name: Run all k8s discovery routing tests run: | diff --git a/docker/Dockerfile b/docker/Dockerfile index 0246105d2..9fe7646b4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,9 +19,12 @@ ARG INSTALL_OPTIONAL_DEP=semantic_cache,lmcache ENV INSTALL_OPTIONAL_DEP=${INSTALL_OPTIONAL_DEP} # Install dependencies (use cache, and delete after install, to speed up the build) -RUN pip install --upgrade --no-cache-dir pip setuptools_scm && \ - pip install --no-cache-dir .[$INSTALL_OPTIONAL_DEP] +RUN pip install --no-cache-dir uv && \ + uv venv /opt/venv && \ + . /opt/venv/bin/activate && \ + uv pip install --upgrade --no-cache-dir pip setuptools_scm && \ + uv pip install --no-cache-dir .[$INSTALL_OPTIONAL_DEP] # Set the entrypoint -ENTRYPOINT ["vllm-router"] +ENTRYPOINT ["/opt/venv/bin/vllm-router"] CMD [] diff --git a/helm/templates/deployment-router.yaml b/helm/templates/deployment-router.yaml index d81da3f6f..cf735ffd9 100644 --- a/helm/templates/deployment-router.yaml +++ b/helm/templates/deployment-router.yaml @@ -50,6 +50,8 @@ spec: - name: HF_TOKEN value: "{{ .Values.routerSpec.hf_token }}" {{- end }} + - name: PYTHONHASHSEED + value: "123" - name: LMCACHE_LOG_LEVEL value: "DEBUG" {{- if .Values.servingEngineSpec.enableEngine -}} diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 207be3297..453eaa550 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -190,6 +190,8 @@ spec: {{- end }} imagePullPolicy: "{{ .Values.servingEngineSpec.imagePullPolicy | default "Always" }}" env: + - name: PYTHONHASHSEED + value: "123" - name: HF_HOME {{- if hasKey $modelSpec "pvcStorage" }} value: /data diff --git a/pyproject.toml b/pyproject.toml index 6643c744d..e23a9ae32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ semantic_cache = [ ] lmcache = [ "lmcache==0.3.5", + "vllm==0.10.1.1", ] test = [ "pytest>=8.3.4", diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py index 9b99f3b01..112b9a2c0 100644 --- a/src/vllm_router/app.py +++ b/src/vllm_router/app.py @@ -33,6 +33,7 @@ from vllm_router.routers.main_router import main_router from vllm_router.routers.metrics_router import metrics_router from vllm_router.routers.routing_logic import ( + cleanup_routing_logic, get_routing_logic, initialize_routing_logic, ) @@ -111,6 +112,10 @@ async def lifespan(app: FastAPI): logger.info("Closing dynamic config watcher") dyn_cfg_watcher.close() + # Close 
routing logic instances + logger.info("Closing routing logic instances") + cleanup_routing_logic() + def initialize_all(app: FastAPI, args): """ diff --git a/src/vllm_router/routers/routing_logic.py b/src/vllm_router/routers/routing_logic.py index 86b05518d..63f8f0638 100644 --- a/src/vllm_router/routers/routing_logic.py +++ b/src/vllm_router/routers/routing_logic.py @@ -14,6 +14,7 @@ import abc import asyncio +import concurrent.futures import enum import math import random @@ -265,7 +266,9 @@ def start_kv_manager(self): self.loop = asyncio.new_event_loop() self.thread = threading.Thread(target=self.loop.run_forever, daemon=True) self.thread.start() - asyncio.run_coroutine_threadsafe(self.kv_manager.start_all(), self.loop) + self.lmcache_cluster_monitor_task = asyncio.run_coroutine_threadsafe( + self.kv_manager.start_all(), self.loop + ) def query_manager(self, msg) -> str: """ @@ -274,6 +277,20 @@ def query_manager(self, msg) -> str: instance_id = self.kv_manager.handle_orchestration_message(msg) return instance_id + def close(self): + """Gracefully shutdown the lmcache cluster monitor task.""" + if ( + hasattr(self, "lmcache_cluster_monitor_task") + and self.lmcache_cluster_monitor_task + ): + logger.info("Shutting down lmcache cluster monitor task") + self.lmcache_cluster_monitor_task.cancel() + try: + self.lmcache_cluster_monitor_task.result() + except concurrent.futures.CancelledError: + pass + self.lmcache_cluster_monitor_task = None + async def route_request( self, endpoints: List[EndpointInfo], @@ -323,8 +340,10 @@ async def route_request( event_id = "Lookup" + str(uuid.uuid4()) logger.debug(f"Lookup event id: {event_id}") msg = LookupMsg(tokens=token_ids, event_id=event_id) + logger.debug(f"Lookup message: {msg}") instance_id = await self.query_manager(msg) matched_tokens = math.inf + logger.debug(f"Instance id: {instance_id}") if len(list(instance_id.layout_info.keys())) > 0: matched_instance_id = list(instance_id.layout_info.keys())[ 0 @@ -359,8 +378,9 @@ async def route_request( ].split("//")[1], event_id=event_id, ) + logger.debug(f"QueryInst message: {query_message}") endpoint_instance_id = await self.query_manager(query_message) - + logger.debug(f"Endpoint instance id: {endpoint_instance_id}") self.instance_id_to_ip[endpoint_instance_id.instance_id] = ( endpoint.url ) @@ -528,19 +548,26 @@ def reconfigure_routing_logic( routing_logic: RoutingLogic, *args, **kwargs ) -> RoutingInterface: # Remove the existing routers from the singleton registry + cleanup_routing_logic() + return initialize_routing_logic(routing_logic, *args, **kwargs) + + +def get_routing_logic() -> RoutingInterface: + # Look up in our singleton registry which router (if any) has been created. for cls in ( SessionRouter, RoundRobinRouter, KvawareRouter, + PrefixAwareRouter, DisaggregatedPrefillRouter, ): if cls in SingletonABCMeta._instances: - del SingletonABCMeta._instances[cls] - return initialize_routing_logic(routing_logic, *args, **kwargs) + return cls() + raise ValueError("The global router has not been initialized") -def get_routing_logic() -> RoutingInterface: - # Look up in our singleton registry which router (if any) has been created. 
+def cleanup_routing_logic(): + """Clean up all routing logic instances.""" for cls in ( SessionRouter, RoundRobinRouter, @@ -549,5 +576,7 @@ def get_routing_logic() -> RoutingInterface: DisaggregatedPrefillRouter, ): if cls in SingletonABCMeta._instances: - return cls() - raise ValueError("The global router has not been initialized") + instance = cls() + if hasattr(instance, "close"): + instance.close() + del SingletonABCMeta._instances[cls] From 7408cb17ef48dbf6ac5642012efe9e7fcbb32ab0 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 16 Sep 2025 01:01:17 +0000 Subject: [PATCH 02/11] Modify CI to be compatible Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 17 +++++++++++++---- .github/values-07-prefix-routing.yaml | 17 +++++++++++++---- .github/values-08-roundrobin-routing.yaml | 17 +++++++++++++---- .github/values-09-kvaware-routing.yaml | 17 +++++++++++++---- .github/values-10-disagg-prefill.yaml | 17 +++++++++++++---- 5 files changed, 65 insertions(+), 20 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 43974f94e..16747fcb4 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -81,6 +89,7 @@ routerSpec: type: Recreate enableRouter: true routingLogic: "session" + lmcacheControllerPort: 9000 sessionKey: "x-user-id" extraArgs: - "--log-level" diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index 4b8bf76af..dd1b2aff1 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 
requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -84,3 +92,4 @@ routerSpec: extraArgs: - "--log-level" - "info" + lmcacheControllerPort: 9000 diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index e9362eee6..23be0c106 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -84,3 +92,4 @@ routerSpec: extraArgs: - "--log-level" - "info" + lmcacheControllerPort: 9000 diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index ac58c26f6..09659422b 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -84,3 +92,4 @@ routerSpec: extraArgs: - "--log-level" - "info" + lmcacheControllerPort: 9000 diff --git a/.github/values-10-disagg-prefill.yaml b/.github/values-10-disagg-prefill.yaml index 548d284f5..236b46d33 100644 --- a/.github/values-10-disagg-prefill.yaml +++ b/.github/values-10-disagg-prefill.yaml @@ -9,10 +9,10 @@ servingEngineSpec: # 
Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -34,6 +34,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -43,10 +47,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -66,6 +70,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -90,6 +98,7 @@ routerSpec: engineScrapeInterval: 15 requestStatsWindow: 60 enablePD: true + lmcacheControllerPort: 9000 resources: requests: cpu: "4" From 8c338623cc33393098bb7a714045dd2a05f40bfc Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 16 Sep 2025 01:03:36 +0000 Subject: [PATCH 03/11] Revert "fix dynamic config" This reverts commit 9ac02f992e58fa102a891d3dfe903142bf913f0b. Signed-off-by: Rui Zhang --- src/vllm_router/service_discovery.py | 42 ++-------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index eca70bb9a..12f3a694a 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -226,7 +226,6 @@ def __init__( self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))] self.added_timestamp = int(time.time()) self.unhealthy_endpoint_hashes = [] - self._running = True if static_backend_health_checks: self.start_health_check_task() self.prefill_model_labels = prefill_model_labels @@ -251,13 +250,10 @@ def get_unhealthy_endpoint_hashes(self) -> list[str]: return unhealthy_endpoints async def check_model_health(self): - while self._running: + while True: try: self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes() - await asyncio.sleep(60) - except asyncio.CancelledError: - logger.debug("Health check task cancelled") - break + time.sleep(60) except Exception as e: logger.error(e) @@ -344,40 +340,6 @@ async def initialize_client_sessions(self) -> None: timeout=aiohttp.ClientTimeout(total=None), ) - def close(self): - """ - Close the service discovery module and clean up health check resources. 
- """ - self._running = False - if hasattr(self, "loop") and self.loop.is_running(): - # Schedule a coroutine to gracefully shut down the event loop - async def shutdown(): - tasks = [ - t - for t in asyncio.all_tasks(self.loop) - if t is not asyncio.current_task() - ] - for task in tasks: - task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - self.loop.stop() - - future = asyncio.run_coroutine_threadsafe(shutdown(), self.loop) - try: - future.result(timeout=15.0) - except asyncio.TimeoutError: - logger.warning( - "Timed out waiting for shutdown(loop might already be closed)" - ) - except Exception as e: - logger.warning(f"Error during health check shutdown: {e}") - - if hasattr(self, "thread") and self.thread.is_alive(): - self.thread.join(timeout=5.0) - - if hasattr(self, "loop") and not self.loop.is_closed(): - self.loop.close() - class K8sPodIPServiceDiscovery(ServiceDiscovery): def __init__( From 06b933be3fde90e9ac1df2cd40a6920815ba0962 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 16 Sep 2025 18:53:25 +0000 Subject: [PATCH 04/11] modify CI Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 58 ++++++++++------------- .github/values-07-prefix-routing.yaml | 58 ++++++++++------------- .github/values-08-roundrobin-routing.yaml | 58 ++++++++++------------- .github/values-09-kvaware-routing.yaml | 58 ++++++++++------------- tutorials/assets/values-17-kv-aware.yaml | 22 ++++----- 5 files changed, 111 insertions(+), 143 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 16747fcb4..3d9156b91 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - 
kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -89,6 +74,13 @@ routerSpec: type: Recreate enableRouter: true routingLogic: "session" + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" lmcacheControllerPort: 9000 sessionKey: "x-user-id" extraArgs: diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index dd1b2aff1..38f009421 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" 
chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -87,6 +72,13 @@ routerSpec: imagePullPolicy: "IfNotPresent" strategy: type: Recreate + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" enableRouter: true routingLogic: "prefixaware" extraArgs: diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index 23be0c106..3ad0ca0b5 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -92,4 +77,11 @@ routerSpec: extraArgs: - "--log-level" - "info" + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" lmcacheControllerPort: 9000 diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 
09659422b..9642ae1ac 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -87,6 +72,13 @@ routerSpec: imagePullPolicy: "IfNotPresent" strategy: type: Recreate + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" enableRouter: true routingLogic: "kvaware" extraArgs: diff --git a/tutorials/assets/values-17-kv-aware.yaml b/tutorials/assets/values-17-kv-aware.yaml index 7c17e8cd5..8b45e459b 100644 --- a/tutorials/assets/values-17-kv-aware.yaml +++ b/tutorials/assets/values-17-kv-aware.yaml @@ -3,7 +3,7 @@ servingEngineSpec: modelSpec: - name: "llama1" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -19,9 +19,9 @@ servingEngineSpec: cpuOffloadingBufferSize: "60" enableController: true instanceId: "default1" - controllerPort: "9000" + controllerPort: 9000 workerPort: 8001 - 
distributedUrl: "localhost:8201" + distributedUrl: "localhost:30081" env: - name: LMCACHE_LOG_LEVEL @@ -29,7 +29,7 @@ servingEngineSpec: hf_token: - name: "llama2" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -45,9 +45,9 @@ servingEngineSpec: cpuOffloadingBufferSize: "60" enableController: true instanceId: "default2" - controllerPort: "9000" + controllerPort: 9000 workerPort: 8002 - distributedUrl: "localhost:8202" + distributedUrl: "localhost:30082" env: - name: LMCACHE_LOG_LEVEL @@ -56,7 +56,7 @@ servingEngineSpec: - name: "llama3" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -72,9 +72,9 @@ servingEngineSpec: cpuOffloadingBufferSize: "60" enableController: true instanceId: "default3" - controllerPort: "9000" + controllerPort: 9000 workerPort: 8003 - distributedUrl: "localhost:8203" + distributedUrl: "localhost:30083" env: - name: LMCACHE_LOG_LEVEL @@ -82,7 +82,7 @@ servingEngineSpec: hf_token: - name: "llama4" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -100,7 +100,7 @@ servingEngineSpec: instanceId: "default4" controllerPort: "9000" workerPort: 8004 - distributedUrl: "localhost:8204" + distributedUrl: "localhost:30084" env: - name: LMCACHE_LOG_LEVEL From 57c69ae2e3cdc39a10933ea089ce438ff29657d1 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:06:40 +0000 Subject: [PATCH 05/11] bugfix: fix bug for kvaware routing to be compatiable with lmcache 0.3.9 Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 12 +-- .github/values-07-prefix-routing.yaml | 12 +-- .github/values-08-roundrobin-routing.yaml | 12 +-- .github/values-09-kvaware-routing.yaml | 12 +-- helm/templates/deployment-vllm-multi.yaml | 27 ++++-- tutorials/17-kv-aware-routing.md | 4 +- tutorials/assets/values-17-kv-aware.yaml | 106 ++++------------------ 7 files changed, 62 insertions(+), 123 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 3d9156b91..708656ecf 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ -51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index 38f009421..cba67c46d 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ 
-51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index 3ad0ca0b5..b3d8063b2 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ -51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 9642ae1ac..195e0f1ab 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ -51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 453eaa550..971fa10dc 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -321,18 +321,31 @@ spec: {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }} - name: LMCACHE_LMCACHE_INSTANCE_ID value: {{ $modelSpec.lmcacheConfig.instanceId | quote }} + {{- else }} + - name: LMCACHE_LMCACHE_INSTANCE_ID + valueFrom: + fieldRef: + fieldPath: metadata.name {{- end }} {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }} - - name: LMCACHE_CONTROLLER_URL + - name: LMCACHE_CONTROLLER_PULL_URL value: "{{ .Release.Name }}-router-service:{{ $modelSpec.lmcacheConfig.controllerPort }}" {{- end }} - {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }} - - name: LMCACHE_LMCACHE_WORKER_PORT - value: {{ $modelSpec.lmcacheConfig.workerPort | quote }} + {{- if hasKey $modelSpec.lmcacheConfig "workerPorts" }} + - name: LMCACHE_LMCACHE_WORKER_PORTS + value: {{ $modelSpec.lmcacheConfig.workerPorts | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "p2pHost" }} + - name: LMCACHE_P2P_HOST + value: {{ $modelSpec.lmcacheConfig.p2pHost | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "p2pInitPorts" }} + - name: LMCACHE_P2P_INIT_PORTS + value: {{ $modelSpec.lmcacheConfig.p2pInitPorts | quote }} {{- end }} - {{- if hasKey $modelSpec.lmcacheConfig "distributedUrl" }} - - name: LMCACHE_DISTRIBUTED_URL - value: {{ 
$modelSpec.lmcacheConfig.distributedUrl | quote }} + {{- if hasKey $modelSpec.lmcacheConfig "workerHeartbeatTime" }} + - name: LMCACHE_LMCACHE_WORKER_HEARTBEAT_TIME + value: {{ $modelSpec.lmcacheConfig.workerHeartbeatTime | quote }} {{- end }} {{- end }} {{- if or .Values.servingEngineSpec.configs $modelSpec.envFromSecret }} diff --git a/tutorials/17-kv-aware-routing.md b/tutorials/17-kv-aware-routing.md index e71da792f..0c21a8a4c 100644 --- a/tutorials/17-kv-aware-routing.md +++ b/tutorials/17-kv-aware-routing.md @@ -54,7 +54,7 @@ First, send a request to the router: curl http://localhost:30080/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", + "model": "openai/gpt-oss-20b", "prompt": "What is the capital of France?", "max_tokens": 100 }' @@ -66,7 +66,7 @@ Then, send another request with the same prompt prefix: curl http://localhost:30080/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", + "model": "openai/gpt-oss-20b", "prompt": "What is the capital of France? And what is its population?", "max_tokens": 100 }' diff --git a/tutorials/assets/values-17-kv-aware.yaml b/tutorials/assets/values-17-kv-aware.yaml index 8b45e459b..4d410597f 100644 --- a/tutorials/assets/values-17-kv-aware.yaml +++ b/tutorials/assets/values-17-kv-aware.yaml @@ -1,111 +1,38 @@ servingEngineSpec: runtimeClassName: "" modelSpec: - - name: "llama1" + - name: "gpt-oss-20b" repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "70Gi" + tag: "v0.3.9post2" + modelURL: "openai/gpt-oss-20b" + replicaCount: 2 + requestCPU: 8 + requestMemory: "128Gi" requestGPU: 1 - pvcStorage: "50Gi" + pvcStorage: "256Gi" vllmConfig: enablePrefixCaching: true - maxModelLen: 32000 + maxModelLen: 8000 + gpuMemoryUtilization: "0.9" lmcacheConfig: enabled: true cpuOffloadingBufferSize: "60" enableController: true - instanceId: "default1" - controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" - - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - hf_token: - - name: "llama2" - repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 32000 - - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "60" - enableController: true - instanceId: "default2" - controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" - - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - hf_token: - - - name: "llama3" - repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "70Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 32000 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "60" - enableController: true - instanceId: "default3" controllerPort: 9000 - workerPort: 8003 - distributedUrl: "localhost:30083" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" + workerHeartbeatTime: "30" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" - hf_token: - - name: "llama4" - repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "70Gi" - requestGPU: 1 - pvcStorage: "50Gi" - 
vllmConfig: - enablePrefixCaching: true - maxModelLen: 32000 - - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "60" - enableController: true - instanceId: "default4" - controllerPort: "9000" - workerPort: 8004 - distributedUrl: "localhost:30084" - - env: - - name: LMCACHE_LOG_LEVEL + - name: VLLM_LOGGING_LEVEL value: "DEBUG" - hf_token: + - name: HF_HOME + value: "/data" routerSpec: repository: "lmcache/lmstack-router" @@ -119,5 +46,4 @@ routerSpec: memory: "2G" routingLogic: "kvaware" lmcacheControllerPort: 9000 - hf_token: sessionKey: "x-user-id" From cf4162bd08278569587c9017c9a485a8dff96b3d Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:18:21 +0000 Subject: [PATCH 06/11] bugfix: fix ci Signed-off-by: Rui Zhang --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e23a9ae32..ad2fd116a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,13 +42,13 @@ semantic_cache = [ "huggingface-hub==0.34.0", ] lmcache = [ - "lmcache==0.3.5", - "vllm==0.10.1.1", + "lmcache==0.3.9post2", + "vllm==0.11.0", ] test = [ "pytest>=8.3.4", "pytest-asyncio>=0.25.3", - "vllm==0.10.2" + "vllm==0.11.0" ] [build-system] From 61cfb766354aadcf2ef13dbcd4f40f3bb52388da Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:28:50 +0000 Subject: [PATCH 07/11] fix ci Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 4 +-- .github/values-07-prefix-routing.yaml | 4 +-- .github/values-08-roundrobin-routing.yaml | 4 +-- .github/values-09-kvaware-routing.yaml | 4 +-- src/vllm_router/service_discovery.py | 44 ++++++++++++++++++++--- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 708656ecf..f713ebf48 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index cba67c46d..ffd810bc2 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index b3d8063b2..6a751a7c4 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: 
"lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 195e0f1ab..120e155bc 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index 12f3a694a..e3b47be6e 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -226,6 +226,7 @@ def __init__( self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))] self.added_timestamp = int(time.time()) self.unhealthy_endpoint_hashes = [] + self.running = True if static_backend_health_checks: self.start_health_check_task() self.prefill_model_labels = prefill_model_labels @@ -250,12 +251,13 @@ def get_unhealthy_endpoint_hashes(self) -> list[str]: return unhealthy_endpoints async def check_model_health(self): - while True: + while self.running: try: self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes() - time.sleep(60) - except Exception as e: - logger.error(e) + await asyncio.sleep(60) + except asyncio.CancelledError: + logger.debug("Health check task cancelled") + break def start_health_check_task(self) -> None: self.loop = asyncio.new_event_loop() @@ -340,6 +342,40 @@ async def initialize_client_sessions(self) -> None: timeout=aiohttp.ClientTimeout(total=None), ) + def close(self): + """ + Close the service discovery module and clean up health check resources. 
+ """ + self._running = False + if hasattr(self, "loop") and self.loop.is_running(): + # Schedule a coroutine to gracefully shut down the event loop + async def shutdown(): + tasks = [ + t + for t in asyncio.all_tasks(self.loop) + if t is not asyncio.current_task() + ] + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + self.loop.stop() + + future = asyncio.run_coroutine_threadsafe(shutdown(), self.loop) + try: + future.result(timeout=15.0) + except asyncio.TimeoutError: + logger.warning( + "Timed out waiting for shutdown(loop might already be closed)" + ) + except Exception as e: + logger.warning(f"Error during health check shutdown: {e}") + + if hasattr(self, "thread") and self.thread.is_alive(): + self.thread.join(timeout=5.0) + + if hasattr(self, "loop") and not self.loop.is_closed(): + self.loop.close() + class K8sPodIPServiceDiscovery(ServiceDiscovery): def __init__( From 08a17346ea5668cb94cb6858fc91e85433207440 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:38:56 +0000 Subject: [PATCH 08/11] modify ci Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 35 ++--------------------- .github/values-07-prefix-routing.yaml | 35 ++--------------------- .github/values-08-roundrobin-routing.yaml | 35 ++--------------------- .github/values-09-kvaware-routing.yaml | 35 ++--------------------- .github/values-10-disagg-prefill.yaml | 17 +++-------- 5 files changed, 12 insertions(+), 145 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index f713ebf48..13ab01220 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index ffd810bc2..4dcd73bed 100644 --- 
a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index 6a751a7c4..93b8ce194 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - 
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 120e155bc..c471e9f86 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-10-disagg-prefill.yaml b/.github/values-10-disagg-prefill.yaml index 236b46d33..548d284f5 100644 --- a/.github/values-10-disagg-prefill.yaml +++ b/.github/values-10-disagg-prefill.yaml @@ -9,10 +9,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "2025-05-27-v1" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 6 + requestCPU: 8 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -34,10 +34,6 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 - enableController: true - controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -47,10 +43,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "2025-05-27-v1" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 6 + requestCPU: 8 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -70,10 +66,6 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true - enableController: true - controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -98,7 +90,6 @@ routerSpec: engineScrapeInterval: 15 requestStatsWindow: 60 enablePD: true - lmcacheControllerPort: 9000 resources: requests: 
cpu: "4" From 5b1bb6eb0c5fd36082585d34b6a08993558f14fd Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:42:41 +0000 Subject: [PATCH 09/11] modify ci Signed-off-by: Rui Zhang --- src/vllm_router/service_discovery.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index e3b47be6e..f3a990ea3 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -226,7 +226,7 @@ def __init__( self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))] self.added_timestamp = int(time.time()) self.unhealthy_endpoint_hashes = [] - self.running = True + self._running = True if static_backend_health_checks: self.start_health_check_task() self.prefill_model_labels = prefill_model_labels @@ -251,13 +251,15 @@ def get_unhealthy_endpoint_hashes(self) -> list[str]: return unhealthy_endpoints async def check_model_health(self): - while self.running: + while self._running: try: self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes() await asyncio.sleep(60) except asyncio.CancelledError: logger.debug("Health check task cancelled") break + except Exception as e: + logger.error(e) def start_health_check_task(self) -> None: self.loop = asyncio.new_event_loop() @@ -785,7 +787,7 @@ def close(self): """ Close the service discovery module. """ - self.running = False + self._running = False self.k8s_watcher.stop() self.watcher_thread.join() From 8fff9b473e72a9624593411fdccfc9758cdc7725 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:52:35 +0000 Subject: [PATCH 10/11] modify ci Signed-off-by: Rui Zhang --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index ad2fd116a..066096a13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,9 @@ write_to = "src/vllm_router/_version.py" [tool.isort] profile = "black" +[tool.pytest.ini_options] +asyncio_mode = "auto" + [dependency-groups] lint = [ "pre-commit>=4.1.0", From 49e2b72d7b4907edf0bbb9007a2d7dd307afece8 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 01:00:01 +0000 Subject: [PATCH 11/11] modify ci Signed-off-by: Rui Zhang --- src/vllm_router/service_discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index f3a990ea3..eca70bb9a 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -787,7 +787,7 @@ def close(self): """ Close the service discovery module. """ - self._running = False + self.running = False self.k8s_watcher.stop() self.watcher_thread.join()