Commit 57c69ae

bugfix: fix kvaware routing to be compatible with lmcache 0.3.9
Signed-off-by: Rui Zhang <[email protected]>
Parent: 06b933b
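
This commit migrates the chart's lmcacheConfig schema to the names LMCache 0.3.9 expects: the removed workerPort and distributedUrl keys are replaced by workerPorts, p2pHost, and p2pInitPorts; instanceId becomes optional (the deployment template now defaults it to the pod name); and an optional workerHeartbeatTime key is introduced. A minimal before/after sketch, with key names taken from the diffs below and port values illustrative:

    # Before (chart values for LMCache <= 0.3.5)
    lmcacheConfig:
      enableController: true
      instanceId: "default1"
      controllerPort: 9000
      workerPort: 8001
      distributedUrl: "localhost:30081"

    # After (chart values for LMCache 0.3.9)
    lmcacheConfig:
      enableController: true
      controllerPort: 9000
      workerPorts: "8001"        # pluralized and string-valued
      p2pHost: "localhost"       # replaces the host half of distributedUrl
      p2pInitPorts: "30081"      # replaces the port half of distributedUrl
      workerHeartbeatTime: "30"  # optional; unit not stated in this commit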

7 files changed: +62 additions, −123 deletions

.github/values-06-session-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

.github/values-07-prefix-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

.github/values-08-roundrobin-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

.github/values-09-kvaware-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

helm/templates/deployment-vllm-multi.yaml

Lines changed: 20 additions & 7 deletions
@@ -321,18 +321,31 @@ spec:
           {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }}
           - name: LMCACHE_LMCACHE_INSTANCE_ID
             value: {{ $modelSpec.lmcacheConfig.instanceId | quote }}
+          {{- else }}
+          - name: LMCACHE_LMCACHE_INSTANCE_ID
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
           {{- end }}
           {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }}
-          - name: LMCACHE_CONTROLLER_URL
+          - name: LMCACHE_CONTROLLER_PULL_URL
            value: "{{ .Release.Name }}-router-service:{{ $modelSpec.lmcacheConfig.controllerPort }}"
           {{- end }}
-          {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }}
-          - name: LMCACHE_LMCACHE_WORKER_PORT
-            value: {{ $modelSpec.lmcacheConfig.workerPort | quote }}
+          {{- if hasKey $modelSpec.lmcacheConfig "workerPorts" }}
+          - name: LMCACHE_LMCACHE_WORKER_PORTS
+            value: {{ $modelSpec.lmcacheConfig.workerPorts | quote }}
+          {{- end }}
+          {{- if hasKey $modelSpec.lmcacheConfig "p2pHost" }}
+          - name: LMCACHE_P2P_HOST
+            value: {{ $modelSpec.lmcacheConfig.p2pHost | quote }}
+          {{- end }}
+          {{- if hasKey $modelSpec.lmcacheConfig "p2pInitPorts" }}
+          - name: LMCACHE_P2P_INIT_PORTS
+            value: {{ $modelSpec.lmcacheConfig.p2pInitPorts | quote }}
           {{- end }}
-          {{- if hasKey $modelSpec.lmcacheConfig "distributedUrl" }}
-          - name: LMCACHE_DISTRIBUTED_URL
-            value: {{ $modelSpec.lmcacheConfig.distributedUrl | quote }}
+          {{- if hasKey $modelSpec.lmcacheConfig "workerHeartbeatTime" }}
+          - name: LMCACHE_LMCACHE_WORKER_HEARTBEAT_TIME
+            value: {{ $modelSpec.lmcacheConfig.workerHeartbeatTime | quote }}
           {{- end }}
         {{- end }}
         {{- if or .Values.servingEngineSpec.configs $modelSpec.envFromSecret }}
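
Given values like those above, the template should now render container env entries along these lines (a sketch: the release name vllm-stack is illustrative, and the pod-name fallback applies only when instanceId is unset):

    env:
    - name: LMCACHE_LMCACHE_INSTANCE_ID
      valueFrom:
        fieldRef:
          fieldPath: metadata.name    # defaults the instance ID to the pod name
    - name: LMCACHE_CONTROLLER_PULL_URL    # renamed from LMCACHE_CONTROLLER_URL
      value: "vllm-stack-router-service:9000"
    - name: LMCACHE_LMCACHE_WORKER_PORTS   # renamed from LMCACHE_LMCACHE_WORKER_PORT
      value: "8001"
    - name: LMCACHE_P2P_HOST               # with the next entry, replaces LMCACHE_DISTRIBUTED_URL
      value: "localhost"
    - name: LMCACHE_P2P_INIT_PORTS
      value: "30081"
    - name: LMCACHE_LMCACHE_WORKER_HEARTBEAT_TIME   # rendered only when workerHeartbeatTime is set
      value: "30"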

tutorials/17-kv-aware-routing.md

Lines changed: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ First, send a request to the router:
 curl http://localhost:30080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "model": "openai/gpt-oss-20b",
     "prompt": "What is the capital of France?",
     "max_tokens": 100
   }'
@@ -66,7 +66,7 @@ Then, send another request with the same prompt prefix:
 curl http://localhost:30080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "model": "openai/gpt-oss-20b",
     "prompt": "What is the capital of France? And what is its population?",
     "max_tokens": 100
   }'
Lines changed: 16 additions & 90 deletions
@@ -1,111 +1,38 @@
 servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
-  - name: "llama1"
+  - name: "gpt-oss-20b"
     repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "70Gi"
+    tag: "v0.3.9post2"
+    modelURL: "openai/gpt-oss-20b"
+    replicaCount: 2
+    requestCPU: 8
+    requestMemory: "128Gi"
     requestGPU: 1
-    pvcStorage: "50Gi"
+    pvcStorage: "256Gi"
     vllmConfig:
       enablePrefixCaching: true
-      maxModelLen: 32000
+      maxModelLen: 8000
+      gpuMemoryUtilization: "0.9"

     lmcacheConfig:
       enabled: true
       cpuOffloadingBufferSize: "60"
       enableController: true
-      instanceId: "default1"
-      controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
-
-    env:
-    - name: LMCACHE_LOG_LEVEL
-      value: "DEBUG"
-    hf_token: <HF_TOKEN>
-  - name: "llama2"
-    repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "30Gi"
-    requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 32000
-
-    lmcacheConfig:
-      enabled: true
-      cpuOffloadingBufferSize: "60"
-      enableController: true
-      instanceId: "default2"
-      controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
-
-    env:
-    - name: LMCACHE_LOG_LEVEL
-      value: "DEBUG"
-    hf_token: <HF_TOKEN>
-
-  - name: "llama3"
-    repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "70Gi"
-    requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 32000

-    lmcacheConfig:
-      enabled: true
-      cpuOffloadingBufferSize: "60"
-      enableController: true
-      instanceId: "default3"
       controllerPort: 9000
-      workerPort: 8003
-      distributedUrl: "localhost:30083"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
+      workerHeartbeatTime: "30"

     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
-    hf_token: <HF_TOKEN>
-  - name: "llama4"
-    repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "70Gi"
-    requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 32000
-
-    lmcacheConfig:
-      enabled: true
-      cpuOffloadingBufferSize: "60"
-      enableController: true
-      instanceId: "default4"
-      controllerPort: "9000"
-      workerPort: 8004
-      distributedUrl: "localhost:30084"
-
-    env:
-    - name: LMCACHE_LOG_LEVEL
+    - name: VLLM_LOGGING_LEVEL
       value: "DEBUG"
-    hf_token: <HF_TOKEN>
+    - name: HF_HOME
+      value: "/data"

 routerSpec:
   repository: "lmcache/lmstack-router"
@@ -119,5 +46,4 @@ routerSpec:
   memory: "2G"
   routingLogic: "kvaware"
   lmcacheControllerPort: 9000
-  hf_token: <HF_TOKEN>
   sessionKey: "x-user-id"
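
Assembled from the hunks above, this seventh file (evidently the tutorial's values file; its path is not captured in this view) collapses four single-replica Llama instances into one two-replica gpt-oss-20b spec. The post-change file reads approximately:

    servingEngineSpec:
      runtimeClassName: ""
      modelSpec:
      - name: "gpt-oss-20b"
        repository: "lmcache/vllm-openai"
        tag: "v0.3.9post2"
        modelURL: "openai/gpt-oss-20b"
        replicaCount: 2
        requestCPU: 8
        requestMemory: "128Gi"
        requestGPU: 1
        pvcStorage: "256Gi"
        vllmConfig:
          enablePrefixCaching: true
          maxModelLen: 8000
          gpuMemoryUtilization: "0.9"

        lmcacheConfig:
          enabled: true
          cpuOffloadingBufferSize: "60"
          enableController: true
          controllerPort: 9000
          workerPorts: "8001"
          p2pHost: "localhost"
          p2pInitPorts: "30081"
          workerHeartbeatTime: "30"

        env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
        - name: VLLM_LOGGING_LEVEL
          value: "DEBUG"
        - name: HF_HOME
          value: "/data"

    routerSpec:
      repository: "lmcache/lmstack-router"
      # ... (lines 39-45 of the file are outside the diff and not shown)
      memory: "2G"
      routingLogic: "kvaware"
      lmcacheControllerPort: 9000
      sessionKey: "x-user-id"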
