
Commit d6b9a1d

modify CI
Signed-off-by: Rui Zhang <[email protected]>
1 parent 1726f57 commit d6b9a1d

6 files changed: +84, -143 lines changed


.github/values-06-session-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
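
For reference, a sketch of the first modelSpec entry as it reads after this change, reassembled from the diff above (YAML indentation is assumed; the "opt125m-2" entry and the other three routing values files follow the same shape, differing only in instanceId, ports, and GPU memory utilization):

    - name: "opt125m-1"
      repository: "lmcache/vllm-openai"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 6
      requestMemory: "30Gi"
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        gpuMemoryUtilization: 0.8
      lmcacheConfig:
        enabled: true
        cpuOffloadingBufferSize: "60"   # CPU offloading replaces the NIXL prefill/decode setup
        enableController: true
        instanceId: "default1"
        controllerPort: 9000
        workerPort: 8001
        distributedUrl: "localhost:30081"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"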

.github/values-07-prefix-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}

.github/values-08-roundrobin-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}

.github/values-09-kvaware-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}

helm/templates/deployment-vllm-multi.yaml

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ spec:
           {{- if or
             (eq $modelSpec.tag "2025-05-27-v1")
            (eq $modelSpec.tag "2025-05-17-v1")
+            (eq $modelSpec.tag "v0.3.5")
             (eq $modelSpec.tag "latest-nightly")
             (hasPrefix "nightly-" $modelSpec.tag)
             (and
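
The values files above now pin the image tag to "v0.3.5", so the template's tag check is extended to recognize it; otherwise the block gated by this condition would not apply to the CI deployments. A minimal, hypothetical sketch of the technique (the settings actually gated by this condition are not shown in the diff):

{{- /* hypothetical illustration: listed image tags opt a modelSpec into the gated settings */}}
{{- if or
      (eq $modelSpec.tag "v0.3.5")
      (eq $modelSpec.tag "latest-nightly")
      (hasPrefix "nightly-" $modelSpec.tag) }}
  # settings applied only for recognized image tags (placeholder)
{{- else }}
  # default settings (placeholder)
{{- end }}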
