Commit 57c69ae

bugfix: fix kvaware routing to be compatible with lmcache 0.3.9
Signed-off-by: Rui Zhang <[email protected]>
Parent: 06b933b
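
This commit migrates the chart's lmcacheConfig schema to the names LMCache 0.3.9 expects: the removed workerPort and distributedUrl keys are replaced by workerPorts, p2pHost, and p2pInitPorts; instanceId becomes optional (the deployment template now defaults it to the pod name); and an optional workerHeartbeatTime key is introduced. A minimal before/after sketch, with key names taken from the diffs below and port values illustrative:

    # Before (chart values for LMCache <= 0.3.5)
    lmcacheConfig:
      enableController: true
      instanceId: "default1"
      controllerPort: 9000
      workerPort: 8001
      distributedUrl: "localhost:30081"

    # After (chart values for LMCache 0.3.9)
    lmcacheConfig:
      enableController: true
      controllerPort: 9000
      workerPorts: "8001"        # pluralized and string-valued
      p2pHost: "localhost"       # replaces the host half of distributedUrl
      p2pInitPorts: "30081"      # replaces the port half of distributedUrl
      workerHeartbeatTime: "30"  # optional; unit not stated in this commit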

7 files changed: +62 additions, −123 deletions

.github/values-06-session-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

.github/values-07-prefix-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

.github/values-08-roundrobin-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

.github/values-09-kvaware-routing.yaml

Lines changed: 6 additions & 6 deletions
@@ -21,10 +21,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default1"
       controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
@@ -51,10 +51,10 @@ servingEngineSpec:
       enabled: true
       cpuOffloadingBufferSize: "10"
       enableController: true
-      instanceId: "default2"
       controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
+      workerPorts: "8002"
+      p2pHost: "localhost"
+      p2pInitPorts: "30082"
     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"

helm/templates/deployment-vllm-multi.yaml

Lines changed: 20 additions & 7 deletions
@@ -321,18 +321,31 @@ spec:
           {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }}
           - name: LMCACHE_LMCACHE_INSTANCE_ID
             value: {{ $modelSpec.lmcacheConfig.instanceId | quote }}
+          {{- else }}
+          - name: LMCACHE_LMCACHE_INSTANCE_ID
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
           {{- end }}
           {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }}
-          - name: LMCACHE_CONTROLLER_URL
+          - name: LMCACHE_CONTROLLER_PULL_URL
            value: "{{ .Release.Name }}-router-service:{{ $modelSpec.lmcacheConfig.controllerPort }}"
           {{- end }}
-          {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }}
-          - name: LMCACHE_LMCACHE_WORKER_PORT
-            value: {{ $modelSpec.lmcacheConfig.workerPort | quote }}
+          {{- if hasKey $modelSpec.lmcacheConfig "workerPorts" }}
+          - name: LMCACHE_LMCACHE_WORKER_PORTS
+            value: {{ $modelSpec.lmcacheConfig.workerPorts | quote }}
+          {{- end }}
+          {{- if hasKey $modelSpec.lmcacheConfig "p2pHost" }}
+          - name: LMCACHE_P2P_HOST
+            value: {{ $modelSpec.lmcacheConfig.p2pHost | quote }}
+          {{- end }}
+          {{- if hasKey $modelSpec.lmcacheConfig "p2pInitPorts" }}
+          - name: LMCACHE_P2P_INIT_PORTS
+            value: {{ $modelSpec.lmcacheConfig.p2pInitPorts | quote }}
           {{- end }}
-          {{- if hasKey $modelSpec.lmcacheConfig "distributedUrl" }}
-          - name: LMCACHE_DISTRIBUTED_URL
-            value: {{ $modelSpec.lmcacheConfig.distributedUrl | quote }}
+          {{- if hasKey $modelSpec.lmcacheConfig "workerHeartbeatTime" }}
+          - name: LMCACHE_LMCACHE_WORKER_HEARTBEAT_TIME
+            value: {{ $modelSpec.lmcacheConfig.workerHeartbeatTime | quote }}
           {{- end }}
         {{- end }}
         {{- if or .Values.servingEngineSpec.configs $modelSpec.envFromSecret }}
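
Given values like those above, the template should now render container env entries along these lines (a sketch: the release name vllm-stack is illustrative, and the pod-name fallback applies only when instanceId is unset):

    env:
    - name: LMCACHE_LMCACHE_INSTANCE_ID
      valueFrom:
        fieldRef:
          fieldPath: metadata.name    # defaults the instance ID to the pod name
    - name: LMCACHE_CONTROLLER_PULL_URL    # renamed from LMCACHE_CONTROLLER_URL
      value: "vllm-stack-router-service:9000"
    - name: LMCACHE_LMCACHE_WORKER_PORTS   # renamed from LMCACHE_LMCACHE_WORKER_PORT
      value: "8001"
    - name: LMCACHE_P2P_HOST               # with the next entry, replaces LMCACHE_DISTRIBUTED_URL
      value: "localhost"
    - name: LMCACHE_P2P_INIT_PORTS
      value: "30081"
    - name: LMCACHE_LMCACHE_WORKER_HEARTBEAT_TIME   # rendered only when workerHeartbeatTime is set
      value: "30"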

tutorials/17-kv-aware-routing.md

Lines changed: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ First, send a request to the router:
 curl http://localhost:30080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "model": "openai/gpt-oss-20b",
     "prompt": "What is the capital of France?",
     "max_tokens": 100
   }'
@@ -66,7 +66,7 @@ Then, send another request with the same prompt prefix:
 curl http://localhost:30080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "model": "openai/gpt-oss-20b",
     "prompt": "What is the capital of France? And what is its population?",
     "max_tokens": 100
   }'
Lines changed: 16 additions & 90 deletions
@@ -1,111 +1,38 @@
 servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
-  - name: "llama1"
+  - name: "gpt-oss-20b"
     repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "70Gi"
+    tag: "v0.3.9post2"
+    modelURL: "openai/gpt-oss-20b"
+    replicaCount: 2
+    requestCPU: 8
+    requestMemory: "128Gi"
     requestGPU: 1
-    pvcStorage: "50Gi"
+    pvcStorage: "256Gi"
     vllmConfig:
       enablePrefixCaching: true
-      maxModelLen: 32000
+      maxModelLen: 8000
+      gpuMemoryUtilization: "0.9"

     lmcacheConfig:
       enabled: true
       cpuOffloadingBufferSize: "60"
       enableController: true
-      instanceId: "default1"
-      controllerPort: 9000
-      workerPort: 8001
-      distributedUrl: "localhost:30081"
-
-    env:
-    - name: LMCACHE_LOG_LEVEL
-      value: "DEBUG"
-    hf_token: <HF_TOKEN>
-  - name: "llama2"
-    repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "30Gi"
-    requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 32000
-
-    lmcacheConfig:
-      enabled: true
-      cpuOffloadingBufferSize: "60"
-      enableController: true
-      instanceId: "default2"
-      controllerPort: 9000
-      workerPort: 8002
-      distributedUrl: "localhost:30082"
-
-    env:
-    - name: LMCACHE_LOG_LEVEL
-      value: "DEBUG"
-    hf_token: <HF_TOKEN>
-
-  - name: "llama3"
-    repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "70Gi"
-    requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 32000

-    lmcacheConfig:
-      enabled: true
-      cpuOffloadingBufferSize: "60"
-      enableController: true
-      instanceId: "default3"
       controllerPort: 9000
-      workerPort: 8003
-      distributedUrl: "localhost:30083"
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
+      workerHeartbeatTime: "30"

     env:
     - name: LMCACHE_LOG_LEVEL
       value: "DEBUG"
-    hf_token: <HF_TOKEN>
-  - name: "llama4"
-    repository: "lmcache/vllm-openai"
-    tag: "v0.3.5"
-    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
-    replicaCount: 1
-    requestCPU: 6
-    requestMemory: "70Gi"
-    requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 32000
-
-    lmcacheConfig:
-      enabled: true
-      cpuOffloadingBufferSize: "60"
-      enableController: true
-      instanceId: "default4"
-      controllerPort: "9000"
-      workerPort: 8004
-      distributedUrl: "localhost:30084"
-
-    env:
-    - name: LMCACHE_LOG_LEVEL
+    - name: VLLM_LOGGING_LEVEL
       value: "DEBUG"
-    hf_token: <HF_TOKEN>
+    - name: HF_HOME
+      value: "/data"

 routerSpec:
   repository: "lmcache/lmstack-router"
@@ -119,5 +46,4 @@ routerSpec:
   memory: "2G"
   routingLogic: "kvaware"
   lmcacheControllerPort: 9000
-  hf_token: <HF_TOKEN>
   sessionKey: "x-user-id"
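
Assembled from the hunks above, this seventh file (evidently the tutorial's values file; its path is not captured in this view) collapses four single-replica Llama instances into one two-replica gpt-oss-20b spec. The post-change file reads approximately:

    servingEngineSpec:
      runtimeClassName: ""
      modelSpec:
      - name: "gpt-oss-20b"
        repository: "lmcache/vllm-openai"
        tag: "v0.3.9post2"
        modelURL: "openai/gpt-oss-20b"
        replicaCount: 2
        requestCPU: 8
        requestMemory: "128Gi"
        requestGPU: 1
        pvcStorage: "256Gi"
        vllmConfig:
          enablePrefixCaching: true
          maxModelLen: 8000
          gpuMemoryUtilization: "0.9"

        lmcacheConfig:
          enabled: true
          cpuOffloadingBufferSize: "60"
          enableController: true
          controllerPort: 9000
          workerPorts: "8001"
          p2pHost: "localhost"
          p2pInitPorts: "30081"
          workerHeartbeatTime: "30"

        env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
        - name: VLLM_LOGGING_LEVEL
          value: "DEBUG"
        - name: HF_HOME
          value: "/data"

    routerSpec:
      repository: "lmcache/lmstack-router"
      # ... (lines 39-45 of the file are outside the diff and not shown)
      memory: "2G"
      routingLogic: "kvaware"
      lmcacheControllerPort: 9000
      sessionKey: "x-user-id"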
