Skip to content

Commit 25f9d09

Browse files
zerofishnoodles authored and TheCodeWrangler committed
[Feat] Use the lmcache 0.3.5 for kvaware routing (vllm-project#673)
Signed-off-by: Rui Zhang <[email protected]>
Signed-off-by: Nathan Price <[email protected]>
1 parent 817688b commit 25f9d09

File tree

4 files changed

+20
-6
lines changed

4 files changed

+20
-6
lines changed

helm/templates/deployment-vllm-multi.yaml

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -334,9 +334,13 @@ spec:
334334
value: "{{ .Release.Name }}-router-service:{{ $modelSpec.lmcacheConfig.controllerPort }}"
335335
{{- end }}
336336
{{- if hasKey $modelSpec.lmcacheConfig "workerPort" }}
337-
- name: LMCACHE_WORKER_PORT
337+
- name: LMCACHE_LMCACHE_WORKER_PORT
338338
value: {{ $modelSpec.lmcacheConfig.workerPort | quote }}
339339
{{- end }}
340+
{{- if hasKey $modelSpec.lmcacheConfig "distributedUrl" }}
341+
- name: LMCACHE_DISTRIBUTED_URL
342+
value: {{ $modelSpec.lmcacheConfig.distributedUrl | quote }}
343+
{{- end }}
340344
{{- end }}
341345
{{- if or .Values.servingEngineSpec.configs $modelSpec.envFromSecret }}
342346
envFrom:

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,10 +39,10 @@ default = [] # leave this empty because pip requires at least one specifier
3939
semantic_cache = [
4040
"sentence-transformers==2.2.2",
4141
"faiss-cpu==1.10.0",
42-
"huggingface-hub==0.25.2", # downgrade to 0.25.2 to avoid breaking changes
42+
"huggingface-hub==0.34.0",
4343
]
4444
lmcache = [
45-
"lmcache==0.2.1",
45+
"lmcache==0.3.5",
4646
]
4747

4848
[build-system]

src/vllm_router/routers/routing_logic.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
import math
1919
import random
2020
import threading
21+
import uuid
2122
from typing import Dict, List
2223

2324
from fastapi import Request
@@ -289,7 +290,9 @@ async def route_request(
289290
url = endpoints[0].url + "/tokenize"
290291
# TODO (Yuhan): Handle chat completions
291292
token_ids = self.tokenizer.encode(request_json["prompt"])
292-
msg = LookupMsg(tokens=token_ids)
293+
event_id = "Lookup" + str(uuid.uuid4())
294+
logger.debug(f"Lookup event id: {event_id}")
295+
msg = LookupMsg(tokens=token_ids, event_id=event_id)
293296
instance_id = await self.query_manager(msg)
294297
matched_tokens = math.inf
295298
if len(list(instance_id.layout_info.keys())) > 0:
@@ -321,10 +324,13 @@ async def route_request(
321324
queried_instance_ids = [info for info in instance_id.layout_info]
322325
if queried_instance_ids[0] not in self.instance_id_to_ip:
323326
for endpoint in endpoints:
327+
event_id = "QueryInst" + str(uuid.uuid4())
328+
logger.debug(f"QueryInst event id: {event_id}")
324329
query_message = QueryInstMsg(
325330
ip=endpoint.url.split(f":{endpoint.url.split(':')[-1]}")[
326331
0
327-
].split("//")[1]
332+
].split("//")[1],
333+
event_id=event_id,
328334
)
329335
endpoint_instance_id = await self.query_manager(query_message)
330336

tutorials/assets/values-17-kv-aware.yaml

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ servingEngineSpec:
2121
instanceId: "default1"
2222
controllerPort: "9000"
2323
workerPort: 8001
24+
distributedUrl: "localhost:8201"
2425

2526
env:
2627
- name: LMCACHE_LOG_LEVEL
@@ -46,6 +47,7 @@ servingEngineSpec:
4647
instanceId: "default2"
4748
controllerPort: "9000"
4849
workerPort: 8002
50+
distributedUrl: "localhost:8202"
4951

5052
env:
5153
- name: LMCACHE_LOG_LEVEL
@@ -72,6 +74,7 @@ servingEngineSpec:
7274
instanceId: "default3"
7375
controllerPort: "9000"
7476
workerPort: 8003
77+
distributedUrl: "localhost:8203"
7578

7679
env:
7780
- name: LMCACHE_LOG_LEVEL
@@ -97,6 +100,7 @@ servingEngineSpec:
97100
instanceId: "default4"
98101
controllerPort: "9000"
99102
workerPort: 8004
103+
distributedUrl: "localhost:8204"
100104

101105
env:
102106
- name: LMCACHE_LOG_LEVEL
@@ -105,7 +109,7 @@ servingEngineSpec:
105109

106110
routerSpec:
107111
repository: "lmcache/lmstack-router"
108-
tag: "kvaware-latest"
112+
tag: "latest"
109113
resources:
110114
requests:
111115
cpu: "1"

0 commit comments

Comments
 (0)