diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
index 2a63e2f5c..f162822d3 100644
--- a/helm/templates/deployment-vllm-multi.yaml
+++ b/helm/templates/deployment-vllm-multi.yaml
@@ -334,9 +334,13 @@ spec:
             value: "{{ .Release.Name }}-router-service:{{ $modelSpec.lmcacheConfig.controllerPort }}"
           {{- end }}
           {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }}
-          - name: LMCACHE_WORKER_PORT
+          - name: LMCACHE_LMCACHE_WORKER_PORT
            value: {{ $modelSpec.lmcacheConfig.workerPort | quote }}
           {{- end }}
+          {{- if hasKey $modelSpec.lmcacheConfig "distributedUrl" }}
+          - name: LMCACHE_DISTRIBUTED_URL
+            value: {{ $modelSpec.lmcacheConfig.distributedUrl | quote }}
+          {{- end }}
         {{- end }}
         {{- if or .Values.servingEngineSpec.configs $modelSpec.envFromSecret }}
         envFrom:
diff --git a/pyproject.toml b/pyproject.toml
index dfc923313..ed9de06b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,10 +39,10 @@ default = [] # leave this empty because pip requires at least one specifier
 semantic_cache = [
     "sentence-transformers==2.2.2",
     "faiss-cpu==1.10.0",
-    "huggingface-hub==0.25.2", # downgrade to 0.25.2 to avoid breaking changes
+    "huggingface-hub==0.34.0",
 ]
 lmcache = [
-    "lmcache==0.2.1",
+    "lmcache==0.3.5",
 ]

 [build-system]
diff --git a/src/vllm_router/routers/routing_logic.py b/src/vllm_router/routers/routing_logic.py
index 093872b3f..f5ae7cced 100644
--- a/src/vllm_router/routers/routing_logic.py
+++ b/src/vllm_router/routers/routing_logic.py
@@ -18,6 +18,7 @@
 import math
 import random
 import threading
+import uuid
 from typing import Dict, List

 from fastapi import Request
@@ -289,7 +290,9 @@ async def route_request(
             url = endpoints[0].url + "/tokenize"
             # TODO (Yuhan): Handle chat completions
             token_ids = self.tokenizer.encode(request_json["prompt"])
-            msg = LookupMsg(tokens=token_ids)
+            event_id = "Lookup" + str(uuid.uuid4())
+            logger.debug(f"Lookup event id: {event_id}")
+            msg = LookupMsg(tokens=token_ids, event_id=event_id)
             instance_id = await self.query_manager(msg)
             matched_tokens = math.inf
             if len(list(instance_id.layout_info.keys())) > 0:
@@ -321,10 +324,13 @@ async def route_request(
             queried_instance_ids = [info for info in instance_id.layout_info]
             if queried_instance_ids[0] not in self.instance_id_to_ip:
                 for endpoint in endpoints:
+                    event_id = "QueryInst" + str(uuid.uuid4())
+                    logger.debug(f"QueryInst event id: {event_id}")
                     query_message = QueryInstMsg(
                         ip=endpoint.url.split(f":{endpoint.url.split(':')[-1]}")[
                             0
-                        ].split("//")[1]
+                        ].split("//")[1],
+                        event_id=event_id,
                     )
                     endpoint_instance_id = await self.query_manager(query_message)

diff --git a/tutorials/assets/values-17-kv-aware.yaml b/tutorials/assets/values-17-kv-aware.yaml
index c68cd8aed..7c17e8cd5 100644
--- a/tutorials/assets/values-17-kv-aware.yaml
+++ b/tutorials/assets/values-17-kv-aware.yaml
@@ -21,6 +21,7 @@ servingEngineSpec:
       instanceId: "default1"
       controllerPort: "9000"
       workerPort: 8001
+      distributedUrl: "localhost:8201"

     env:
     - name: LMCACHE_LOG_LEVEL
@@ -46,6 +47,7 @@ servingEngineSpec:
       instanceId: "default2"
       controllerPort: "9000"
       workerPort: 8002
+      distributedUrl: "localhost:8202"

     env:
     - name: LMCACHE_LOG_LEVEL
@@ -72,6 +74,7 @@ servingEngineSpec:
       instanceId: "default3"
       controllerPort: "9000"
       workerPort: 8003
+      distributedUrl: "localhost:8203"

     env:
     - name: LMCACHE_LOG_LEVEL
@@ -97,6 +100,7 @@ servingEngineSpec:
       instanceId: "default4"
       controllerPort: "9000"
       workerPort: 8004
+      distributedUrl: "localhost:8204"

     env:
     - name: LMCACHE_LOG_LEVEL
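Each `distributedUrl` added above is surfaced to the engine container by the new template block as `LMCACHE_DISTRIBUTED_URL`. LMCache reads its configuration from `LMCACHE_`-prefixed environment variables, so the env-var name is the prefix plus the upper-cased config key; the worker-port key presumably became `lmcache_worker_port` in lmcache 0.3.x, which would explain the doubled prefix in `LMCACHE_LMCACHE_WORKER_PORT`. A minimal sketch of that naming convention (illustrative only, not LMCache's actual loader; the key names are inferred from this diff):

```python
# Sketch of the LMCACHE_* env-var convention, assuming config keys are
# derived by stripping the prefix and lower-casing. Not LMCache's loader.
PREFIX = "LMCACHE_"

def config_from_env(environ: dict) -> dict:
    """Map LMCACHE_<KEY>=value pairs onto lower-cased config keys."""
    return {
        key[len(PREFIX):].lower(): value
        for key, value in environ.items()
        if key.startswith(PREFIX)
    }

# Environment as the template would render it for the "default1" model spec:
env = {
    "LMCACHE_LMCACHE_WORKER_PORT": "8001",        # -> lmcache_worker_port
    "LMCACHE_DISTRIBUTED_URL": "localhost:8201",  # -> distributed_url
}
print(config_from_env(env))
# {'lmcache_worker_port': '8001', 'distributed_url': 'localhost:8201'}
```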
@@ -105,7 +109,7 @@ servingEngineSpec:

 routerSpec:
   repository: "lmcache/lmstack-router"
-  tag: "kvaware-latest"
+  tag: "latest"
   resources:
     requests:
       cpu: "1"
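For context on the `routing_logic.py` changes: every controller message now carries a unique `event_id` (a fixed prefix plus a UUID), presumably so a single lookup can be correlated between the router's debug logs and the LMCache controller, and `QueryInstMsg` receives the bare host parsed out of the endpoint URL. A standalone sketch of both patterns (the helper names are hypothetical, not the router's API; only `uuid` is required):

```python
# Standalone sketch of the two patterns added in routing_logic.py:
# prefix + uuid4 event IDs, and host extraction from an endpoint URL.
import uuid

def make_event_id(kind: str) -> str:
    """Unique per message, e.g. 'Lookup6f1c...' or 'QueryInst41b0...'."""
    return kind + str(uuid.uuid4())

def host_of(url: str) -> str:
    """Same expression the router uses: drop ':<port>', then '<scheme>//'."""
    port = url.split(":")[-1]
    return url.split(f":{port}")[0].split("//")[1]

assert host_of("http://10.0.0.7:8000") == "10.0.0.7"
print(make_event_id("Lookup"))  # e.g. Lookupd4f0c2a1-...
```

Note that the host extraction assumes every endpoint URL carries an explicit port: for a URL like `http://10.0.0.7` the final `[1]` index raises `IndexError`, so the router's endpoint URLs must always include `:<port>`.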