From 094a0bf0be030143748d2cf5cf879f05d05e3fb5 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Mon, 15 Sep 2025 18:01:53 +0000 Subject: [PATCH 01/11] [bugfix] fix kvaware routing Signed-off-by: Rui Zhang --- .github/workflows/router-e2e-test.yml | 2 +- docker/Dockerfile | 9 +++-- helm/templates/deployment-router.yaml | 2 + helm/templates/deployment-vllm-multi.yaml | 2 + pyproject.toml | 1 + src/vllm_router/app.py | 5 +++ src/vllm_router/routers/routing_logic.py | 45 +++++++++++++++++++---- 7 files changed, 54 insertions(+), 12 deletions(-) diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml index 6430547d8..eec857cc1 100644 --- a/.github/workflows/router-e2e-test.yml +++ b/.github/workflows/router-e2e-test.yml @@ -124,7 +124,7 @@ jobs: echo "🔨 Building router docker image" cd ${{ github.workspace }} eval "$(minikube docker-env)" - docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware . + docker build -t git-act-router -f docker/Dockerfile . - name: Run all k8s discovery routing tests run: | diff --git a/docker/Dockerfile b/docker/Dockerfile index 0246105d2..9fe7646b4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,9 +19,12 @@ ARG INSTALL_OPTIONAL_DEP=semantic_cache,lmcache ENV INSTALL_OPTIONAL_DEP=${INSTALL_OPTIONAL_DEP} # Install dependencies (use cache, and delete after install, to speed up the build) -RUN pip install --upgrade --no-cache-dir pip setuptools_scm && \ - pip install --no-cache-dir .[$INSTALL_OPTIONAL_DEP] +RUN pip install --no-cache-dir uv && \ + uv venv /opt/venv && \ + . /opt/venv/bin/activate && \ + uv pip install --upgrade --no-cache-dir pip setuptools_scm && \ + uv pip install --no-cache-dir .[$INSTALL_OPTIONAL_DEP] # Set the entrypoint -ENTRYPOINT ["vllm-router"] +ENTRYPOINT ["/opt/venv/bin/vllm-router"] CMD [] diff --git a/helm/templates/deployment-router.yaml b/helm/templates/deployment-router.yaml index d81da3f6f..cf735ffd9 100644 --- a/helm/templates/deployment-router.yaml +++ b/helm/templates/deployment-router.yaml @@ -50,6 +50,8 @@ spec: - name: HF_TOKEN value: "{{ .Values.routerSpec.hf_token }}" {{- end }} + - name: PYTHONHASHSEED + value: "123" - name: LMCACHE_LOG_LEVEL value: "DEBUG" {{- if .Values.servingEngineSpec.enableEngine -}} diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 207be3297..453eaa550 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -190,6 +190,8 @@ spec: {{- end }} imagePullPolicy: "{{ .Values.servingEngineSpec.imagePullPolicy | default "Always" }}" env: + - name: PYTHONHASHSEED + value: "123" - name: HF_HOME {{- if hasKey $modelSpec "pvcStorage" }} value: /data diff --git a/pyproject.toml b/pyproject.toml index 6643c744d..e23a9ae32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ semantic_cache = [ ] lmcache = [ "lmcache==0.3.5", + "vllm==0.10.1.1", ] test = [ "pytest>=8.3.4", diff --git a/src/vllm_router/app.py b/src/vllm_router/app.py index 9b99f3b01..112b9a2c0 100644 --- a/src/vllm_router/app.py +++ b/src/vllm_router/app.py @@ -33,6 +33,7 @@ from vllm_router.routers.main_router import main_router from vllm_router.routers.metrics_router import metrics_router from vllm_router.routers.routing_logic import ( + cleanup_routing_logic, get_routing_logic, initialize_routing_logic, ) @@ -111,6 +112,10 @@ async def lifespan(app: FastAPI): logger.info("Closing dynamic config watcher") dyn_cfg_watcher.close() + # Close 
routing logic instances + logger.info("Closing routing logic instances") + cleanup_routing_logic() + def initialize_all(app: FastAPI, args): """ diff --git a/src/vllm_router/routers/routing_logic.py b/src/vllm_router/routers/routing_logic.py index 86b05518d..63f8f0638 100644 --- a/src/vllm_router/routers/routing_logic.py +++ b/src/vllm_router/routers/routing_logic.py @@ -14,6 +14,7 @@ import abc import asyncio +import concurrent.futures import enum import math import random @@ -265,7 +266,9 @@ def start_kv_manager(self): self.loop = asyncio.new_event_loop() self.thread = threading.Thread(target=self.loop.run_forever, daemon=True) self.thread.start() - asyncio.run_coroutine_threadsafe(self.kv_manager.start_all(), self.loop) + self.lmcache_cluster_monitor_task = asyncio.run_coroutine_threadsafe( + self.kv_manager.start_all(), self.loop + ) def query_manager(self, msg) -> str: """ @@ -274,6 +277,20 @@ def query_manager(self, msg) -> str: instance_id = self.kv_manager.handle_orchestration_message(msg) return instance_id + def close(self): + """Gracefully shutdown the lmcache cluster monitor task.""" + if ( + hasattr(self, "lmcache_cluster_monitor_task") + and self.lmcache_cluster_monitor_task + ): + logger.info("Shutting down lmcache cluster monitor task") + self.lmcache_cluster_monitor_task.cancel() + try: + self.lmcache_cluster_monitor_task.result() + except concurrent.futures.CancelledError: + pass + self.lmcache_cluster_monitor_task = None + async def route_request( self, endpoints: List[EndpointInfo], @@ -323,8 +340,10 @@ async def route_request( event_id = "Lookup" + str(uuid.uuid4()) logger.debug(f"Lookup event id: {event_id}") msg = LookupMsg(tokens=token_ids, event_id=event_id) + logger.debug(f"Lookup message: {msg}") instance_id = await self.query_manager(msg) matched_tokens = math.inf + logger.debug(f"Instance id: {instance_id}") if len(list(instance_id.layout_info.keys())) > 0: matched_instance_id = list(instance_id.layout_info.keys())[ 0 @@ -359,8 +378,9 @@ async def route_request( ].split("//")[1], event_id=event_id, ) + logger.debug(f"QueryInst message: {query_message}") endpoint_instance_id = await self.query_manager(query_message) - + logger.debug(f"Endpoint instance id: {endpoint_instance_id}") self.instance_id_to_ip[endpoint_instance_id.instance_id] = ( endpoint.url ) @@ -528,19 +548,26 @@ def reconfigure_routing_logic( routing_logic: RoutingLogic, *args, **kwargs ) -> RoutingInterface: # Remove the existing routers from the singleton registry + cleanup_routing_logic() + return initialize_routing_logic(routing_logic, *args, **kwargs) + + +def get_routing_logic() -> RoutingInterface: + # Look up in our singleton registry which router (if any) has been created. for cls in ( SessionRouter, RoundRobinRouter, KvawareRouter, + PrefixAwareRouter, DisaggregatedPrefillRouter, ): if cls in SingletonABCMeta._instances: - del SingletonABCMeta._instances[cls] - return initialize_routing_logic(routing_logic, *args, **kwargs) + return cls() + raise ValueError("The global router has not been initialized") -def get_routing_logic() -> RoutingInterface: - # Look up in our singleton registry which router (if any) has been created. 
+def cleanup_routing_logic(): + """Clean up all routing logic instances.""" for cls in ( SessionRouter, RoundRobinRouter, @@ -549,5 +576,7 @@ def get_routing_logic() -> RoutingInterface: DisaggregatedPrefillRouter, ): if cls in SingletonABCMeta._instances: - return cls() - raise ValueError("The global router has not been initialized") + instance = cls() + if hasattr(instance, "close"): + instance.close() + del SingletonABCMeta._instances[cls] From 7408cb17ef48dbf6ac5642012efe9e7fcbb32ab0 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 16 Sep 2025 01:01:17 +0000 Subject: [PATCH 02/11] Modify CI to be compatible Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 17 +++++++++++++---- .github/values-07-prefix-routing.yaml | 17 +++++++++++++---- .github/values-08-roundrobin-routing.yaml | 17 +++++++++++++---- .github/values-09-kvaware-routing.yaml | 17 +++++++++++++---- .github/values-10-disagg-prefill.yaml | 17 +++++++++++++---- 5 files changed, 65 insertions(+), 20 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 43974f94e..16747fcb4 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -81,6 +89,7 @@ routerSpec: type: Recreate enableRouter: true routingLogic: "session" + lmcacheControllerPort: 9000 sessionKey: "x-user-id" extraArgs: - "--log-level" diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index 4b8bf76af..dd1b2aff1 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 
requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -84,3 +92,4 @@ routerSpec: extraArgs: - "--log-level" - "info" + lmcacheControllerPort: 9000 diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index e9362eee6..23be0c106 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -84,3 +92,4 @@ routerSpec: extraArgs: - "--log-level" - "info" + lmcacheControllerPort: 9000 diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index ac58c26f6..09659422b 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -6,10 +6,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -31,6 +31,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -40,10 +44,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -63,6 +67,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -84,3 +92,4 @@ routerSpec: extraArgs: - "--log-level" - "info" + lmcacheControllerPort: 9000 diff --git a/.github/values-10-disagg-prefill.yaml b/.github/values-10-disagg-prefill.yaml index 548d284f5..236b46d33 100644 --- a/.github/values-10-disagg-prefill.yaml +++ b/.github/values-10-disagg-prefill.yaml @@ -9,10 +9,10 @@ servingEngineSpec: # 
Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -34,6 +34,10 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 + enableController: true + controllerPort: 9000 + workerPort: 8001 + distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -43,10 +47,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "2025-05-27-v1" + tag: "latest" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 8 + requestCPU: 6 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -66,6 +70,10 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true + enableController: true + controllerPort: 9000 + workerPort: 8002 + distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -90,6 +98,7 @@ routerSpec: engineScrapeInterval: 15 requestStatsWindow: 60 enablePD: true + lmcacheControllerPort: 9000 resources: requests: cpu: "4" From 8c338623cc33393098bb7a714045dd2a05f40bfc Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 16 Sep 2025 01:03:36 +0000 Subject: [PATCH 03/11] Revert "fix dynamic config" This reverts commit 9ac02f992e58fa102a891d3dfe903142bf913f0b. Signed-off-by: Rui Zhang --- src/vllm_router/service_discovery.py | 42 ++-------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index eca70bb9a..12f3a694a 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -226,7 +226,6 @@ def __init__( self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))] self.added_timestamp = int(time.time()) self.unhealthy_endpoint_hashes = [] - self._running = True if static_backend_health_checks: self.start_health_check_task() self.prefill_model_labels = prefill_model_labels @@ -251,13 +250,10 @@ def get_unhealthy_endpoint_hashes(self) -> list[str]: return unhealthy_endpoints async def check_model_health(self): - while self._running: + while True: try: self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes() - await asyncio.sleep(60) - except asyncio.CancelledError: - logger.debug("Health check task cancelled") - break + time.sleep(60) except Exception as e: logger.error(e) @@ -344,40 +340,6 @@ async def initialize_client_sessions(self) -> None: timeout=aiohttp.ClientTimeout(total=None), ) - def close(self): - """ - Close the service discovery module and clean up health check resources. 
- """ - self._running = False - if hasattr(self, "loop") and self.loop.is_running(): - # Schedule a coroutine to gracefully shut down the event loop - async def shutdown(): - tasks = [ - t - for t in asyncio.all_tasks(self.loop) - if t is not asyncio.current_task() - ] - for task in tasks: - task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - self.loop.stop() - - future = asyncio.run_coroutine_threadsafe(shutdown(), self.loop) - try: - future.result(timeout=15.0) - except asyncio.TimeoutError: - logger.warning( - "Timed out waiting for shutdown(loop might already be closed)" - ) - except Exception as e: - logger.warning(f"Error during health check shutdown: {e}") - - if hasattr(self, "thread") and self.thread.is_alive(): - self.thread.join(timeout=5.0) - - if hasattr(self, "loop") and not self.loop.is_closed(): - self.loop.close() - class K8sPodIPServiceDiscovery(ServiceDiscovery): def __init__( From 06b933be3fde90e9ac1df2cd40a6920815ba0962 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Tue, 16 Sep 2025 18:53:25 +0000 Subject: [PATCH 04/11] modify CI Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 58 ++++++++++------------- .github/values-07-prefix-routing.yaml | 58 ++++++++++------------- .github/values-08-roundrobin-routing.yaml | 58 ++++++++++------------- .github/values-09-kvaware-routing.yaml | 58 ++++++++++------------- tutorials/assets/values-17-kv-aware.yaml | 22 ++++----- 5 files changed, 111 insertions(+), 143 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 16747fcb4..3d9156b91 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - 
kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -89,6 +74,13 @@ routerSpec: type: Recreate enableRouter: true routingLogic: "session" + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" lmcacheControllerPort: 9000 sessionKey: "x-user-id" extraArgs: diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index dd1b2aff1..38f009421 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" 
chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -87,6 +72,13 @@ routerSpec: imagePullPolicy: "IfNotPresent" strategy: type: Recreate + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" enableRouter: true routingLogic: "prefixaware" extraArgs: diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index 23be0c106..3ad0ca0b5 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -92,4 +77,11 @@ routerSpec: extraArgs: - "--log-level" - "info" + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" lmcacheControllerPort: 9000 diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 
09659422b..9642ae1ac 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -4,75 +4,60 @@ servingEngineSpec: runtimeClassName: "" modelSpec: # Prefill node configuration - - name: "opt125m-prefill" + - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 + gpuMemoryUtilization: 0.8 lmcacheConfig: - cudaVisibleDevices: "0" enabled: true - kvRole: "kv_producer" - enableNixl: true - nixlRole: "sender" - nixlPeerHost: "vllm-opt125m-decode-engine-service" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true - cpuOffloadingBufferSize: 0 + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default1" controllerPort: 9000 workerPort: 8001 distributedUrl: "localhost:30081" - labels: - model: "opt125m-prefill" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} # Decode node configuration - - name: "opt125m-decode" + - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "v0.3.5" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 requestMemory: "30Gi" - # requestGPU: 1 + requestGPU: 1 pvcStorage: "50Gi" vllmConfig: enablePrefixCaching: true maxModelLen: 1024 v1: 1 + gpuMemoryUtilization: 0.6 lmcacheConfig: - cudaVisibleDevices: "1" enabled: true - kvRole: "kv_consumer" # Set decode node as consumer - enableNixl: true - nixlRole: "receiver" - nixlPeerHost: "0.0.0.0" - nixlPeerPort: "55555" - nixlBufferSize: "1073741824" # 1GB - nixlBufferDevice: "cuda" - nixlEnableGc: true - enablePD: true + cpuOffloadingBufferSize: "10" enableController: true + instanceId: "default2" controllerPort: 9000 workerPort: 8002 distributedUrl: "localhost:30082" - labels: - model: "opt125m-decode" + env: + - name: LMCACHE_LOG_LEVEL + value: "DEBUG" chatTemplate: "chat.jinja2" chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} @@ -87,6 +72,13 @@ routerSpec: imagePullPolicy: "IfNotPresent" strategy: type: Recreate + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" enableRouter: true routingLogic: "kvaware" extraArgs: diff --git a/tutorials/assets/values-17-kv-aware.yaml b/tutorials/assets/values-17-kv-aware.yaml index 7c17e8cd5..8b45e459b 100644 --- a/tutorials/assets/values-17-kv-aware.yaml +++ b/tutorials/assets/values-17-kv-aware.yaml @@ -3,7 +3,7 @@ servingEngineSpec: modelSpec: - name: "llama1" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -19,9 +19,9 @@ servingEngineSpec: cpuOffloadingBufferSize: "60" enableController: true instanceId: "default1" - controllerPort: "9000" + controllerPort: 9000 workerPort: 8001 - 
distributedUrl: "localhost:8201" + distributedUrl: "localhost:30081" env: - name: LMCACHE_LOG_LEVEL @@ -29,7 +29,7 @@ servingEngineSpec: hf_token: - name: "llama2" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -45,9 +45,9 @@ servingEngineSpec: cpuOffloadingBufferSize: "60" enableController: true instanceId: "default2" - controllerPort: "9000" + controllerPort: 9000 workerPort: 8002 - distributedUrl: "localhost:8202" + distributedUrl: "localhost:30082" env: - name: LMCACHE_LOG_LEVEL @@ -56,7 +56,7 @@ servingEngineSpec: - name: "llama3" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -72,9 +72,9 @@ servingEngineSpec: cpuOffloadingBufferSize: "60" enableController: true instanceId: "default3" - controllerPort: "9000" + controllerPort: 9000 workerPort: 8003 - distributedUrl: "localhost:8203" + distributedUrl: "localhost:30083" env: - name: LMCACHE_LOG_LEVEL @@ -82,7 +82,7 @@ servingEngineSpec: hf_token: - name: "llama4" repository: "lmcache/vllm-openai" - tag: "2025-05-17-v1" + tag: "v0.3.5" modelURL: "meta-llama/Llama-3.1-8B-Instruct" replicaCount: 1 requestCPU: 6 @@ -100,7 +100,7 @@ servingEngineSpec: instanceId: "default4" controllerPort: "9000" workerPort: 8004 - distributedUrl: "localhost:8204" + distributedUrl: "localhost:30084" env: - name: LMCACHE_LOG_LEVEL From 57c69ae2e3cdc39a10933ea089ce438ff29657d1 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:06:40 +0000 Subject: [PATCH 05/11] bugfix: fix bug for kvaware routing to be compatiable with lmcache 0.3.9 Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 12 +-- .github/values-07-prefix-routing.yaml | 12 +-- .github/values-08-roundrobin-routing.yaml | 12 +-- .github/values-09-kvaware-routing.yaml | 12 +-- helm/templates/deployment-vllm-multi.yaml | 27 ++++-- tutorials/17-kv-aware-routing.md | 4 +- tutorials/assets/values-17-kv-aware.yaml | 106 ++++------------------ 7 files changed, 62 insertions(+), 123 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 3d9156b91..708656ecf 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ -51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index 38f009421..cba67c46d 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ 
-51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index 3ad0ca0b5..b3d8063b2 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ -51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 9642ae1ac..195e0f1ab 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -21,10 +21,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default1" controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" @@ -51,10 +51,10 @@ servingEngineSpec: enabled: true cpuOffloadingBufferSize: "10" enableController: true - instanceId: "default2" controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" + workerPorts: "8002" + p2pHost: "localhost" + p2pInitPorts: "30082" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 453eaa550..971fa10dc 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -321,18 +321,31 @@ spec: {{- if hasKey $modelSpec.lmcacheConfig "instanceId" }} - name: LMCACHE_LMCACHE_INSTANCE_ID value: {{ $modelSpec.lmcacheConfig.instanceId | quote }} + {{- else }} + - name: LMCACHE_LMCACHE_INSTANCE_ID + valueFrom: + fieldRef: + fieldPath: metadata.name {{- end }} {{- if hasKey $modelSpec.lmcacheConfig "controllerPort" }} - - name: LMCACHE_CONTROLLER_URL + - name: LMCACHE_CONTROLLER_PULL_URL value: "{{ .Release.Name }}-router-service:{{ $modelSpec.lmcacheConfig.controllerPort }}" {{- end }} - {{- if hasKey $modelSpec.lmcacheConfig "workerPort" }} - - name: LMCACHE_LMCACHE_WORKER_PORT - value: {{ $modelSpec.lmcacheConfig.workerPort | quote }} + {{- if hasKey $modelSpec.lmcacheConfig "workerPorts" }} + - name: LMCACHE_LMCACHE_WORKER_PORTS + value: {{ $modelSpec.lmcacheConfig.workerPorts | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "p2pHost" }} + - name: LMCACHE_P2P_HOST + value: {{ $modelSpec.lmcacheConfig.p2pHost | quote }} + {{- end }} + {{- if hasKey $modelSpec.lmcacheConfig "p2pInitPorts" }} + - name: LMCACHE_P2P_INIT_PORTS + value: {{ $modelSpec.lmcacheConfig.p2pInitPorts | quote }} {{- end }} - {{- if hasKey $modelSpec.lmcacheConfig "distributedUrl" }} - - name: LMCACHE_DISTRIBUTED_URL - value: {{ 
$modelSpec.lmcacheConfig.distributedUrl | quote }} + {{- if hasKey $modelSpec.lmcacheConfig "workerHeartbeatTime" }} + - name: LMCACHE_LMCACHE_WORKER_HEARTBEAT_TIME + value: {{ $modelSpec.lmcacheConfig.workerHeartbeatTime | quote }} {{- end }} {{- end }} {{- if or .Values.servingEngineSpec.configs $modelSpec.envFromSecret }} diff --git a/tutorials/17-kv-aware-routing.md b/tutorials/17-kv-aware-routing.md index e71da792f..0c21a8a4c 100644 --- a/tutorials/17-kv-aware-routing.md +++ b/tutorials/17-kv-aware-routing.md @@ -54,7 +54,7 @@ First, send a request to the router: curl http://localhost:30080/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", + "model": "openai/gpt-oss-20b", "prompt": "What is the capital of France?", "max_tokens": 100 }' @@ -66,7 +66,7 @@ Then, send another request with the same prompt prefix: curl http://localhost:30080/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-3.1-8B-Instruct", + "model": "openai/gpt-oss-20b", "prompt": "What is the capital of France? And what is its population?", "max_tokens": 100 }' diff --git a/tutorials/assets/values-17-kv-aware.yaml b/tutorials/assets/values-17-kv-aware.yaml index 8b45e459b..4d410597f 100644 --- a/tutorials/assets/values-17-kv-aware.yaml +++ b/tutorials/assets/values-17-kv-aware.yaml @@ -1,111 +1,38 @@ servingEngineSpec: runtimeClassName: "" modelSpec: - - name: "llama1" + - name: "gpt-oss-20b" repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "70Gi" + tag: "v0.3.9post2" + modelURL: "openai/gpt-oss-20b" + replicaCount: 2 + requestCPU: 8 + requestMemory: "128Gi" requestGPU: 1 - pvcStorage: "50Gi" + pvcStorage: "256Gi" vllmConfig: enablePrefixCaching: true - maxModelLen: 32000 + maxModelLen: 8000 + gpuMemoryUtilization: "0.9" lmcacheConfig: enabled: true cpuOffloadingBufferSize: "60" enableController: true - instanceId: "default1" - controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" - - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - hf_token: - - name: "llama2" - repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 32000 - - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "60" - enableController: true - instanceId: "default2" - controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" - - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - hf_token: - - - name: "llama3" - repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "70Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 32000 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "60" - enableController: true - instanceId: "default3" controllerPort: 9000 - workerPort: 8003 - distributedUrl: "localhost:30083" + workerPorts: "8001" + p2pHost: "localhost" + p2pInitPorts: "30081" + workerHeartbeatTime: "30" env: - name: LMCACHE_LOG_LEVEL value: "DEBUG" - hf_token: - - name: "llama4" - repository: "lmcache/vllm-openai" - tag: "v0.3.5" - modelURL: "meta-llama/Llama-3.1-8B-Instruct" - replicaCount: 1 - requestCPU: 6 - requestMemory: "70Gi" - requestGPU: 1 - pvcStorage: "50Gi" - 
vllmConfig: - enablePrefixCaching: true - maxModelLen: 32000 - - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "60" - enableController: true - instanceId: "default4" - controllerPort: "9000" - workerPort: 8004 - distributedUrl: "localhost:30084" - - env: - - name: LMCACHE_LOG_LEVEL + - name: VLLM_LOGGING_LEVEL value: "DEBUG" - hf_token: + - name: HF_HOME + value: "/data" routerSpec: repository: "lmcache/lmstack-router" @@ -119,5 +46,4 @@ routerSpec: memory: "2G" routingLogic: "kvaware" lmcacheControllerPort: 9000 - hf_token: sessionKey: "x-user-id" From cf4162bd08278569587c9017c9a485a8dff96b3d Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:18:21 +0000 Subject: [PATCH 06/11] bugfix: fix ci Signed-off-by: Rui Zhang --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e23a9ae32..ad2fd116a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,13 +42,13 @@ semantic_cache = [ "huggingface-hub==0.34.0", ] lmcache = [ - "lmcache==0.3.5", - "vllm==0.10.1.1", + "lmcache==0.3.9post2", + "vllm==0.11.0", ] test = [ "pytest>=8.3.4", "pytest-asyncio>=0.25.3", - "vllm==0.10.2" + "vllm==0.11.0" ] [build-system] From 61cfb766354aadcf2ef13dbcd4f40f3bb52388da Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:28:50 +0000 Subject: [PATCH 07/11] fix ci Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 4 +-- .github/values-07-prefix-routing.yaml | 4 +-- .github/values-08-roundrobin-routing.yaml | 4 +-- .github/values-09-kvaware-routing.yaml | 4 +-- src/vllm_router/service_discovery.py | 44 ++++++++++++++++++++--- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index 708656ecf..f713ebf48 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index cba67c46d..ffd810bc2 100644 --- a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index b3d8063b2..6a751a7c4 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: 
"lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 195e0f1ab..120e155bc 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -6,7 +6,7 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-1" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 @@ -35,7 +35,7 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-2" repository: "lmcache/vllm-openai" - tag: "v0.3.5" + tag: "v0.3.9post2" modelURL: "facebook/opt-125m" replicaCount: 1 requestCPU: 6 diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index 12f3a694a..e3b47be6e 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -226,6 +226,7 @@ def __init__( self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))] self.added_timestamp = int(time.time()) self.unhealthy_endpoint_hashes = [] + self.running = True if static_backend_health_checks: self.start_health_check_task() self.prefill_model_labels = prefill_model_labels @@ -250,12 +251,13 @@ def get_unhealthy_endpoint_hashes(self) -> list[str]: return unhealthy_endpoints async def check_model_health(self): - while True: + while self.running: try: self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes() - time.sleep(60) - except Exception as e: - logger.error(e) + await asyncio.sleep(60) + except asyncio.CancelledError: + logger.debug("Health check task cancelled") + break def start_health_check_task(self) -> None: self.loop = asyncio.new_event_loop() @@ -340,6 +342,40 @@ async def initialize_client_sessions(self) -> None: timeout=aiohttp.ClientTimeout(total=None), ) + def close(self): + """ + Close the service discovery module and clean up health check resources. 
+ """ + self._running = False + if hasattr(self, "loop") and self.loop.is_running(): + # Schedule a coroutine to gracefully shut down the event loop + async def shutdown(): + tasks = [ + t + for t in asyncio.all_tasks(self.loop) + if t is not asyncio.current_task() + ] + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + self.loop.stop() + + future = asyncio.run_coroutine_threadsafe(shutdown(), self.loop) + try: + future.result(timeout=15.0) + except asyncio.TimeoutError: + logger.warning( + "Timed out waiting for shutdown(loop might already be closed)" + ) + except Exception as e: + logger.warning(f"Error during health check shutdown: {e}") + + if hasattr(self, "thread") and self.thread.is_alive(): + self.thread.join(timeout=5.0) + + if hasattr(self, "loop") and not self.loop.is_closed(): + self.loop.close() + class K8sPodIPServiceDiscovery(ServiceDiscovery): def __init__( From 08a17346ea5668cb94cb6858fc91e85433207440 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:38:56 +0000 Subject: [PATCH 08/11] modify ci Signed-off-by: Rui Zhang --- .github/values-06-session-routing.yaml | 35 ++--------------------- .github/values-07-prefix-routing.yaml | 35 ++--------------------- .github/values-08-roundrobin-routing.yaml | 35 ++--------------------- .github/values-09-kvaware-routing.yaml | 35 ++--------------------- .github/values-10-disagg-prefill.yaml | 17 +++-------- 5 files changed, 12 insertions(+), 145 deletions(-) diff --git a/.github/values-06-session-routing.yaml b/.github/values-06-session-routing.yaml index f713ebf48..13ab01220 100644 --- a/.github/values-06-session-routing.yaml +++ b/.github/values-06-session-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-07-prefix-routing.yaml b/.github/values-07-prefix-routing.yaml index ffd810bc2..4dcd73bed 100644 --- 
a/.github/values-07-prefix-routing.yaml +++ b/.github/values-07-prefix-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-08-roundrobin-routing.yaml b/.github/values-08-roundrobin-routing.yaml index 6a751a7c4..93b8ce194 100644 --- a/.github/values-08-roundrobin-routing.yaml +++ b/.github/values-08-roundrobin-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - 
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-09-kvaware-routing.yaml b/.github/values-09-kvaware-routing.yaml index 120e155bc..c471e9f86 100644 --- a/.github/values-09-kvaware-routing.yaml +++ b/.github/values-09-kvaware-routing.yaml @@ -3,12 +3,11 @@ servingEngineSpec: type: Recreate runtimeClassName: "" modelSpec: - # Prefill node configuration - - name: "opt125m-1" + - name: "opt125m" repository: "lmcache/vllm-openai" tag: "v0.3.9post2" modelURL: "facebook/opt-125m" - replicaCount: 1 + replicaCount: 2 requestCPU: 6 requestMemory: "30Gi" requestGPU: 1 @@ -32,36 +31,6 @@ servingEngineSpec: chatTemplateConfigMap: |- {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} - # Decode node configuration - - name: "opt125m-2" - repository: "lmcache/vllm-openai" - tag: "v0.3.9post2" - modelURL: "facebook/opt-125m" - replicaCount: 1 - requestCPU: 6 - requestMemory: "30Gi" - requestGPU: 1 - pvcStorage: "50Gi" - vllmConfig: - enablePrefixCaching: true - maxModelLen: 1024 - v1: 1 - gpuMemoryUtilization: 0.6 - lmcacheConfig: - enabled: true - cpuOffloadingBufferSize: "10" - enableController: true - controllerPort: 9000 - workerPorts: "8002" - p2pHost: "localhost" - p2pInitPorts: "30082" - env: - - name: LMCACHE_LOG_LEVEL - value: "DEBUG" - chatTemplate: "chat.jinja2" - chatTemplateConfigMap: |- - {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} - {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} containerSecurityContext: capabilities: add: diff --git a/.github/values-10-disagg-prefill.yaml b/.github/values-10-disagg-prefill.yaml index 236b46d33..548d284f5 100644 --- a/.github/values-10-disagg-prefill.yaml +++ b/.github/values-10-disagg-prefill.yaml @@ -9,10 +9,10 @@ servingEngineSpec: # Prefill node configuration - name: "opt125m-prefill" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "2025-05-27-v1" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 6 + requestCPU: 8 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -34,10 +34,6 @@ servingEngineSpec: nixlEnableGc: true enablePD: true cpuOffloadingBufferSize: 0 - enableController: true - controllerPort: 9000 - workerPort: 8001 - distributedUrl: "localhost:30081" labels: model: "opt125m-prefill" chatTemplate: "chat.jinja2" @@ -47,10 +43,10 @@ servingEngineSpec: # Decode node configuration - name: "opt125m-decode" repository: "lmcache/vllm-openai" - tag: "latest" + tag: "2025-05-27-v1" modelURL: "facebook/opt-125m" replicaCount: 1 - requestCPU: 6 + requestCPU: 8 requestMemory: "30Gi" # requestGPU: 1 pvcStorage: "50Gi" @@ -70,10 +66,6 @@ servingEngineSpec: nixlBufferDevice: "cuda" nixlEnableGc: true enablePD: true - enableController: true - controllerPort: 9000 - workerPort: 8002 - distributedUrl: "localhost:30082" labels: model: "opt125m-decode" chatTemplate: "chat.jinja2" @@ -98,7 +90,6 @@ routerSpec: engineScrapeInterval: 15 requestStatsWindow: 60 enablePD: true - lmcacheControllerPort: 9000 resources: requests: 
cpu: "4" From 5b1bb6eb0c5fd36082585d34b6a08993558f14fd Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:42:41 +0000 Subject: [PATCH 09/11] modify ci Signed-off-by: Rui Zhang --- src/vllm_router/service_discovery.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index e3b47be6e..f3a990ea3 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -226,7 +226,7 @@ def __init__( self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))] self.added_timestamp = int(time.time()) self.unhealthy_endpoint_hashes = [] - self.running = True + self._running = True if static_backend_health_checks: self.start_health_check_task() self.prefill_model_labels = prefill_model_labels @@ -251,13 +251,15 @@ def get_unhealthy_endpoint_hashes(self) -> list[str]: return unhealthy_endpoints async def check_model_health(self): - while self.running: + while self._running: try: self.unhealthy_endpoint_hashes = self.get_unhealthy_endpoint_hashes() await asyncio.sleep(60) except asyncio.CancelledError: logger.debug("Health check task cancelled") break + except Exception as e: + logger.error(e) def start_health_check_task(self) -> None: self.loop = asyncio.new_event_loop() @@ -785,7 +787,7 @@ def close(self): """ Close the service discovery module. """ - self.running = False + self._running = False self.k8s_watcher.stop() self.watcher_thread.join() From 8fff9b473e72a9624593411fdccfc9758cdc7725 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 00:52:35 +0000 Subject: [PATCH 10/11] modify ci Signed-off-by: Rui Zhang --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index ad2fd116a..066096a13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,9 @@ write_to = "src/vllm_router/_version.py" [tool.isort] profile = "black" +[tool.pytest.ini_options] +asyncio_mode = "auto" + [dependency-groups] lint = [ "pre-commit>=4.1.0", From 49e2b72d7b4907edf0bbb9007a2d7dd307afece8 Mon Sep 17 00:00:00 2001 From: Rui Zhang Date: Thu, 13 Nov 2025 01:00:01 +0000 Subject: [PATCH 11/11] modify ci Signed-off-by: Rui Zhang --- src/vllm_router/service_discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index f3a990ea3..eca70bb9a 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -787,7 +787,7 @@ def close(self): """ Close the service discovery module. """ - self._running = False + self.running = False self.k8s_watcher.stop() self.watcher_thread.join()