diff --git a/docker/ray-llm/Dockerfile b/docker/ray-llm/Dockerfile index f58524c5609a..294150359ddd 100644 --- a/docker/ray-llm/Dockerfile +++ b/docker/ray-llm/Dockerfile @@ -7,7 +7,7 @@ COPY python/deplocks/llm/rayllm_*.lock ./ # vLLM version tag to use for EP kernel and DeepGEMM install scripts # Keep in sync with vllm version in python/requirements/llm/llm-requirements.txt -ARG VLLM_SCRIPTS_REF="v0.18.0" +ARG VLLM_SCRIPTS_REF="v0.19.0" RUN < Any: multi_modal_data = request.multimodal_data if request.prompt_token_ids is not None: - llm_prompt = vllm.inputs.data.TokensPrompt( + llm_prompt = vllm.inputs.TokensPrompt( prompt_token_ids=request.prompt_token_ids, multi_modal_data=multi_modal_data, mm_processor_kwargs=request.mm_processor_kwargs, @@ -543,7 +543,7 @@ async def _generate_async(self, request: vLLMEngineRequest) -> Any: ) else: assert request.prompt - llm_prompt = vllm.inputs.data.TextPrompt( + llm_prompt = vllm.inputs.TextPrompt( prompt=request.prompt, multi_modal_data=multi_modal_data, mm_processor_kwargs=request.mm_processor_kwargs, diff --git a/python/ray/llm/_internal/serve/core/ingress/mixins/pausable.py b/python/ray/llm/_internal/serve/core/ingress/mixins/pausable.py index 07858a2c8e11..23653b5599b9 100644 --- a/python/ray/llm/_internal/serve/core/ingress/mixins/pausable.py +++ b/python/ray/llm/_internal/serve/core/ingress/mixins/pausable.py @@ -26,7 +26,7 @@ class PauseRequest(BaseModel): model: str options: Dict[str, Any] = Field( default_factory=dict, - description="Engine-specific pause options (e.g., wait_for_inflight_requests, clear_cache)", + description="Engine-specific pause options (e.g., mode, clear_cache)", ) @@ -73,8 +73,7 @@ async def pause(self, body: PauseRequest) -> Response: Args: body: Request containing the model ID and engine-specific options. Options may include: - - wait_for_inflight_requests (bool): Wait for in-flight requests - to finish before pausing. Default False (abort immediately). + - mode (str): "abort" (default), "wait", or "keep". - clear_cache (bool): Clear KV cache after draining. Default True. Returns: diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 28b537334ac8..ab7eb0f30de8 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -3,7 +3,16 @@ import inspect import json import typing -from typing import TYPE_CHECKING, Any, AsyncGenerator, List, Optional, Tuple, Union +from typing import ( + TYPE_CHECKING, + Any, + AsyncGenerator, + List, + Literal, + Optional, + Tuple, + Union, +) from pydantic import BaseModel, field_validator from starlette.datastructures import State @@ -211,9 +220,11 @@ def validate_tags(cls, v: Any) -> Optional[List[str]]: class VLLMPauseConfig(BaseModel): """vLLM-specific configuration for pause operation.""" - wait_for_inflight_requests: bool = False - """When True, waits for in-flight requests to finish before pausing. - When False (default), aborts in-flight requests immediately. + mode: Literal["abort", "wait", "keep"] = "abort" + """Pause mode: + - "abort" (default): Abort all in-flight requests immediately. + - "wait": Wait for in-flight requests to complete before pausing. + - "keep": Freeze requests in queue; they resume on resume_generation(). """ clear_cache: bool = True @@ -789,14 +800,13 @@ async def pause(self, **kwargs: Any) -> None: Args: **kwargs: Options parsed into VLLMPauseConfig. - - wait_for_inflight_requests (bool): Wait for in-flight requests - to finish. Default False. + - mode (str): "abort" (default), "wait", or "keep". - clear_cache (bool): Clear KV cache after draining. Default True. """ assert self._engine_client is not None, "engine_client is not initialized" config = VLLMPauseConfig(**kwargs) await self._engine_client.pause_generation( - wait_for_inflight_requests=config.wait_for_inflight_requests, + mode=config.mode, clear_cache=config.clear_cache, ) diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 23cc074c5e0c..8d928a01bd82 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -115,7 +115,7 @@ async def pause(self, **kwargs: Any) -> None: This mimics vLLM's behavior: halts generation while keeping weights in GPU. Args: - **kwargs: Engine-specific options (wait_for_inflight_requests, clear_cache). + **kwargs: Engine-specific options (mode, clear_cache). """ if not self.started: raise RuntimeError("Engine not started") diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index d84e353c3524..e994e7a8d984 100644 --- a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -2,7 +2,7 @@ # constraining to a maximum version (i.e. <=) to temporarily work around a bug. # Those pins for the sake of workarounds should not be advertised as constraints # on future releases in setup.py. -vllm[audio]>=0.18.0 +vllm[audio]>=0.19.0 nixl>=1.0.0 anyio>=4.5.0 # For json mode diff --git a/python/setup.py b/python/setup.py index 73024b144f28..6751632a83b6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -366,7 +366,7 @@ def get_packages(self): setup_spec.extras["llm"] = list( set( [ - "vllm[audio]>=0.18.0", + "vllm[audio]>=0.19.0", "nixl>=1.0.0", "jsonref>=1.1.0", "jsonschema", diff --git a/release/llm_tests/batch/test_batch_vllm.py b/release/llm_tests/batch/test_batch_vllm.py index 01b67efe7795..a0bf0f2c63ce 100644 --- a/release/llm_tests/batch/test_batch_vllm.py +++ b/release/llm_tests/batch/test_batch_vllm.py @@ -48,9 +48,65 @@ def add_buffer_time_between_tests(): def cleanup_ray_resources(): """Automatically cleanup Ray resources between tests to prevent conflicts.""" yield + _cleanup_gpu_processes() ray.shutdown() +def _cleanup_gpu_processes(): + """ + Kill GPU processes on all nodes in the cluster. With Ray as the external orchestrator, + mp backend suffers from uncoordinated shutdown issues, leaving orphaned GPU processes. + + TODO (jeffreywang): Remove this once https://github.com/vllm-project/vllm/pull/39846 lands. + """ + if not ray.is_initialized(): + return + + @ray.remote(num_cpus=0) + def _remote_kill_gpu_processes(): + import os + import signal + + import pynvml + + pids = set() + try: + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + for i in range(device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle): + pids.add(proc.pid) + pynvml.nvmlShutdown() + except Exception: + pass + + for pid in pids: + try: + os.kill(pid, signal.SIGKILL) + except (ProcessLookupError, ValueError): + pass + + try: + nodes = ray.nodes() + refs = [] + for node in nodes: + if not node.get("Alive", False): + continue + node_id = node["NodeID"] + refs.append( + _remote_kill_gpu_processes.options( + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=node_id, soft=False + ), + ).remote() + ) + if refs: + ray.get(refs, timeout=30) + except Exception as e: + logging.warning(f"Failed to kill GPU processes on remote nodes: {e}") + + @pytest.mark.asyncio async def test_vllm_multimodal_utils(): """Test vLLM's multimodal utilities. diff --git a/release/llm_tests/serve/test_llm_serve_multi_node_integration.py b/release/llm_tests/serve/test_llm_serve_multi_node_integration.py index 84ac30bfed4d..c397e5dc6c0c 100644 --- a/release/llm_tests/serve/test_llm_serve_multi_node_integration.py +++ b/release/llm_tests/serve/test_llm_serve_multi_node_integration.py @@ -327,7 +327,10 @@ def test_llm_serve_prefill_decode_with_data_parallelism(): }, }, experimental_configs={ - "NIXL_SIDE_CHANNEL_PORT_BASE": 40000, # Prefill port range + # Use ports below the Linux ephemeral range (32768-60999) to + # prevent conflicts with the vLLM DP coordinator's random TCP + # port allocations. + "NIXL_SIDE_CHANNEL_PORT_BASE": 15000, # Prefill port range }, runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, ) @@ -343,7 +346,7 @@ def test_llm_serve_prefill_decode_with_data_parallelism(): }, }, experimental_configs={ - "NIXL_SIDE_CHANNEL_PORT_BASE": 41000, # Decode port range (different) + "NIXL_SIDE_CHANNEL_PORT_BASE": 16000, # Decode port range (different) }, runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, ) diff --git a/release/llm_tests/serve/test_llm_serve_pause_resume.py b/release/llm_tests/serve/test_llm_serve_pause_resume.py index e3f17514b31f..595996f1774d 100644 --- a/release/llm_tests/serve/test_llm_serve_pause_resume.py +++ b/release/llm_tests/serve/test_llm_serve_pause_resume.py @@ -128,7 +128,7 @@ def test_pause_resume_lifecycle(): f"{BASE_URL}/pause", json={ "model": MODEL_ID, - "options": {"wait_for_inflight_requests": False, "clear_cache": True}, + "options": {"mode": "abort", "clear_cache": True}, }, timeout=60, )