diff --git a/ci/compile_llm_requirements.sh b/ci/compile_llm_requirements.sh index fd486f833900..6030be18f7c6 100755 --- a/ci/compile_llm_requirements.sh +++ b/ci/compile_llm_requirements.sh @@ -6,9 +6,9 @@ CONFIG_PATH="${1:-ci/raydepsets/configs/rayllm.depsets.yaml}" mkdir -p /tmp/ray-deps -# Remove the GPU constraints +# Remove the GPU constraints, numpy, scipy, and pandas pin (vLLM 0.15.0+ requires numpy>=2, compatible scipy, and pandas>=2.0) cp python/requirements_compiled.txt /tmp/ray-deps/requirements_compiled.txt -sed -e '/^--extra-index-url /d' -e '/^--find-links /d' /tmp/ray-deps/requirements_compiled.txt > /tmp/ray-deps/requirements_compiled.txt.tmp +sed -e '/^--extra-index-url /d' -e '/^--find-links /d' -e '/^numpy==/d' -e '/^scipy==/d' -e '/^pandas==/d' /tmp/ray-deps/requirements_compiled.txt > /tmp/ray-deps/requirements_compiled.txt.tmp mv /tmp/ray-deps/requirements_compiled.txt.tmp /tmp/ray-deps/requirements_compiled.txt bazel run //ci/raydepsets:raydepsets -- build "${CONFIG_PATH}" diff --git a/ci/raydepsets/configs/llm_release_tests.depsets.yaml b/ci/raydepsets/configs/llm_release_tests.depsets.yaml index 1c125554f679..4c0b2f7c7712 100644 --- a/ci/raydepsets/configs/llm_release_tests.depsets.yaml +++ b/ci/raydepsets/configs/llm_release_tests.depsets.yaml @@ -8,8 +8,10 @@ build_arg_sets: append_flags: - --python-version=3.11 - --unsafe-package ray - - --python-platform=linux + # Use manylinux_2_31 for vllm 0.15.0 wheel compatibility + - --python-platform=x86_64-manylinux_2_31 - --extra-index-url https://download.pytorch.org/whl/${CUDA_CODE} + - --override python/requirements/llm/llm-override.txt build_arg_sets: - cu128 diff --git a/ci/raydepsets/configs/rayllm.depsets.yaml b/ci/raydepsets/configs/rayllm.depsets.yaml index 5a6bde5c15c1..05ae0f2c29c8 100644 --- a/ci/raydepsets/configs/rayllm.depsets.yaml +++ b/ci/raydepsets/configs/rayllm.depsets.yaml @@ -11,8 +11,10 @@ build_arg_sets: append_flags: - --python-version=3.11 - --unsafe-package ray - - --python-platform=linux + # Use manylinux_2_31 for vllm 0.15.0 wheel compatibility + - --python-platform=x86_64-manylinux_2_31 - --extra-index-url https://download.pytorch.org/whl/${CUDA_CODE} + - --override python/requirements/llm/llm-override.txt build_arg_sets: - cpu - cu128 diff --git a/ci/raydepsets/pre_hooks/remove-compiled-headers.sh b/ci/raydepsets/pre_hooks/remove-compiled-headers.sh index 69e9faf26839..bc4e89c10614 100755 --- a/ci/raydepsets/pre_hooks/remove-compiled-headers.sh +++ b/ci/raydepsets/pre_hooks/remove-compiled-headers.sh @@ -12,7 +12,7 @@ fi mkdir -p /tmp/ray-deps -# Remove the GPU constraints +# Remove the GPU constraints, numpy, scipy, and pandas pin (vLLM 0.15.0+ requires numpy>=2, compatible scipy, and pandas>=2.0) cp "python/${FILENAME}" "/tmp/ray-deps/${FILENAME}" -sed -e '/^--extra-index-url /d' -e '/^--find-links /d' "/tmp/ray-deps/${FILENAME}" > "/tmp/ray-deps/${FILENAME}.tmp" +sed -e '/^--extra-index-url /d' -e '/^--find-links /d' -e '/^numpy==/d' -e '/^scipy==/d' -e '/^pandas==/d' "/tmp/ray-deps/${FILENAME}" > "/tmp/ray-deps/${FILENAME}.tmp" mv "/tmp/ray-deps/${FILENAME}.tmp" "/tmp/ray-deps/${FILENAME}" diff --git a/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py b/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py index 6c0cc00cdf57..ca79f5ac9fe4 100644 --- a/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py +++ b/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py @@ -42,7 +42,7 @@ def _testing_build_dp_openai_app(builder_config, **kwargs): # Configure the model with data parallel settings config = LLMConfig( model_loading_config={ - "model_id": "Qwen/Qwen2.5-0.5B-Instruct" + "model_id": "microsoft/Phi-tiny-MoE-instruct" }, engine_kwargs={ "data_parallel_size": 2, # Number of DP replicas diff --git a/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py b/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py index 6ad4fc4f0c1c..e803e8fe0354 100644 --- a/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py +++ b/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py @@ -57,7 +57,7 @@ def _testing_build_dp_deployment(llm_config, **kwargs): # Configure prefill with data parallel attention prefill_config = LLMConfig( model_loading_config={ - "model_id": "Qwen/Qwen2.5-0.5B-Instruct" + "model_id": "microsoft/Phi-tiny-MoE-instruct" }, engine_kwargs={ "data_parallel_size": 2, # 2 DP replicas for prefill @@ -78,7 +78,7 @@ def _testing_build_dp_deployment(llm_config, **kwargs): # Configure decode with data parallel attention decode_config = LLMConfig( model_loading_config={ - "model_id": "Qwen/Qwen2.5-0.5B-Instruct" + "model_id": "microsoft/Phi-tiny-MoE-instruct" }, engine_kwargs={ "data_parallel_size": 2, # 2 DP replicas for decode (adjusted for 4 GPU limit) diff --git a/docker/ray-llm/Dockerfile b/docker/ray-llm/Dockerfile index cf8d7ed0e20f..9e3aac7b7840 100644 --- a/docker/ray-llm/Dockerfile +++ b/docker/ray-llm/Dockerfile @@ -7,7 +7,7 @@ COPY python/deplocks/llm/rayllm_*.lock ./ # vLLM version tag to use for EP kernel and DeepGEMM install scripts # Keep in sync with vllm version in python/requirements/llm/llm-requirements.txt -ARG VLLM_SCRIPTS_REF="v0.12.0" +ARG VLLM_SCRIPTS_REF="v0.15.0" RUN < AsyncIterator[Dict[str, Any] along with processing metadata. """ try: - from vllm.entrypoints.chat_utils import parse_chat_messages_futures + from vllm.entrypoints.chat_utils import parse_chat_messages_async except ImportError as e: raise ImportError( "vLLM is not installed or failed to import. Please run " "`pip install ray[llm]` to install required dependencies." ) from e - async def _get_mm_data(row: Dict[str, Any], conversation, fut, uuid): - multimodal_data = await fut - return row, conversation, uuid, multimodal_data - - tasks = [] - for row in batch: + async def _process_row(row: Dict[str, Any]): # Extract system messages to keep them as strings (not converted to list format) # This avoids issues with chat templates that expect string system messages. system_messages = [] @@ -155,7 +150,7 @@ async def _get_mm_data(row: Dict[str, Any], conversation, fut, uuid): # Users can provide stable IDs for each multimodal item from messages to # enable engine to cache and reuse work across requests. - conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( + conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages_to_parse, self.model_config, content_format=self.chat_template_content_format, @@ -164,11 +159,9 @@ async def _get_mm_data(row: Dict[str, Any], conversation, fut, uuid): if system_messages: conversation = system_messages + conversation - tasks.append( - asyncio.create_task( - _get_mm_data(row, conversation, mm_data_future, mm_uuids) - ) - ) + return row, conversation, mm_uuids, mm_data + + tasks = [asyncio.create_task(_process_row(row)) for row in batch] for task in asyncio.as_completed(tasks): row, conversation, uuid, multimodal_data = await task diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 4150cfcf2cc4..15b367ff2a46 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -7,20 +7,21 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest as vLLMChatCompletionRequest, ChatCompletionResponse as vLLMChatCompletionResponse, ChatCompletionStreamResponse as vLLMChatCompletionStreamResponse, +) +from vllm.entrypoints.openai.completion.protocol import ( CompletionRequest as vLLMCompletionRequest, CompletionResponse as vLLMCompletionResponse, CompletionStreamResponse as vLLMCompletionStreamResponse, - DetokenizeRequest as vLLMDetokenizeRequest, - DetokenizeResponse as vLLMDetokenizeResponse, +) +from vllm.entrypoints.openai.engine.protocol import ( ErrorInfo as vLLMErrorInfo, ErrorResponse as vLLMErrorResponse, - TokenizeChatRequest as vLLMTokenizeChatRequest, - TokenizeCompletionRequest as vLLMTokenizeCompletionRequest, - TokenizeResponse as vLLMTokenizeResponse, +) +from vllm.entrypoints.openai.translations.protocol import ( TranscriptionRequest as vLLMTranscriptionRequest, TranscriptionResponse as vLLMTranscriptionResponse, TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, @@ -31,8 +32,15 @@ EmbeddingResponse as vLLMEmbeddingResponse, ) from vllm.entrypoints.pooling.score.protocol import ( - ScoreRequest as vLLMScoreRequest, ScoreResponse as vLLMScoreResponse, + ScoreTextRequest as vLLMScoreTextRequest, +) +from vllm.entrypoints.serve.tokenize.protocol import ( + DetokenizeRequest as vLLMDetokenizeRequest, + DetokenizeResponse as vLLMDetokenizeResponse, + TokenizeChatRequest as vLLMTokenizeChatRequest, + TokenizeCompletionRequest as vLLMTokenizeCompletionRequest, + TokenizeResponse as vLLMTokenizeResponse, ) from vllm.utils import random_uuid @@ -107,7 +115,7 @@ class TranscriptionStreamResponse(vLLMTranscriptionStreamResponse): model_config = ConfigDict(arbitrary_types_allowed=True) -class ScoreRequest(vLLMScoreRequest): +class ScoreRequest(vLLMScoreTextRequest): model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index e4dd4e842052..5efb6043d92c 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -8,7 +8,7 @@ from starlette.requests import Request from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.cli_args import FrontendArgs -from vllm.entrypoints.openai.protocol import ErrorResponse as VLLMErrorResponse +from vllm.entrypoints.openai.engine.protocol import ErrorResponse as VLLMErrorResponse import ray from ray.llm._internal.common.callbacks.base import CallbackCtx @@ -426,7 +426,7 @@ def _start_async_llm_engine( return engine_client async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): - from vllm.entrypoints.openai.protocol import LoadLoRAAdapterRequest + from vllm.entrypoints.serve.lora.protocol import LoadLoRAAdapterRequest self._validate_openai_serving_models() diff --git a/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py b/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py index a860cab68c02..f0587dfdd521 100644 --- a/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py +++ b/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py @@ -348,7 +348,7 @@ async def test_vllm_wrapper_embed(model_opt_125m): [ ({}, True), ({"truncate_prompt_tokens": 3}, False), - ({"normalize": True}, False), + ({"use_activation": True}, False), ], ) async def test_vllm_wrapper_embed_pooling_params( diff --git a/python/requirements/llm/llm-override.txt b/python/requirements/llm/llm-override.txt new file mode 100644 index 000000000000..c269d4a13ed1 --- /dev/null +++ b/python/requirements/llm/llm-override.txt @@ -0,0 +1,9 @@ +# Override vLLM's torch==2.9.1+cpu requirement to allow CUDA variants +torch>=2.9.0 +# Override numpy constraint (vLLM requires opencv-python-headless>=4.13.0 which requires numpy>=2) +# Upper bound <2.3 due to cupy-cuda12x==13.4.0 compatibility +numpy>=2.0.0,<2.3 +# Override scipy to allow version compatible with numpy 2.x (scipy>=1.14 supports numpy 2.x) +scipy>=1.14.0 +# Override pandas to allow version compatible with numpy 2.x (pandas>=2.0 supports numpy 2.x) +pandas>=2.0.0 diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index 2caf7daaf5b0..3c329236560e 100644 --- a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -2,7 +2,7 @@ # constraining to a maximum version (i.e. <=) to temporarily work around a bug. # Those pins for the sake of workarounds should not be advertised as constraints # on future releases in setup.py. -vllm[audio]>=0.13.0 +vllm[audio]>=0.15.0 nixl>=0.6.1 anyio>=4.5.0 # For json mode diff --git a/python/requirements_compiled.txt b/python/requirements_compiled.txt index ed8f008db80e..f7a06056eef5 100644 --- a/python/requirements_compiled.txt +++ b/python/requirements_compiled.txt @@ -558,7 +558,7 @@ flask-cors==4.0.0 # via # -r python/requirements/ml/data-requirements.txt # moto -flatbuffers==23.5.26 +flatbuffers==25.12.19 # via # onnxruntime # tensorflow @@ -620,7 +620,7 @@ glfw==2.6.3 # mujoco glom==22.1.0 # via semgrep -google-api-core==2.24.2 +google-api-core==2.29.0 # via # -r python/requirements/ml/data-test-requirements.txt # google-api-python-client @@ -678,7 +678,7 @@ google-resumable-media==2.6.0 # via # google-cloud-bigquery # google-cloud-storage -googleapis-common-protos==1.61.0 +googleapis-common-protos==1.72.0 # via # google-api-core # grpcio-status @@ -702,7 +702,7 @@ graphviz==0.20.3 # via -r python/requirements/test-requirements.txt greenlet==3.0.1 # via sqlalchemy -grpcio==1.74.0 +grpcio==1.76.0 # via # -r python/requirements.txt # -r python/requirements/cloud-requirements.txt @@ -2394,6 +2394,7 @@ typing-extensions==4.15.0 # gradio # gradio-client # graphene + # grpcio # gymnasium # huggingface-hub # lightning-utilities diff --git a/python/setup.py b/python/setup.py index 76d146eed8f5..a04fa86b4677 100644 --- a/python/setup.py +++ b/python/setup.py @@ -365,7 +365,7 @@ def get_packages(self): setup_spec.extras["llm"] = list( set( [ - "vllm[audio]>=0.13.0", + "vllm[audio]>=0.15.0", "nixl>=0.6.1", # TODO(llm): remove after next vLLM version bump "transformers>=4.57.3", diff --git a/release/llm_tests/batch/test_batch_vllm.py b/release/llm_tests/batch/test_batch_vllm.py index b9b659b5a710..941a2278e3c6 100644 --- a/release/llm_tests/batch/test_batch_vllm.py +++ b/release/llm_tests/batch/test_batch_vllm.py @@ -56,16 +56,16 @@ async def test_vllm_multimodal_utils(): """Test vLLM's multimodal utilities. This test is adapted from https://github.com/vllm-project/vllm/blob/main/tests/entrypoints/test_chat_utils.py. - `parse_chat_messages_futures` is thoroughly tested in vLLM. This test serves as an + `parse_chat_messages_async` is thoroughly tested in vLLM. This test serves as an integration test to verify that the function isn't moved to an unexpected location and its signature isn't changed. """ from vllm.config import ModelConfig - from vllm.entrypoints.chat_utils import parse_chat_messages_futures + from vllm.entrypoints.chat_utils import parse_chat_messages_async image_url = "https://air-example-data.s3.us-west-2.amazonaws.com/rayllm-ossci/assets/cherry_blossom.jpg" image_uuid = str(hash(image_url)) - conversation, mm_future, mm_uuids = parse_chat_messages_futures( + conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", @@ -92,7 +92,6 @@ async def test_vllm_multimodal_utils(): {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] - mm_data = await mm_future assert mm_data is not None assert set(mm_data.keys()) == {"image"} diff --git a/release/ray_release/byod/llm_batch/llm_batch_single_node_benchmark_py311_cu128.lock b/release/ray_release/byod/llm_batch/llm_batch_single_node_benchmark_py311_cu128.lock index f850f7446964..b3b2cf62f78c 100644 --- a/release/ray_release/byod/llm_batch/llm_batch_single_node_benchmark_py311_cu128.lock +++ b/release/ray_release/byod/llm_batch/llm_batch_single_node_benchmark_py311_cu128.lock @@ -1421,6 +1421,7 @@ numpy==2.2.6 \ --hash=sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de \ --hash=sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8 # via + # --override python/requirements/llm/llm-override.txt # -r python/requirements.txt # cupy-cuda12x # datasets @@ -1579,6 +1580,7 @@ pandas==2.3.3 \ --hash=sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c \ --hash=sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee # via + # --override python/requirements/llm/llm-override.txt # -r python/requirements.txt # datasets platformdirs==4.5.0 \ @@ -2257,7 +2259,9 @@ scipy==1.16.3 \ --hash=sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa \ --hash=sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b \ --hash=sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d - # via -r python/requirements.txt + # via + # --override python/requirements/llm/llm-override.txt + # -r python/requirements.txt shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de