diff --git a/recipe/patches/0001-Search-for-the-CUDA-package-in-CMakeLists.patch b/recipe/patches/0001-Search-for-the-CUDA-package-in-CMakeLists.patch index 2a2ca01..7c27400 100644 --- a/recipe/patches/0001-Search-for-the-CUDA-package-in-CMakeLists.patch +++ b/recipe/patches/0001-Search-for-the-CUDA-package-in-CMakeLists.patch @@ -1,17 +1,17 @@ -From 7d8537c0d09b0817053f05f30a871bd7f3f9cca7 Mon Sep 17 00:00:00 2001 +From b4e6f75bcd35bfe2513be1a1c6c82c0720a08d8d Mon Sep 17 00:00:00 2001 From: Sherman Siu Date: Mon, 21 Jul 2025 05:01:32 -0400 -Subject: [PATCH 1/4] Search for the CUDA package in CMakeLists +Subject: [PATCH 1/6] Search for the CUDA package in CMakeLists --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 15db4a4f4..d86fab3f4 100644 +index 0129f8512..564445522 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -67,6 +67,7 @@ endif() +@@ -66,6 +66,7 @@ endif() append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # Ensure the 'nvcc' command is in the PATH diff --git a/recipe/patches/0002-Remove-ninja-pip-requirement.patch b/recipe/patches/0002-Remove-ninja-pip-requirement.patch index 77d077e..af3fdd6 100644 --- a/recipe/patches/0002-Remove-ninja-pip-requirement.patch +++ b/recipe/patches/0002-Remove-ninja-pip-requirement.patch @@ -1,18 +1,19 @@ -From 69d16023885cab08141882461ab01b3bf951d9de Mon Sep 17 00:00:00 2001 +From 643db40d812d663f825f99b028e5b32765ad266b Mon Sep 17 00:00:00 2001 From: Sherman Siu Date: Mon, 21 Jul 2025 05:04:55 -0400 -Subject: [PATCH 2/4] Remove ninja pip requirement +Subject: [PATCH 2/6] Remove ninja pip requirement --- requirements/common.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt -index 24a1e6d67..0f9cac6c8 100644 +index 8bc0be777..5d0fb5235 100644 --- a/requirements/common.txt +++ b/requirements/common.txt -@@ -42,4 +42,3 @@ cloudpickle # allows pickling lambda functions in 
model_executor/models/registry +@@ -43,5 +43,4 @@ cloudpickle # allows pickling lambda functions in model_executor/models/registry watchfiles # required for http server to monitor the updates of TLS files - python-json-logger # Used by logging as per examples/other/logging_configuration.md + python-json-logger # Used by logging as per examples/others/logging_configuration.md scipy # Required for phi-4-multimodal-instruct -ninja # Required for xgrammar, rocm, tpu, xpu + pybase64 # fast base64 implementation diff --git a/recipe/patches/0003-Manually-define-gettid.patch b/recipe/patches/0003-Manually-define-gettid.patch index b796c9c..c78e830 100644 --- a/recipe/patches/0003-Manually-define-gettid.patch +++ b/recipe/patches/0003-Manually-define-gettid.patch @@ -1,7 +1,7 @@ -From 9fe9a9d7e6dc4cf38e8b5c29f386d1b445c73af8 Mon Sep 17 00:00:00 2001 +From 8135446bc484b8edda531c1f81fd0ef9c4d314be Mon Sep 17 00:00:00 2001 From: Sherman Siu Date: Mon, 21 Jul 2025 05:05:36 -0400 -Subject: [PATCH 3/4] Manually define gettid +Subject: [PATCH 3/6] Manually define gettid - `gettid` is not defined by default until glibc 2.30: see https://stackoverflow.com/questions/30680550/c-gettid-was-not-declared-in-this-scope for details --- @@ -9,10 +9,10 @@ Subject: [PATCH 3/4] Manually define gettid 1 file changed, 3 insertions(+) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp -index 79771ecd9..e21efc92a 100644 +index 02514edce..e605da905 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp -@@ -7,6 +7,9 @@ +@@ -12,6 +12,9 @@ #include "cpu_types.hpp" diff --git a/recipe/patches/0004-Factor-in-the-cmake-args-when-building-e.g.-for-cros.patch b/recipe/patches/0004-Factor-in-the-cmake-args-when-building-e.g.-for-cros.patch index e0a7017..8d4f7ce 100644 --- a/recipe/patches/0004-Factor-in-the-cmake-args-when-building-e.g.-for-cros.patch +++ b/recipe/patches/0004-Factor-in-the-cmake-args-when-building-e.g.-for-cros.patch @@ -1,7 +1,7 @@ -From 940ed92c475e6b14e94acba9bd376dde9c47733a Mon 
Sep 17 00:00:00 2001 +From 5e1f64fe4ca4a4a6fc9e0f2adcad3d04469b74dd Mon Sep 17 00:00:00 2001 From: Sherman Siu Date: Mon, 21 Jul 2025 05:08:38 -0400 -Subject: [PATCH 4/4] Factor in the cmake args when building, e.g. for +Subject: [PATCH 4/6] Factor in the cmake args when building, e.g. for cross-compilation --- @@ -9,10 +9,10 @@ Subject: [PATCH 4/4] Factor in the cmake args when building, e.g. for 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py -index b0cc2f481..4068094eb 100755 +index ea7cd0169..42a1a8e8a 100644 --- a/setup.py +++ b/setup.py -@@ -204,6 +204,9 @@ class cmake_build_ext(build_ext): +@@ -205,6 +205,9 @@ class cmake_build_ext(build_ext): # Make sure we use the nvcc from CUDA_HOME if _is_cuda(): cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc'] diff --git a/recipe/patches/0005-Configure-build-to-target-aarch64-even-though-CMake-.patch b/recipe/patches/0005-Configure-build-to-target-aarch64-even-though-CMake-.patch index 2063839..b649f62 100644 --- a/recipe/patches/0005-Configure-build-to-target-aarch64-even-though-CMake-.patch +++ b/recipe/patches/0005-Configure-build-to-target-aarch64-even-though-CMake-.patch @@ -1,18 +1,18 @@ -From 49056e431b67064eec4f284a301c92ab26a9dbd8 Mon Sep 17 00:00:00 2001 +From 0edd58495c6962b1f478a7040bbad7f18e2e20f8 Mon Sep 17 00:00:00 2001 From: Sherman Siu -Date: Fri, 1 Aug 2025 05:10:59 -0400 -Subject: [PATCH 5/5] Configure build to target aarch64 even though CMake finds - x86_64 hardware +Date: Sun, 31 Aug 2025 20:37:56 -0400 +Subject: [PATCH 5/6] Configure build to target aarch64 even though CMake finds + x86_64 hardware --- cmake/cpu_extension.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake -index fdc03a795..3142a1452 100644 +index fc7291972..c745e20c5 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake -@@ -85,6 +85,11 @@ else() +@@ -83,6 +83,11 @@ else() endif() diff --git 
a/recipe/patches/0006-Use-PyTorch-2.7.0-to-keep-version-number-consistent-.patch b/recipe/patches/0006-Use-PyTorch-2.7.0-to-keep-version-number-consistent-.patch new file mode 100644 index 0000000..5c47da3 --- /dev/null +++ b/recipe/patches/0006-Use-PyTorch-2.7.0-to-keep-version-number-consistent-.patch @@ -0,0 +1,37 @@ +From 9c532a9b58b65017ae517bacbe438d9d71f71891 Mon Sep 17 00:00:00 2001 +From: Sherman Siu +Date: Sun, 31 Aug 2025 18:08:54 -0400 +Subject: [PATCH 6/6] Use PyTorch 2.7.0 to keep version number consistent + across builds + +--- + requirements/cpu-build.txt | 2 +- + requirements/cpu.txt | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt +index 37f072202..91f6cf297 100644 +--- a/requirements/cpu-build.txt ++++ b/requirements/cpu-build.txt +@@ -6,7 +6,7 @@ packaging>=24.2 + setuptools>=77.0.3,<80.0.0 + setuptools-scm>=8 + --extra-index-url https://download.pytorch.org/whl/cpu +-torch==2.6.0+cpu ++torch==2.7.0 + wheel + jinja2>=3.1.6 + regex +diff --git a/requirements/cpu.txt b/requirements/cpu.txt +index df3a33935..29e53b997 100644 +--- a/requirements/cpu.txt ++++ b/requirements/cpu.txt +@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' + packaging>=24.2 + setuptools>=77.0.3,<80.0.0 + --extra-index-url https://download.pytorch.org/whl/cpu +-torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 ++torch==2.7.0; platform_machine == "x86_64" # NOTE: upstream pinned 2.6.0+cpu here because torch>2.6.0+cpu has a performance regression on x86 (see https://github.com/pytorch/pytorch/pull/151218); overridden to keep the version consistent across builds + torch==2.7.0; platform_system == "Darwin" + torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" + diff --git a/recipe/recipe.yaml b/recipe/recipe.yaml index 2ff3aca..b5d53e0 100644 --- a/recipe/recipe.yaml +++ b/recipe/recipe.yaml @@ -1,7 +1,7 @@ context: - version: 0.8.3 - pytorch_version: 
2.6.0 + version: 0.9.2 use_cuda: ${{ cuda_compiler_version != "None" }} + pytorch_version: 2.7.1 # The solver refuses to pick up 2.7.0 for some unknown reason vllm_target_device: ${{ "cuda" if use_cuda else "cpu" }} cuda_build_string: cuda_${{ cuda_compiler_version | version_to_buildstring }} string_prefix: ${{ cuda_build_string if cuda_compiler_version != "None" else "cpu_" }} @@ -13,7 +13,7 @@ package: source: - url: https://pypi.org/packages/source/v/vllm/vllm-${{ version }}.tar.gz - sha256: 475a39d1093b8ef8a905d63eafe0c6c9b8f4f4c2ae2d23f1f3d0fae5e37bb4bd + sha256: 6b0d855ea8ba18d76364c9b82ea94bfcaa9c9e724055438b5733e4716ed104e1 patches: - patches/0001-Search-for-the-CUDA-package-in-CMakeLists.patch - patches/0002-Remove-ninja-pip-requirement.patch @@ -26,14 +26,15 @@ source: - if: aarch64 then: - patches/0005-Configure-build-to-target-aarch64-even-though-CMake-.patch + - patches/0006-Use-PyTorch-2.7.0-to-keep-version-number-consistent-.patch target_directory: vllm # Needs to be vendored because vLLM uses a modified version of the flash attention primitives that supports KV-caching. 
-- url: https://github.com/vllm-project/flash-attention/archive/d637d8927a35922ce6f6c0dff6dd3f765ed71f3c.tar.gz - sha256: 3099add00c9938735b84319d176c5b239c0165e3f9be6540a7a3505cd897c7cd +- url: https://github.com/vllm-project/flash-attention/archive/1c2624e53c078854e0637ee566c72fe2107e75f4.tar.gz + sha256: cca19d7e53af08aa6d6f0c4fd9dd78d30314497e38fb03b1368b3d5a77ab4b5c target_directory: flash-attention build: - number: 4 + number: 0 string: ${{ string_prefix }}py${{ python | version_to_buildstring }}h${{ hash }}_${{ build_number }} script: - sed -i.bak 's/set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")/set(TORCH_SUPPORTED_VERSION_CUDA "${{ pytorch_version }}")/g' flash-attention/CMakeLists.txt @@ -65,7 +66,7 @@ build: requirements: build: - - cmake >=3.26 + - cmake >=3.26.1 - git - ninja - zlib @@ -86,10 +87,11 @@ requirements: host: - python - jinja2 >=3.1.6 - - packaging + - packaging >=24.2 - pip - pytorch ==${{ pytorch_version }} - - setuptools >=61 + - regex + - setuptools >=77.0.3,<80.0.0 - setuptools-scm >=8 - wheel - if: linux @@ -112,22 +114,22 @@ requirements: - blake3 - cachetools - cloudpickle - - compressed-tensors ==0.9.2 + - compressed-tensors ==0.10.2 - depyf ==0.18.0 - einops - fastapi >=0.115.0 - filelock >=3.16.1 - - gguf ==0.10.0 + - gguf >=0.13.0 - importlib-metadata - - hf-xet >=0.1.4 - - huggingface_hub >=0.30.0 + - hf-xet >=1.1.2,<2.0.0 + - huggingface_hub >=0.33.0 - lark ==1.2.2 - - llguidance >=0.7.9,<0.8.0 - lm-format-enforcer >=0.10.11,<0.11 - - mistral-common >=1.5.4 + - mistral-common >=1.6.2 - msgspec + - numba ==0.61.2 - numpy - - openai >=1.52.0 + - openai >=1.52.0,<=1.90.0 - opencv >=4.11.0 - outlines ==0.1.11 - partial-json-parser @@ -137,44 +139,53 @@ requirements: - protobuf - psutil - py-cpuinfo - - pydantic >=2.9 + - pybase64 + - pydantic >=2.10 - python-json-logger - pytorch ==${{ pytorch_version }} - pyyaml - - pyzmq + - pyzmq >=25.0.0 + - regex - requests >=2.26.0 - scipy - sentencepiece - tiktoken >=0.6.0 - - tokenizers 
>=0.19.1 + - tokenizers >=0.21.1 - tqdm # Newer versions of transformers already define the aimv2 config, so we can't use it for now # See https://github.com/vllm-project/vllm-ascend/issues/2046#issuecomment-3123639101 for more details. # The required fix: https://github.com/vllm-project/vllm/commit/3fc964433a84bad785d9d0656fd56195462321b8 - - transformers >=4.51.0,<4.54.0 + - transformers >=4.51.1,<4.54.0 - typing_extensions >=4.10 - uvicorn-standard - watchfiles - - if: x86_64 or aarch64 + - if: x86_64 or arm64 or aarch64 then: - - xgrammar ==0.1.17 + - llguidance >=0.7.11,< 0.8.0 + - xgrammar ==0.1.19 - if: match(python, ">3.11") then: - six >=1.16.0 - - setuptools >=74.1.1 + - setuptools >=77.0.3,<80 - if: use_cuda then: - - numba ==0.61 - ray-cgraph >=2.43.0,!=2.44 - torchaudio ==${{ pytorch_version }} - - torchvision ==0.21.0 + - torchvision ==0.22.0 - if: linux64 then: - - xformers ==0.0.29.post2 + - xformers ==0.0.30 # platform_system == "Linux" and platform_machine == "x86_64" else: - torchaudio - torchvision + - if: x86_64 + then: + - triton ==3.2.0 run_constraints: + # Fixes issue with incompatibility between old `datasets` versions and `pyarrow` v21+ + # See https://github.com/apache/arrow/issues/47155 for more details. 
+ # The required PR is: https://github.com/huggingface/datasets/pull/6404 + - datasets >=2.15 - if: use_cuda then: - pytorch * [build=cuda*] @@ -189,11 +200,13 @@ tests: - if: linux and use_cuda then: - vllm.vllm_flash_attn - # Disable until opentelemetry-prometheus-exporter has fixed constraints - # https://github.com/conda-forge/opentelemetry-exporter-prometheus-feedstock/pull/24 pip_check: false - script: - - vllm --version + # As of vllm v0.9 and later, it seems like libcuda.so.1 is required for the CLI for CUDA builds (stub libraries don't work) + # We can't test this on the CPU runners, which is what we're using to build the wheel + - if: not use_cuda + then: + - vllm --version - script: # Pick an arbitrary test to run: some of the other ones rely on a bunch of external packages - pytest ./vllm/tests/core/test_scheduler.py