From 9a6f7723ab378a6640c5ff8111b6fd9e9b61a0b8 Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 8 Oct 2025 17:17:21 -0400 Subject: [PATCH 1/4] Add FlashInfer as default CUDA dependency Signed-off-by: mgoin --- requirements/cuda.txt | 2 ++ setup.py | 3 +-- vllm/utils/flashinfer.py | 7 ++++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/requirements/cuda.txt b/requirements/cuda.txt index ed03247bcf60..b71fd2b611f2 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -11,3 +11,5 @@ torchaudio==2.8.0 torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 +# FlashInfer should be updated together with the Dockerfile +flashinfer-python==0.3.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 53c460d2c5b8..60dde120d500 100644 --- a/setup.py +++ b/setup.py @@ -714,8 +714,7 @@ def _read_requirements(filename: str) -> list[str]: "mistral_common[audio]", ], # Required for audio processing "video": [], # Kept for backwards compatibility - # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.3.1"], + "flashinfer": [], # Kept for backwards compatibility # Optional deps for AMD FP4 quantization support "petit-kernel": ["petit-kernel"], }, diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 159d19bfad31..6d3105d85b12 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -12,6 +12,7 @@ import importlib import importlib.util import os +import shutil from typing import Any, Callable, NoReturn import requests @@ -37,7 +38,11 @@ def has_flashinfer() -> bool: """Return ``True`` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects - return importlib.util.find_spec("flashinfer") is not None + # Also check if nvcc is available since it's required to use flashinfer + return ( + importlib.util.find_spec("flashinfer") is not None + and shutil.which("nvcc") is not None + ) def _missing(*_: Any, **__: Any) -> NoReturn: From fb25fc3c6e95c7572f9e0e9aaeba3f62932f719d Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 8 Oct 2025 17:28:13 -0400 Subject: [PATCH 2/4] Update dockerfile Signed-off-by: mgoin --- docker/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f9df931e73b1..8ba85e89dfdf 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -396,7 +396,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh echo "🏗️ Installing FlashInfer from pre-compiled wheel" uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \ - --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then # Download pre-compiled cubins TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ @@ -413,14 +413,14 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' # Install with no-build-isolation since we already built AOT kernels TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ uv pip install --system --no-build-isolation . \ - --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # Download pre-compiled cubins TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." else echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" uv pip install --system . \ - --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') fi popd rm -rf flashinfer From 4e020d8786fc9fa54bdef341406eef317b5c58e3 Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 8 Oct 2025 18:27:55 -0400 Subject: [PATCH 3/4] Add logs Signed-off-by: mgoin --- vllm/utils/flashinfer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 6d3105d85b12..2c6c0e062a7c 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -38,11 +38,14 @@ def has_flashinfer() -> bool: """Return ``True`` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects - # Also check if nvcc is available since it's required to use flashinfer - return ( - importlib.util.find_spec("flashinfer") is not None - and shutil.which("nvcc") is not None - ) + if importlib.util.find_spec("flashinfer") is None: + logger.debug_once("FlashInfer unavailable since package was not found") + return False + # Also check if nvcc is available since it's required to JIT compile flashinfer + if shutil.which("nvcc") is None: + logger.debug_once("FlashInfer unavailable since nvcc was not found") + return False + return True def _missing(*_: Any, **__: Any) -> NoReturn: From a70d88d0bb3a7bc23c3743568041276c0678dc37 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 9 Oct 2025 11:11:50 -0400 Subject: [PATCH 4/4] Update dockerfile Signed-off-by: mgoin --- docker/Dockerfile | 77 +++++------------------------------------------ 1 file changed, 8 insertions(+), 69 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b48dbca7263f..3a0db3cc49f6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist uv pip install --system dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# If we need to build FlashInfer wheel before its release: -# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ -# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' -# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive -# $ cd flashinfer -# $ git checkout v0.2.6.post1 -# $ python -m flashinfer.aot -# $ python -m build --no-isolation --wheel -# $ ls -la dist -# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl -# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl - -# Install FlashInfer from source -ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.4.0" -# Flag to control whether to compile FlashInfer AOT kernels -# Set to "true" to enable AOT compilation: -# docker build --build-arg FLASHINFER_AOT_COMPILE=true ... -ARG FLASHINFER_AOT_COMPILE=false -RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' - . /etc/environment - git clone --depth 1 --recursive --shallow-submodules \ - --branch ${FLASHINFER_GIT_REF} \ - ${FLASHINFER_GIT_REPO} flashinfer - # Exclude CUDA arches for older versions (11.x and 12.0-12.7) - # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. - if [[ "${CUDA_VERSION}" == 11.* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" - elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" - else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" - fi - pushd flashinfer - if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then - # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh - echo "🏗️ Installing FlashInfer from pre-compiled wheel" - uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \ - --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') - if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then - # Download pre-compiled cubins - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." - fi - elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then - echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" - export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" - # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future - uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1) - # Build AOT kernels - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer.aot - # Install with no-build-isolation since we already built AOT kernels - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation . \ - --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') - # Download pre-compiled cubins - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." - else - echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" - uv pip install --system . \ - --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') - fi - popd - rm -rf flashinfer -BASH +# Install FlashInfer pre-compiled kernel cache and binaries +# https://docs.flashinfer.ai/installation.html +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system flashinfer-cubin==0.4.0 \ + && uv pip install --system flashinfer-jit-cache==0.4.0 \ + --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ + && flashinfer show-config + COPY examples examples COPY benchmarks benchmarks COPY ./vllm/collect_env.py .