From 9a6f7723ab378a6640c5ff8111b6fd9e9b61a0b8 Mon Sep 17 00:00:00 2001
From: mgoin <mgoin64@gmail.com>
Date: Wed, 8 Oct 2025 17:17:21 -0400
Subject: [PATCH 1/4] Add FlashInfer as default CUDA dependency

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 requirements/cuda.txt    | 2 ++
 setup.py                 | 3 +--
 vllm/utils/flashinfer.py | 7 ++++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index ed03247bcf60..b71fd2b611f2 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -11,3 +11,5 @@ torchaudio==2.8.0
 torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
 xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.8
+# Keep this FlashInfer pin in sync with the version installed in docker/Dockerfile
+flashinfer-python==0.3.1
diff --git a/setup.py b/setup.py
index 53c460d2c5b8..60dde120d500 100644
--- a/setup.py
+++ b/setup.py
@@ -714,8 +714,7 @@ def _read_requirements(filename: str) -> list[str]:
             "mistral_common[audio]",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
-        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.3.1"],
+        "flashinfer": [],  # Kept for backwards compatibility
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 159d19bfad31..6d3105d85b12 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -12,6 +12,7 @@
 import importlib
 import importlib.util
 import os
+import shutil
 from typing import Any, Callable, NoReturn
 
 import requests
@@ -37,7 +38,11 @@ def has_flashinfer() -> bool:
     """Return ``True`` if FlashInfer is available."""
     # Use find_spec to check if the module exists without importing it
     # This avoids potential CUDA initialization side effects
-    return importlib.util.find_spec("flashinfer") is not None
+    # Also check if nvcc is available since it's required to use flashinfer
+    return (
+        importlib.util.find_spec("flashinfer") is not None
+        and shutil.which("nvcc") is not None
+    )
 
 
 def _missing(*_: Any, **__: Any) -> NoReturn:

From fb25fc3c6e95c7572f9e0e9aaeba3f62932f719d Mon Sep 17 00:00:00 2001
From: mgoin <mgoin64@gmail.com>
Date: Wed, 8 Oct 2025 17:28:13 -0400
Subject: [PATCH 2/4] Update dockerfile

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 docker/Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index f9df931e73b1..8ba85e89dfdf 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -396,7 +396,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
             echo "🏗️  Installing FlashInfer from pre-compiled wheel"
             uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
-                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+                --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
             if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
                 # Download pre-compiled cubins
                 TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
@@ -413,14 +413,14 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             # Install with no-build-isolation since we already built AOT kernels
             TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                 uv pip install --system --no-build-isolation . \
-                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+                --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
             # Download pre-compiled cubins
             TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                 python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
         else
             echo "🏗️  Installing FlashInfer without AOT compilation in JIT mode"
             uv pip install --system . \
-                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+                --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
         fi
     popd
     rm -rf flashinfer

From 4e020d8786fc9fa54bdef341406eef317b5c58e3 Mon Sep 17 00:00:00 2001
From: mgoin <mgoin64@gmail.com>
Date: Wed, 8 Oct 2025 18:27:55 -0400
Subject: [PATCH 3/4] Add logs

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/utils/flashinfer.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 6d3105d85b12..2c6c0e062a7c 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -38,11 +38,14 @@ def has_flashinfer() -> bool:
     """Return ``True`` if FlashInfer is available."""
     # Use find_spec to check if the module exists without importing it
     # This avoids potential CUDA initialization side effects
-    # Also check if nvcc is available since it's required to use flashinfer
-    return (
-        importlib.util.find_spec("flashinfer") is not None
-        and shutil.which("nvcc") is not None
-    )
+    if importlib.util.find_spec("flashinfer") is None:
+        logger.debug_once("FlashInfer unavailable since package was not found")
+        return False
+    # Also require nvcc on PATH, since FlashInfer needs it to JIT-compile kernels
+    if shutil.which("nvcc") is None:
+        logger.debug_once("FlashInfer unavailable since nvcc was not found")
+        return False
+    return True
 
 
 def _missing(*_: Any, **__: Any) -> NoReturn:

From a70d88d0bb3a7bc23c3743568041276c0678dc37 Mon Sep 17 00:00:00 2001
From: mgoin <mgoin64@gmail.com>
Date: Thu, 9 Oct 2025 11:11:50 -0400
Subject: [PATCH 4/4] Update dockerfile

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 docker/Dockerfile | 77 +++++------------------------------------------
 1 file changed, 8 insertions(+), 69 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b48dbca7263f..3a0db3cc49f6 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# If we need to build FlashInfer wheel before its release:
-# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
-# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-# $ cd flashinfer
-# $ git checkout v0.2.6.post1
-# $ python -m flashinfer.aot
-# $ python -m build --no-isolation --wheel
-# $ ls -la dist
-# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
-
-# Install FlashInfer from source
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.4.0"
-# Flag to control whether to compile FlashInfer AOT kernels
-# Set to "true" to enable AOT compilation:
-# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
-ARG FLASHINFER_AOT_COMPILE=false
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-  . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    pushd flashinfer
-        if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
-            # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
-            echo "🏗️  Installing FlashInfer from pre-compiled wheel"
-            uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
-                --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-            if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-                # Download pre-compiled cubins
-                TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                    python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-            fi
-        elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-            echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-            export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
-            # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
-            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
-            # Build AOT kernels
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer.aot
-            # Install with no-build-isolation since we already built AOT kernels
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                uv pip install --system --no-build-isolation . \
-                --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-            # Download pre-compiled cubins
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
-        else
-            echo "🏗️  Installing FlashInfer without AOT compilation in JIT mode"
-            uv pip install --system . \
-                --reinstall --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-        fi
-    popd
-    rm -rf flashinfer
-BASH
+# Install FlashInfer pre-compiled kernel cache and binaries
+# https://docs.flashinfer.ai/installation.html
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system flashinfer-cubin==0.4.0 \
+    && uv pip install --system flashinfer-jit-cache==0.4.0 \
+        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+    && flashinfer show-config
+
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .