Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
ARG CUDA_VERSION=13.0.2
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=22.04
ARG FLASHINFER_VERSION=0.6.10

# By parameterizing the base images, we allow third-party to use their own
# base images. One use case is hermetic builds with base images stored in
Expand Down Expand Up @@ -101,6 +102,7 @@ FROM ${BUILD_BASE_IMAGE} AS base

ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG FLASHINFER_VERSION
ARG BUILD_OS

ENV DEBIAN_FRONTEND=noninteractive
Expand Down Expand Up @@ -212,6 +214,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi

# `flashinfer-python` is already installed via requirements/cuda.txt above;
# this only activates its `[cu13]` extra (cu13 deps for the SM100 GDN kernel).
# Guard compares the major component of CUDA_VERSION (e.g. "13.0.2" -> "13"),
# so the extra is skipped entirely on cu12 builds.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
        uv pip install --python /opt/venv/bin/python3 \
            "flashinfer-python[cu13]==${FLASHINFER_VERSION}"; \
    fi

# Track PyTorch lib versions used during build and match in downstream instances.
# We do this for both nightly and release so we can strip dependencies/*.txt as needed.
# Otherwise library dependencies can upgrade/downgrade torch incorrectly.
Expand Down Expand Up @@ -519,6 +529,7 @@ FROM ${FINAL_BASE_IMAGE} AS vllm-base

ARG CUDA_VERSION
ARG PYTHON_VERSION
ARG FLASHINFER_VERSION
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
Expand Down Expand Up @@ -620,10 +631,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
rm /tmp/requirements-cuda.txt /tmp/common.txt

# `flashinfer-python` is already installed via requirements/cuda.txt above;
# this only activates its `[cu13]` extra (cu13 deps for the SM100 GDN kernel).
# Mirrors the build-stage step, but targets the system interpreter of the
# final image (`--system`) rather than the /opt/venv used during the build.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
        uv pip install --system \
            "flashinfer-python[cu13]==${FLASHINFER_VERSION}"; \
    fi

# Install FlashInfer JIT cache (requires CUDA-version-specific index URL)
# https://docs.flashinfer.ai/installation.html
# From versions.json: .flashinfer.version
#
# NOTE: FLASHINFER_VERSION is already declared for this stage (inheriting the
# global default from versions.json). Do NOT redeclare it here with a pinned
# default — a stale `ARG FLASHINFER_VERSION=0.6.8.post1` at this point would
# silently override the inherited value for the remainder of the stage, making
# flashinfer-jit-cache (0.6.8.post1) mismatch the flashinfer-python install
# above (0.6.10) whenever no --build-arg is supplied.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile.nightly_torch
Original file line number Diff line number Diff line change
Expand Up @@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


# build flashinfer for torch nightly from source around 10 mins
# release version: v0.6.8.post1
# release version: v0.6.10
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo "git clone flashinfer..." \
&& git clone --depth 1 --branch v0.6.8.post1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& git clone --depth 1 --branch v0.6.10 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \
&& git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \
Expand Down
6 changes: 3 additions & 3 deletions docker/versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
"UBUNTU_VERSION": {
"default": "22.04"
},
"FLASHINFER_VERSION": {
"default": "0.6.10"
},
"BUILD_BASE_IMAGE": {
"default": "nvidia/cuda:13.0.2-devel-ubuntu22.04"
},
Expand Down Expand Up @@ -67,9 +70,6 @@
"RUN_WHEEL_CHECK": {
"default": "true"
},
"FLASHINFER_VERSION": {
"default": "0.6.8.post1"
},
"GDRCOPY_CUDA_VERSION": {
"default": "12.8"
},
Expand Down
4 changes: 2 additions & 2 deletions requirements/cuda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ torchaudio==2.11.0
# These must be updated alongside torch
torchvision==0.26.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.8.post1
flashinfer-cubin==0.6.8.post1
flashinfer-python==0.6.10
flashinfer-cubin==0.6.10
apache-tvm-ffi==0.1.9
tilelang==0.1.9
# Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
Expand Down
8 changes: 8 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,6 +969,14 @@ def _read_requirements(filename: str) -> list[str]:
# vllm-flash-attn is built only for CUDA 12.x.
# Skip for other versions.
continue
if req.startswith("flashinfer-python") and cuda_major == "13":
# Activate FI's `[cu13]` extra on cu13 builds (cu13 deps for
# the SM100 GDN kernel). Mirrors the Dockerfile cu13 path.
req = req.replace(
"flashinfer-python",
"flashinfer-python[cu13]",
1,
)
modified_requirements.append(req)
requirements = modified_requirements
elif _is_hip():
Expand Down
Loading