From f3361534f9362a0cb8c210684736136cd54c9ddb Mon Sep 17 00:00:00 2001
From: Artem Perevedentsev
Date: Tue, 5 May 2026 10:58:44 +0300
Subject: [PATCH] [CI/Build] Bump flashinfer to v0.6.10

Signed-off-by: Artem Perevedentsev
---
 docker/Dockerfile               | 20 +++++++++++++++++++-
 docker/Dockerfile.nightly_torch |  4 ++--
 docker/versions.json            |  6 +++---
 requirements/cuda.txt           |  4 ++--
 setup.py                        |  8 ++++++++
 5 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index fd0622e2416a..649c1f488958 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -25,6 +25,7 @@
 ARG CUDA_VERSION=13.0.2
 ARG PYTHON_VERSION=3.12
 ARG UBUNTU_VERSION=22.04
+ARG FLASHINFER_VERSION=0.6.10
 
 # By parameterizing the base images, we allow third-party to use their own
 # base images. One use case is hermetic builds with base images stored in
@@ -101,6 +102,7 @@ FROM ${BUILD_BASE_IMAGE} AS base
 
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
+ARG FLASHINFER_VERSION
 ARG BUILD_OS
 
 ENV DEBIAN_FRONTEND=noninteractive
@@ -212,6 +214,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
     fi
 
+# `flashinfer-python` is already installed via requirements/cuda.txt above;
+# this only activates its `[cu13]` extra (cu13 deps for the SM100 GDN kernel).
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
+        uv pip install --python /opt/venv/bin/python3 \
+            "flashinfer-python[cu13]==${FLASHINFER_VERSION}"; \
+    fi
+
 # Track PyTorch lib versions used during build and match in downstream instances.
 # We do this for both nightly and release so we can strip dependencies/*.txt as needed.
 # Otherwise library dependencies can upgrade/downgrade torch incorrectly.
@@ -519,6 +529,7 @@ FROM ${FINAL_BASE_IMAGE} AS vllm-base
 
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
+ARG FLASHINFER_VERSION
 ARG DEADSNAKES_MIRROR_URL
 ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL
@@ -620,10 +631,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
     rm /tmp/requirements-cuda.txt /tmp/common.txt
 
+# `flashinfer-python` is already installed via requirements/cuda.txt above;
+# this only activates its `[cu13]` extra (cu13 deps for the SM100 GDN kernel).
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
+        uv pip install --system \
+            "flashinfer-python[cu13]==${FLASHINFER_VERSION}"; \
+    fi
+
 # Install FlashInfer JIT cache (requires CUDA-version-specific index URL)
 # https://docs.flashinfer.ai/installation.html
 # From versions.json: .flashinfer.version
-ARG FLASHINFER_VERSION=0.6.8.post1
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 6b9d85c9d17f..a6038b02f512 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.6.8.post1
+# release version: v0.6.10
 # todo(elainewy): cache flashinfer build result for faster build
 
 ENV CCACHE_DIR=/root/.cache/ccache
 
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
-    && git clone --depth 1 --branch v0.6.8.post1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.10 --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
diff --git a/docker/versions.json b/docker/versions.json
index 75652823db0b..2cb58c9773be 100644
--- a/docker/versions.json
+++ b/docker/versions.json
@@ -10,6 +10,9 @@
   "UBUNTU_VERSION": {
     "default": "22.04"
   },
+  "FLASHINFER_VERSION": {
+    "default": "0.6.10"
+  },
   "BUILD_BASE_IMAGE": {
     "default": "nvidia/cuda:13.0.2-devel-ubuntu22.04"
   },
@@ -67,9 +70,6 @@
   "RUN_WHEEL_CHECK": {
     "default": "true"
   },
-  "FLASHINFER_VERSION": {
-    "default": "0.6.8.post1"
-  },
   "GDRCOPY_CUDA_VERSION": {
     "default": "12.8"
   },
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index abff2525af9e..9df072ccefc3 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -9,8 +9,8 @@ torchaudio==2.11.0 # These must be updated alongside torch
 torchvision==0.26.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.6.8.post1
-flashinfer-cubin==0.6.8.post1
+flashinfer-python==0.6.10
+flashinfer-cubin==0.6.10
 apache-tvm-ffi==0.1.9
 tilelang==0.1.9
 # Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
diff --git a/setup.py b/setup.py
index 7c226a72425f..4d790d289cf2 100644
--- a/setup.py
+++ b/setup.py
@@ -969,6 +969,14 @@ def _read_requirements(filename: str) -> list[str]:
                 # vllm-flash-attn is built only for CUDA 12.x.
                 # Skip for other versions.
                 continue
+            if req.startswith("flashinfer-python") and cuda_major == "13":
+                # Activate FI's `[cu13]` extra on cu13 builds (cu13 deps for
+                # the SM100 GDN kernel). Mirrors the Dockerfile cu13 path.
+                req = req.replace(
+                    "flashinfer-python",
+                    "flashinfer-python[cu13]",
+                    1,
+                )
             modified_requirements.append(req)
         requirements = modified_requirements
     elif _is_hip():
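
Reviewer note: the setup.py hunk only rewrites a requirement string. Below is a
minimal standalone sketch of that rewrite so it can be sanity-checked outside
the build; the function name `activate_cu13_extra` and the hard-coded input
list are illustrative stand-ins, not code from this patch.

def activate_cu13_extra(requirements: list[str], cuda_major: str) -> list[str]:
    # Mirrors the setup.py hunk: tack `[cu13]` onto flashinfer-python
    # when building against CUDA 13.
    modified = []
    for req in requirements:
        if req.startswith("flashinfer-python") and cuda_major == "13":
            # count=1 rewrites only the package name, leaving the
            # version pin (==0.6.10) untouched.
            req = req.replace("flashinfer-python", "flashinfer-python[cu13]", 1)
        modified.append(req)
    return modified

print(activate_cu13_extra(
    ["flashinfer-python==0.6.10", "flashinfer-cubin==0.6.10"], "13"))
# ['flashinfer-python[cu13]==0.6.10', 'flashinfer-cubin==0.6.10']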
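
Both new Dockerfile RUN steps are gated on `[ "${CUDA_VERSION%%.*}" = "13" ]`:
`${CUDA_VERSION%%.*}` strips the longest suffix matching `.*`, i.e. everything
from the first dot onward, leaving the major version. A Python rendering of
the same check, for reference (the helper name is ours, not the Dockerfile's):

def is_cuda_13(cuda_version: str) -> bool:
    # Shell-guard equivalent: keep only the text before the first '.'.
    return cuda_version.split(".", 1)[0] == "13"

assert is_cuda_13("13.0.2")      # CUDA_VERSION default in this Dockerfile
assert not is_cuda_13("12.8.1")  # cu12 builds skip the [cu13] extra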