Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 8 additions & 69 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -356,75 +356,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout v0.2.6.post1
# $ python -m flashinfer.aot
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.4.0"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
ARG FLASHINFER_AOT_COMPILE=false
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
pushd flashinfer
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ] && [ "${FLASHINFER_GIT_REF}" = "v0.3.1" ]; then
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
fi
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
# Build AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
# Install with no-build-isolation since we already built AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
else
echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
uv pip install --system . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
fi
popd
rm -rf flashinfer
BASH
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.4.0 \
&& uv pip install --system flashinfer-jit-cache==0.4.0 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config

COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
Expand Down
2 changes: 2 additions & 0 deletions requirements/cuda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ torchaudio==2.8.0
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.4.0
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,8 +714,7 @@ def _read_requirements(filename: str) -> list[str]:
"mistral_common[audio]",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.4.0"],
"flashinfer": [], # Kept for backwards compatibility
# Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"],
},
Expand Down
10 changes: 9 additions & 1 deletion vllm/utils/flashinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import importlib
import importlib.util
import os
import shutil
from typing import Any, Callable, NoReturn

import requests
Expand All @@ -37,7 +38,14 @@ def has_flashinfer() -> bool:
"""Return ``True`` if FlashInfer is available."""
# Use find_spec to check if the module exists without importing it
# This avoids potential CUDA initialization side effects
return importlib.util.find_spec("flashinfer") is not None
if importlib.util.find_spec("flashinfer") is None:
logger.debug_once("FlashInfer unavailable since package was not found")
return False
# Also check if nvcc is available since it's required to JIT compile flashinfer
if shutil.which("nvcc") is None:
logger.debug_once("FlashInfer unavailable since nvcc was not found")
return False
return True


def _missing(*_: Any, **__: Any) -> NoReturn:
Expand Down