From 08b62eb0f96c966a50d675739c0b33df2576d200 Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 25 Sep 2025 21:42:45 -0400 Subject: [PATCH 1/6] Fix FlashInfer AOT in release docker image Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 2 +- docker/Dockerfile | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 8c6ef7817aaf..7677d783fabc 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -76,7 +76,7 @@ steps: queue: arm64_cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # Add job to create multi-arch manifest diff --git a/docker/Dockerfile b/docker/Dockerfile index 034f73736ca7..4daa69e0dc5a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -404,16 +404,14 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" fi echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" # Build AOT kernels - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer.aot + python3 -m flashinfer.aot # Install with no-build-isolation since we already built AOT kernels - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation . \ + uv pip install --system --no-build-isolation . \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # Download pre-compiled cubins - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." else echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" uv pip install --system . \ From a07651094beda82be6a6e00e61fa4fdffd2c0113 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 25 Sep 2025 22:01:53 -0600 Subject: [PATCH 2/6] Add installation of cuda-python in Dockerfile --- docker/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 4daa69e0dc5a..78b26f75d496 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -406,6 +406,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" # Build AOT kernels + uv pip install --system cuda-python python3 -m flashinfer.aot # Install with no-build-isolation since we already built AOT kernels uv pip install --system --no-build-isolation . \ From 03bf75908f8e4a86188630be43db56ca8d0ae424 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 25 Sep 2025 22:03:16 -0600 Subject: [PATCH 3/6] Pass TORCH_CUDA_ARCH_LIST to FlashInfer commands --- docker/Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 78b26f75d496..96efe73b51e1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -407,12 +407,15 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" # Build AOT kernels uv pip install --system cuda-python - python3 -m flashinfer.aot + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer.aot # Install with no-build-isolation since we already built AOT kernels - uv pip install --system --no-build-isolation . \ + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + uv pip install --system --no-build-isolation . \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # Download pre-compiled cubins - python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." else echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" uv pip install --system . \ From ff2f1574d759a3e8f7d8b241f17b31109be14414 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Sep 2025 08:03:29 -0600 Subject: [PATCH 4/6] Specify cuda-python version in Dockerfile --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 96efe73b51e1..8fabff7dcfa3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -406,7 +406,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" # Build AOT kernels - uv pip install --system cuda-python + uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ python3 -m flashinfer.aot # Install with no-build-isolation since we already built AOT kernels From 410cb52b97200c252360409993960b4f65dd201e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Sep 2025 08:54:43 -0600 Subject: [PATCH 5/6] Remove redundant cuda-python installation command --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 8fabff7dcfa3..5baf833722c5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -405,8 +405,9 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' fi echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" + # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future + uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) # Build AOT kernels - uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ python3 -m flashinfer.aot # Install with no-build-isolation since we already built AOT kernels From 0919a24f805d9da0989dd607ecde725e89b1568e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Sep 2025 10:28:49 -0600 Subject: [PATCH 6/6] Add nvidia-nvshmem-cu package installation --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 5baf833722c5..c0f55a7eeba0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -406,7 +406,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future - uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) + uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1) # Build AOT kernels TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ python3 -m flashinfer.aot