From 08b62eb0f96c966a50d675739c0b33df2576d200 Mon Sep 17 00:00:00 2001
From: mgoin <mgoin64@gmail.com>
Date: Thu, 25 Sep 2025 21:42:45 -0400
Subject: [PATCH 1/6] Fix FlashInfer AOT in release docker image

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 .buildkite/release-pipeline.yaml |  2 +-
 docker/Dockerfile                | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 8c6ef7817aaf..7677d783fabc 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
   # Add job to create multi-arch manifest
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 034f73736ca7..4daa69e0dc5a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -404,16 +404,14 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
                 FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
             fi
             echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+            export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
             # Build AOT kernels
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer.aot
+            python3 -m flashinfer.aot
             # Install with no-build-isolation since we already built AOT kernels
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                uv pip install --system --no-build-isolation . \
+            uv pip install --system --no-build-isolation . \
                 --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
             # Download pre-compiled cubins
-            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
+            python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
         else
             echo "🏗️  Installing FlashInfer without AOT compilation in JIT mode"
             uv pip install --system . \

From a07651094beda82be6a6e00e61fa4fdffd2c0113 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 25 Sep 2025 22:01:53 -0600
Subject: [PATCH 2/6] Add installation of cuda-python in Dockerfile

---
 docker/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 4daa69e0dc5a..78b26f75d496 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -406,6 +406,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
             export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
             # Build AOT kernels
+            uv pip install --system cuda-python
             python3 -m flashinfer.aot
             # Install with no-build-isolation since we already built AOT kernels
             uv pip install --system --no-build-isolation . \

From 03bf75908f8e4a86188630be43db56ca8d0ae424 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 25 Sep 2025 22:03:16 -0600
Subject: [PATCH 3/6] Pass TORCH_CUDA_ARCH_LIST to FlashInfer commands

---
 docker/Dockerfile | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 78b26f75d496..96efe73b51e1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -407,12 +407,15 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
             # Build AOT kernels
             uv pip install --system cuda-python
-            python3 -m flashinfer.aot
+            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+                python3 -m flashinfer.aot
             # Install with no-build-isolation since we already built AOT kernels
-            uv pip install --system --no-build-isolation . \
+            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+                uv pip install --system --no-build-isolation . \
                 --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
             # Download pre-compiled cubins
-            python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
+            TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+                python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
         else
             echo "🏗️  Installing FlashInfer without AOT compilation in JIT mode"
             uv pip install --system . \

From ff2f1574d759a3e8f7d8b241f17b31109be14414 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 26 Sep 2025 08:03:29 -0600
Subject: [PATCH 4/6] Specify cuda-python version in Dockerfile

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 96efe73b51e1..8fabff7dcfa3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -406,7 +406,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
             export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
             # Build AOT kernels
-            uv pip install --system cuda-python
+            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2)
             TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                 python3 -m flashinfer.aot
             # Install with no-build-isolation since we already built AOT kernels

From 410cb52b97200c252360409993960b4f65dd201e Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 26 Sep 2025 08:54:43 -0600
Subject: [PATCH 5/6] Install pynvml alongside cuda-python before FlashInfer AOT build

---
 docker/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 8fabff7dcfa3..5baf833722c5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -405,8 +405,9 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             fi
             echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
             export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
+            # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
+            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1)
             # Build AOT kernels
-            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2)
             TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                 python3 -m flashinfer.aot
             # Install with no-build-isolation since we already built AOT kernels

From 0919a24f805d9da0989dd607ecde725e89b1568e Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 26 Sep 2025 10:28:49 -0600
Subject: [PATCH 6/6] Add nvidia-nvshmem-cu package installation

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 5baf833722c5..c0f55a7eeba0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -406,7 +406,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
             echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
             export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
             # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
-            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1)
+            uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
             # Build AOT kernels
             TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
                 python3 -m flashinfer.aot