diff --git a/.github/workflows/pr-test-sgl-kernel.yml b/.github/workflows/pr-test-sgl-kernel.yml
index c87f8d548b2..d86e3470162 100644
--- a/.github/workflows/pr-test-sgl-kernel.yml
+++ b/.github/workflows/pr-test-sgl-kernel.yml
@@ -35,9 +35,14 @@ jobs:
     runs-on: sgl-kernel-build-node
     strategy:
       matrix:
-        python-version: ['3.9']
-        cuda-version: ['12.4']
-
+        include:
+          - python-version: '3.9'
+            cuda-version: '11.8'
+          - python-version: '3.9'
+            cuda-version: '12.4'
+          - python-version: '3.9'
+            cuda-version: '12.8'
+    name: Build Wheel (CUDA ${{ matrix.cuda-version }})
     steps:
       - name: Cleanup
         run: |
@@ -52,13 +57,14 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
         run: |
           cd sgl-kernel
           chmod +x ./build.sh
           ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
 
-      - name: Upload artifacts
+      - name: Upload artifacts (only for CUDA 12.4)
+        if: ${{ matrix.cuda-version == '12.4' }}
         uses: actions/upload-artifact@v4
         with:
           name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
@@ -128,7 +134,7 @@ jobs:
           pip3 uninstall sgl-kernel -y
 
   finish:
-    needs: [unit-test, mla-test, lint]
+    needs: [unit-test, mla-test, lint, build-wheels]
     runs-on: ubuntu-latest
     steps:
       - name: Check all dependent job statuses
diff --git a/.github/workflows/release-whl-kernel.yml b/.github/workflows/release-whl-kernel.yml
index 631551475fe..ebfc4b80237 100644
--- a/.github/workflows/release-whl-kernel.yml
+++ b/.github/workflows/release-whl-kernel.yml
@@ -14,7 +14,7 @@ on:
 jobs:
   build-wheels:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: ubuntu-latest
+    runs-on: sgl-kernel-build-node
     strategy:
       matrix:
         python-version: ['3.9']
diff --git a/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu b/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
index da6ea2a08de..46ad440c58e 100644
--- a/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
+++ b/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
@@ -25,6 +25,8 @@ limitations under the License.
 #include <device/sm100_mla.hpp>
 #include <kernel/sm100_mla_tile_scheduler.hpp>
 
+#if defined CUDA_VERSION && CUDA_VERSION >= 12040  // 12040 == CUDA 12.4; NOTE(review): no #else stubs visible in this hunk -- confirm older-toolkit builds still link
+
 #define CUTLASS_CHECK(status)                                                       \
   {                                                                                 \
     cutlass::Status error = status;                                                 \
@@ -205,3 +207,5 @@ int64_t cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches,
 
   return MlaSm100Type::Fmha::get_workspace_size(arguments);
 }
+
+#endif  // defined CUDA_VERSION && CUDA_VERSION >= 12040