Commit 76459c9

build(pypi): add cu128 windows build

2 parents: c8474a0 + 85a0fea

File tree: 5 files changed, +80 -42 lines

.github/workflows/cuda12.8-whl-release.yml

Lines changed: 37 additions & 0 deletions

@@ -53,11 +53,48 @@ jobs:
           retention-days: 1
           name: linux-${{ matrix.pyver }}
 
+  windows-build:
+    strategy:
+      matrix:
+        pyver: ['3.9', '3.10', '3.11', '3.12', '3.13']
+    runs-on: windows-latest
+    steps:
+      - name: Set git for windows
+        run: |
+          git config --global core.longpaths true
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Set up python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.pyver }}
+      - name: Install python packages
+        run: |
+          pip install build change-wheel-version
+      - name: Setup CUDA Toolkit
+        id: cuda-toolkit
+        shell: pwsh
+        run: ./builder/windows/setup_cuda.ps1
+        env:
+          INPUT_CUDA_VERSION: '12.8.1'
+      - name: Build wheel
+        run: |
+          python -m build --wheel -o build/wheel
+          Get-ChildItem -Path "build" -Filter "*.whl" | ForEach-Object { change_wheel_version $_.FullName --local-version cu128 --delete-old-wheel }
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: build/wheel/*
+          retention-days: 1
+          name: windows-${{ matrix.pyver }}
+
   publish:
     runs-on: ubuntu-latest
     environment: 'prod'
     needs:
       - linux-build
+      - windows-build
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3

.github/workflows/linux-x64-gpu.yml

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cudaver: [11.8, 12.4]
+        cudaver: [11.8, 12.4, 12.8]
     name: cuda-${{ matrix.cudaver }}
     runs-on: ubuntu-latest
     steps:
.github/workflows/windows-x64-gpu.yml

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        cudaver: [11.8.0, 12.1.0]
+        cudaver: [11.8.0, 12.5.0, 12.8.1]
     name: cuda-${{ matrix.cudaver }}
     runs-on: windows-latest
     steps:

builder/windows/setup_cuda.ps1

Lines changed: 2 additions & 0 deletions

@@ -26,6 +26,8 @@ if ($CUDA_VERSION_FULL -eq "12.1.0") {
     $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe"
 } elseif ($CUDA_VERSION_FULL -eq "12.5.0") {
     $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.85_windows.exe"
+} elseif ($CUDA_VERSION_FULL -eq "12.8.1") {
+    $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_572.61_windows.exe"
 } else {
     Write-Output "Unsupported CUDA version specified"
     exit 1

src/turbomind/kernels/gemm/scaled_gmma_fp8_sm90.h

Lines changed: 39 additions & 40 deletions

@@ -14,45 +14,44 @@ namespace turbomind::gemm {
 template<int TILE_M, int TILE_N, int TILE_K, int BATCH_M, int BATCH_N, int PIPE_M, int PIPE_N>
 struct ScaledGmmaFP8_TN {
 
-    static constexpr auto select_gmma_operation()
-    {
-        static_assert(TILE_M % (BATCH_M * PIPE_M) == 0);
-        static_assert(TILE_N % (BATCH_N * PIPE_N) == 0);
-
-        constexpr int M = TILE_M / (BATCH_M * PIPE_M);
-        constexpr int N = TILE_N / (BATCH_N * PIPE_N);
+    template<int tile_m  = TILE_M,
+             int tile_n  = TILE_N,
+             int batch_m = BATCH_M,
+             int batch_n = BATCH_N,
+             int pipe_m  = PIPE_M,
+             int pipe_n  = PIPE_N>
+    struct select_gmma_operation {
+        static constexpr int M = tile_m / (batch_m * pipe_m);
+        static constexpr int N = tile_n / (batch_n * pipe_n);
 
+        static_assert(tile_m % (batch_m * pipe_m) == 0);
+        static_assert(tile_n % (batch_n * pipe_n) == 0);
         static_assert(M % 64 == 0);
 
-        using namespace cute::SM90::GMMA;
-
-        if constexpr (N % 256 == 0) {
-            return MMA_64x256x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else if constexpr (N % 224 == 0) {
-            return MMA_64x224x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else if constexpr (N % 192 == 0) {
-            return MMA_64x192x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else if constexpr (N % 160 == 0) {
-            return MMA_64x160x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else if constexpr (N % 128 == 0) {
-            return MMA_64x128x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else if constexpr (N % 96 == 0) {
-            return MMA_64x96x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else if constexpr (N % 64 == 0) {
-            return MMA_64x64x32_F32E4M3E4M3_SS_TN<>{};
-        }
-        else {
-            static_assert(N == 0, "unsupported configuration");
-        }
-    }
+        using type = std::conditional_t<
+            N % 256 == 0,
+            cute::SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN<>,
+            std::conditional_t<
+                N % 224 == 0,
+                cute::SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN<>,
+                std::conditional_t<
+                    N % 192 == 0,
+                    cute::SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN<>,
+                    std::conditional_t<
+                        N % 160 == 0,
+                        cute::SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN<>,
+                        std::conditional_t<
+                            N % 128 == 0,
+                            cute::SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN<>,
+                            std::conditional_t<N % 96 == 0,
+                                               cute::SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN<>,
+                                               std::conditional_t<N % 64 == 0,
+                                                                  cute::SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN<>,
+                                                                  void>>>>>>>;
+        static_assert(!std::is_same_v<type, void>, "unsupported configuration");
+    };
 
-    using Operation = decltype(select_gmma_operation());
+    using Operation = select_gmma_operation<>::type;
 
     static constexpr typename cute::MMA_Traits<Operation>::Shape_MNK OP_Shape{};
@@ -242,11 +241,11 @@ struct ScaledGmmaFP8_TN {
                             int n = ((i_n * PIPE_N) + p_n * BATCH_N) + b_n;
                             func(frag[i_m][i_n][p_m][p_n][b_m][b_n], m, n);
                         }  // BATCH_N
-                    }  // BATCH_M
-                }  // PIPE_N
-            }  // PIPE_M
-        }  // ITER_N
-    }  // ITER_M
+                    }      // BATCH_M
+                }          // PIPE_N
+            }              // PIPE_M
+        }                  // ITER_N
+    }                      // ITER_M
     }
 
     template<class Frag, class Func>

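The header change above replaces an `if constexpr` chain that returned operation objects from a `static constexpr auto` function with a `select_gmma_operation` trait built from nested `std::conditional_t`, presumably to sidestep compiler quirks now that the same commit adds MSVC-built Windows wheels. Below is a minimal standalone sketch of that type-selection pattern under stated assumptions: `OpTag<N>` and `select_op` are hypothetical stand-ins for the `cute::SM90::GMMA::MMA_64xNx32_F32E4M3E4M3_SS_TN` operation types and the trait in the diff, with only three of the seven tile widths kept for brevity.

// Sketch of selecting a type by divisibility with nested std::conditional_t.
// OpTag<N> is a hypothetical stand-in for the GMMA operation types above.
#include <cstdio>
#include <type_traits>

template<int N>
struct OpTag {
    static constexpr int n = N;  // N-dimension tile width this "op" covers
};

template<int N>
struct select_op {
    // Try the widest supported tile first, then fall through to narrower
    // ones; `void` marks "no match".
    using type = std::conditional_t<
        N % 256 == 0, OpTag<256>,
        std::conditional_t<
            N % 128 == 0, OpTag<128>,
            std::conditional_t<N % 64 == 0, OpTag<64>, void>>>;

    // Fires only when select_op<N> is actually instantiated with an
    // unsupported N, mirroring the "unsupported configuration" assert
    // in the diff above.
    static_assert(!std::is_same_v<type, void>, "unsupported configuration");
};

int main() {
    // 192 % 256 != 0 and 192 % 128 != 0, but 192 % 64 == 0 -> OpTag<64>.
    using Op = select_op<192>::type;
    std::printf("selected N tile: %d\n", Op::n);  // prints 64
}

Like the diff's `using Operation = select_gmma_operation<>::type;`, the selected type is pulled out through a member alias rather than deduced from a function's return value.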