Skip to content

Commit c2d142d

Browse files
committed
build(pypi): add cu128 windows build
2 parents c8474a0 + 85a0fea commit c2d142d

File tree

5 files changed

+85
-14
lines changed

5 files changed

+85
-14
lines changed

.github/workflows/cuda12.8-whl-release.yml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,48 @@ jobs:
5353
retention-days: 1
5454
name: linux-${{ matrix.pyver }}
5555

56+
windows-build:
57+
strategy:
58+
matrix:
59+
pyver: ['3.9', '3.10', '3.11', '3.12', '3.13']
60+
runs-on: windows-latest
61+
steps:
62+
- name: Set git for windows
63+
run: |
64+
git config --global core.longpaths true
65+
- name: Checkout repository
66+
uses: actions/checkout@v3
67+
- name: Set up python
68+
uses: actions/setup-python@v4
69+
with:
70+
python-version: ${{ matrix.pyver }}
71+
- name: Install python packages
72+
run: |
73+
pip install build change-wheel-version
74+
- name: Setup CUDA Toolkit
75+
id: cuda-toolkit
76+
shell: pwsh
77+
run: ./builder/windows/setup_cuda.ps1
78+
env:
79+
INPUT_CUDA_VERSION: '12.8.1'
80+
- name: Build wheel
81+
run: |
82+
python -m build --wheel -o build/wheel
83+
Get-ChildItem -Path "build" -Filter "*.whl" | ForEach-Object { change_wheel_version $_.FullName --local-version cu128 --delete-old-wheel }
84+
- name: Upload Artifacts
85+
uses: actions/upload-artifact@v4
86+
with:
87+
if-no-files-found: error
88+
path: build/wheel/*
89+
retention-days: 1
90+
name: windows-${{ matrix.pyver }}
91+
5692
publish:
5793
runs-on: ubuntu-latest
5894
environment: 'prod'
5995
needs:
6096
- linux-build
97+
- windows-build
6198
steps:
6299
- name: Checkout repository
63100
uses: actions/checkout@v3

.github/workflows/linux-x64-gpu.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
strategy:
3030
fail-fast: false
3131
matrix:
32-
cudaver: [11.8, 12.4]
32+
cudaver: [11.8, 12.4, 12.8]
3333
name: cuda-${{ matrix.cudaver }}
3434
runs-on: ubuntu-latest
3535
steps:

.github/workflows/windows-x64-gpu.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
strategy:
3030
fail-fast: false
3131
matrix:
32-
cudaver: [11.8.0, 12.1.0]
32+
cudaver: [11.8.0, 12.5.0, 12.8.1]
3333
name: cuda-${{ matrix.cudaver }}
3434
runs-on: windows-latest
3535
steps:

builder/windows/setup_cuda.ps1

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ if ($CUDA_VERSION_FULL -eq "12.1.0") {
2626
$downloadUrl = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe"
2727
} elseif ($CUDA_VERSION_FULL -eq "12.5.0") {
2828
$downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.85_windows.exe"
29+
} elseif ($CUDA_VERSION_FULL -eq "12.8.1") {
30+
$downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_572.61_windows.exe"
2931
} else {
3032
Write-Output "Unsupported CUDA version specified"
3133
exit 1

src/turbomind/kernels/gemm/scaled_gmma_fp8_sm90.h

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,44 @@
1111

1212
namespace turbomind::gemm {
1313

14+
namespace {
15+
16+
template<int tile>
17+
struct select_gmma_operation;
18+
template<>
19+
struct select_gmma_operation<256> {
20+
using type = cute::SM90::GMMA::MMA_64x256x32_F32E4M3E4M3_SS_TN<>;
21+
};
22+
template<>
23+
struct select_gmma_operation<224> {
24+
using type = cute::SM90::GMMA::MMA_64x224x32_F32E4M3E4M3_SS_TN<>;
25+
};
26+
template<>
27+
struct select_gmma_operation<192> {
28+
using type = cute::SM90::GMMA::MMA_64x192x32_F32E4M3E4M3_SS_TN<>;
29+
};
30+
template<>
31+
struct select_gmma_operation<160> {
32+
using type = cute::SM90::GMMA::MMA_64x160x32_F32E4M3E4M3_SS_TN<>;
33+
};
34+
template<>
35+
struct select_gmma_operation<128> {
36+
using type = cute::SM90::GMMA::MMA_64x128x32_F32E4M3E4M3_SS_TN<>;
37+
};
38+
template<>
39+
struct select_gmma_operation<96> {
40+
using type = cute::SM90::GMMA::MMA_64x96x32_F32E4M3E4M3_SS_TN<>;
41+
};
42+
template<>
43+
struct select_gmma_operation<64> {
44+
using type = cute::SM90::GMMA::MMA_64x64x32_F32E4M3E4M3_SS_TN<>;
45+
};
46+
47+
} // namespace
48+
1449
template<int TILE_M, int TILE_N, int TILE_K, int BATCH_M, int BATCH_N, int PIPE_M, int PIPE_N>
1550
struct ScaledGmmaFP8_TN {
16-
17-
static constexpr auto select_gmma_operation()
51+
static constexpr auto select_gmma_size()
1852
{
1953
static_assert(TILE_M % (BATCH_M * PIPE_M) == 0);
2054
static_assert(TILE_N % (BATCH_N * PIPE_N) == 0);
@@ -24,35 +58,33 @@ struct ScaledGmmaFP8_TN {
2458

2559
static_assert(M % 64 == 0);
2660

27-
using namespace cute::SM90::GMMA;
28-
2961
if constexpr (N % 256 == 0) {
30-
return MMA_64x256x32_F32E4M3E4M3_SS_TN<>{};
62+
return 256;
3163
}
3264
else if constexpr (N % 224 == 0) {
33-
return MMA_64x224x32_F32E4M3E4M3_SS_TN<>{};
65+
return 224;
3466
}
3567
else if constexpr (N % 192 == 0) {
36-
return MMA_64x192x32_F32E4M3E4M3_SS_TN<>{};
68+
return 192;
3769
}
3870
else if constexpr (N % 160 == 0) {
39-
return MMA_64x160x32_F32E4M3E4M3_SS_TN<>{};
71+
return 160;
4072
}
4173
else if constexpr (N % 128 == 0) {
42-
return MMA_64x128x32_F32E4M3E4M3_SS_TN<>{};
74+
return 128;
4375
}
4476
else if constexpr (N % 96 == 0) {
45-
return MMA_64x96x32_F32E4M3E4M3_SS_TN<>{};
77+
return 96;
4678
}
4779
else if constexpr (N % 64 == 0) {
48-
return MMA_64x64x32_F32E4M3E4M3_SS_TN<>{};
80+
return 64;
4981
}
5082
else {
5183
static_assert(N == 0, "unsupported configuration");
5284
}
5385
}
5486

55-
using Operation = decltype(select_gmma_operation());
87+
using Operation = typename select_gmma_operation<select_gmma_size()>::type;
5688

5789
static constexpr typename cute::MMA_Traits<Operation>::Shape_MNK OP_Shape{};
5890

0 commit comments

Comments (0)