Enable int4 tile-packed quantization CI for gemma3 (#15332)

Gasoonjia · web-flow · commit 6e08aefd7698 · 2025-10-22T08:15:55.000-07:00
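Enable the "quantized-int4-tile-packed" configuration for gemma3 in
.github/workflows/cuda.yml:

- Add the "quantized-int4-tile-packed" entry to both the `quant` matrix and
  the `format` matrix (previously commented out behind a TODO).
- Fix the "non-quantized" entry of the `quant` matrix, whose artifact was
  "voxtral-cuda-export" instead of "gemma3-cuda-export".
- Upload the artifact named by `matrix.quant.artifact` instead of the
  hard-coded "gemma3-cuda-export", so each matrix entry uploads its own
  artifact.

The "quantized-int4-weight-only" configuration stays disabled (see the TODO
comments in the workflow).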
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
@@ -176,12 +176,12 @@ jobs:
       matrix:
         quant:
           - name: "non-quantized"
-            artifact: "voxtral-cuda-export"
+            artifact: "gemma3-cuda-export"
             extra_args: ""
-          # TODO: enable gemma3 quantization
-          # - name: "quantized-int4-tile-packed"
-          #   artifact: "voxtral-cuda-quantized-int4-tile-packed"
-          #   extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          # TODO: enable int4-weight-only on gemma3.
           # - name: "quantized-int4-weight-only"
           #   artifact: "voxtral-cuda-quantized-int4-weight-only"
           #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
@@ -194,7 +194,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: gemma3-cuda-export
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -435,9 +435,9 @@ jobs:
         format:
           - name: "non-quantized"
             artifact: "gemma3-cuda-export"
-          # TODO: enable quantized gemma3.
-          # - name: "quantized-int4-tile-packed"
-          #   artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+          # TODO: enable int4-weight-only on gemma3.
           # - name: "quantized-int4-weight-only"
           #   artifact: "gemma3-cuda-quantized-int4-weight-only"
     with: