Double the tile texel column count for the 8-bit matmul op to improve performance. (#15192)

trivedivivek · facebook-github-bot · commit c3d692ae2c5a · 2025-10-16T09:25:56.000-07:00
Summary:

### Summary

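This change doubles the tile texel column count (`TILE_TXCOLS`, 1 → 2) for the 8-bit matrix multiplication operation to improve performance. The default `out_tile_ntxcols` in `QuantizedLinearQCSNW.cpp` is updated from 1 to 2 to match.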

Differential Revision: D84679398
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl
@@ -98,12 +98,17 @@ void main() {
     // Preload weight tensor
     [[unroll]] for (int r = 0; r < 4; r++) {
       $if QUANT_NBITS == 4:
+        $if WEIGHT_STORAGE == "buffer":
+          u8vec4 packed_weight_tex;
+        $else:
+          uvec4 packed_weight_tex;
+
         $for c in range(0, TILE_TXCOLS, 2):
           $if WEIGHT_STORAGE == "buffer":
             qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol;
-            const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}]
+            packed_weight_tex = t_weight[qmat2_bufi + ${c}]
           $else:
-            const uvec4 packed_weight_tex = texelFetch(
+            packed_weight_tex = texelFetch(
               t_weight, ivec2(weight_txcol + ${c}, pos + r), 0);
 
           qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml
@@ -12,7 +12,7 @@ linear_qcsnw_coop:
     WEIGHT_STORAGE: texture2d
     SCALES_STORAGE: texture2d
     TILE_ROWS: 4
-    TILE_TXCOLS: 1
+    TILE_TXCOLS: 2
     QUANT_NBITS: 8
   generate_variant_forall:
     TILE_ROWS:
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
@@ -106,12 +106,17 @@ void main() {
     for (int r = 0; r < 4; r++) {
       VEC4_T qmat2[TILE_TXCOLS];
       $if QUANT_NBITS == 4:
+        $if WEIGHT_STORAGE == "buffer":
+          u8vec4 packed_weight_tex;
+        $else:
+          uvec4 packed_weight_tex;
+
         $for c in range(0, TILE_TXCOLS, 2):
           $if WEIGHT_STORAGE == "buffer":
             qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol;
-            const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}]
+            packed_weight_tex = t_weight[qmat2_bufi + ${c}]
           $else:
-            const uvec4 packed_weight_tex = texelFetch(
+            packed_weight_tex = texelFetch(
               t_weight, u16vec2(weight_txcol + ${c}, pos + r), 0);
 
           qmat2[${c}] = (VEC4_T(packed_weight_tex >> 4) - 8.0);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml
@@ -12,7 +12,7 @@ linear_qcsnw_tiled:
     WEIGHT_STORAGE: texture2d
     SCALES_STORAGE: texture2d
     TILE_ROWS: 4
-    TILE_TXCOLS: 1
+    TILE_TXCOLS: 2
     QUANT_NBITS: 8
   generate_variant_forall:
     TILE_ROWS:
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp
@@ -73,7 +73,7 @@ utils::uvec3 linear_qcsnw_tiled_global_wg_size(
   }
 
   // Number of output texels in the output tile
-  uint32_t out_tile_ntxcols = 1;
+  uint32_t out_tile_ntxcols = 2;
   if (quant_nbits == 4) {
     out_tile_ntxcols = 2;
   }
@@ -324,7 +324,7 @@ void add_linear_qcsnw_tiled_node(
   }
 
   // Number of output texels in the output tile
-  uint32_t out_tile_ntxcols = 1;
+  uint32_t out_tile_ntxcols = 2;
   if (quant_nbits == 4) {
     out_tile_ntxcols = 2;
   }

Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ utils::uvec3 linear_qcsnw_tiled_global_wg_size(`
`73`	`73`	`}`
`74`	`74`
`75`	`75`	`// Number of output texels in the output tile`
`76`		`- uint32_t out_tile_ntxcols = 1;`
	`76`	`+ uint32_t out_tile_ntxcols = 2;`
`77`	`77`	`if (quant_nbits == 4) {`
`78`	`78`	`out_tile_ntxcols = 2;`
`79`	`79`	`}`
`@@ -324,7 +324,7 @@ void add_linear_qcsnw_tiled_node(`
`324`	`324`	`}`
`325`	`325`
`326`	`326`	`// Number of output texels in the output tile`
`327`		`- uint32_t out_tile_ntxcols = 1;`
	`327`	`+ uint32_t out_tile_ntxcols = 2;`
`328`	`328`	`if (quant_nbits == 4) {`
`329`	`329`	`out_tile_ntxcols = 2;`
`330`	`330`	`}`