Updated set_rows optimization (added a vectorized f16 shader variant), fixed errors

Neha Abbas · Neha Abbas · commit 2c82462f4073 · 2025-10-17T22:08:50.000-07:00
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -248,7 +248,7 @@ struct webgpu_context_struct {
 
     webgpu_pipeline memset_pipeline;
     webgpu_pipeline mul_mat_pipeline[30][2];
-    webgpu_pipeline set_rows_pipeline;
+    webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized (0 for vectorized, 1 for non vectorized)
     webgpu_pipeline get_rows_pipeline[30];
     webgpu_pipeline get_rows_f32_no_vec_pipeline;
     webgpu_pipeline cpy_pipeline[2][2];          // src type, dst type
@@ -767,9 +767,20 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
     };
 
     size_t   max_wg_size = ctx->max_wg_size_x;
-    uint32_t wg_x        = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
+    // number of threads needed with vec4 = (total number of rows in matrix) * (number of elements in a row / 4)
+    uint32_t threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4);
+
+    webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][0];
+    // if not evenly divisible by 4, use the non-vectorized version
+    if (src->ne[0] % 4 != 0) {
+        pipeline = ctx->set_rows_pipeline[0][1];
+        // threads = number of rows
+        threads = src->ne[1] * src->ne[2] * src->ne[3];
+    }
+
+    uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size;
 
-    return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, error_bufs);
 }
 
 static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx,
@@ -1620,7 +1631,10 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
 }
 
 static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows",
+    // create_pipeline(device, pipeline, shader_code, label, constants): slot [0][1] gets "set_rows_f16", slot [0][0] gets "set_rows_f16_vec"
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][1], wgsl_set_rows_f16, "set_rows_f16",
+                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][0], wgsl_set_rows_f16_vec, "set_rows_f16_vec",
                                 ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
 }
 
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl
@@ -0,0 +1,163 @@
+#define(VARIANTS)
+
+[
+  {
+    "SHADER_SUFFIX": "f16_vec",
+    "REPLS": {
+      "TYPE" : "vec4<f32>",
+      "DST_TYPE": "vec4<f16>",
+      "BLOCK_SIZE": 4
+    },
+    "DECLS": ["F16_VEC"]
+  },
+  {
+    "SHADER_SUFFIX": "f16",
+    "REPLS": {
+      "TYPE" : "f32",
+      "DST_TYPE": "f16",
+      "BLOCK_SIZE": 1
+    },
+    "DECLS": ["F16"]
+  }
+]
+
+#end(VARIANTS)
+
+#define(DECLS)
+
+#decl(F16_VEC)
+fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { // vectorized variant: converts and stores one vector per call
+    let src_vec_index = (src_base + offset) / {{BLOCK_SIZE}}; // element index -> vector index; integer division drops any remainder
+    let dst_vec_index = (dst_base + offset) / {{BLOCK_SIZE}}; // same conversion on the destination side
+    dst[dst_vec_index] = vec4<f16>(src[src_vec_index]); // convert the source vector to vec4<f16> on store
+}
+#enddecl(F16_VEC)
+
+#decl(F16)
+fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { // scalar variant: converts and stores a single element per call
+    dst[dst_base + offset] = f16(src[src_base + offset]); // base and offset are used directly as element indices
+}
+#enddecl(F16)
+
+#end(DECLS)
+
+#define(SHADER)
+
+enable f16;
+
+DECLS
+
+@group(0) @binding(0)
+var<storage, read_write> src: array<{{TYPE}}>;
+
+@group(0) @binding(1)
+var<storage, read_write> idx: array<u32>;
+
+@group(0) @binding(2)
+var<storage, read_write> dst: array<{{DST_TYPE}}>;
+
+@group(0) @binding(3)
+var<storage, read_write> error: atomic<u32>;
+
+struct Params { // offsets, strides and shapes read by main through the uniform binding
+    offset_src: u32, // in elements
+    offset_idx: u32, // in elements; main doubles the resulting idx position before reading u32 words
+    offset_dst: u32, // in elements
+
+    // Strides (in elements)
+    stride_src1: u32, // multiplies the src row coordinate (i_src1)
+    stride_src2: u32, // multiplies i_src2
+    stride_src3: u32, // multiplies i_src3
+
+    stride_idx0: u32, // multiplies i_idx0 (same value as i_src1)
+    stride_idx1: u32, // multiplies i_idx1
+    stride_idx2: u32, // multiplies i_idx2
+
+    stride_dst1: u32, // multiplies the row index read from idx
+    stride_dst2: u32, // multiplies i_src2
+    stride_dst3: u32, // multiplies i_src3
+
+    // Shape of src
+    ne0: u32, // elements per row
+    n_rows: u32, // n_rows = ne1 = rows per slice
+    ne2: u32,
+    ne3: u32,
+
+    // Shape of idx
+    idx1: u32, // main wraps i_src2 modulo this
+    idx2: u32, // main wraps i_src3 modulo this
+};
+
+@group(0) @binding(4)
+var<uniform> params: Params;
+
+override wg_size: u32;
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // Each invocation copies one src row (scalar variant) or one vector within a src row (vectorized variant).
+    // Determine the total number of threads and this thread's flat row index i, based on mode
+    var max_threads: u32; // number of invocations that have work to do
+    var i: u32;
+    if {{BLOCK_SIZE}} > 1 {
+        // Vectorized: one thread per vector of elements
+        // total number of rows * number of vectors (threads) per row
+        max_threads = (params.n_rows * params.ne2 * params.ne3) * (params.ne0 / {{BLOCK_SIZE}});
+        
+        // the decomposition below expects i to be a row index, but here gid.x counts vectors,
+        // so divide by the number of vectors per row to recover the row
+        i = gid.x / (params.ne0 / {{BLOCK_SIZE}});
+    } else {
+        // Non-vectorized: one thread per row
+        // total number of rows (n_rows * ne2 * ne3)
+        max_threads = params.n_rows * params.ne2 * params.ne3;
+        i = gid.x; // i corresponds to the row
+    }
+
+    if (gid.x >= max_threads) {
+        return;
+    }
+
+    // Split the flat row index i into (i_src3, i_src2, i_src1); i_src1 varies fastest.
+    let i_src3 = i / (params.ne2 * params.n_rows);
+
+    i = i % (params.ne2 * params.n_rows);
+    let i_src2 = i / params.n_rows;
+    let i_src1 = i % params.n_rows;
+    // idx coordinates: the two outer src coordinates wrap modulo the idx extents.
+    let i_idx2 = i_src3 % params.idx2;
+    let i_idx1 = i_src2 % params.idx1;
+    let i_idx0 = i_src1;
+    // Position of the index entry in the u32 view of idx; the * 2 means each entry spans two u32 words.
+    let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2;
+    // NOTE(review): names look swapped - idx_high_val (first word) is used as the row index, idx_low_val (second word) is checked as the upper bits.
+    let idx_high_val = idx[idx_high];
+    let idx_low_val = idx[idx_high + 1];
+
+    if (idx_low_val != 0) {
+        // Upper bits of the index are not zero, so the output would be incorrect: set the error flag and skip this row
+        atomicStore(&error, 1);
+        return;
+    }
+    // Element offsets of the destination row (selected by the index value) and of the source row.
+    let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
+    let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
+
+    if {{BLOCK_SIZE}} > 1 {
+        // Vectorized: one thread per vector of elements
+
+        // offset, within the row, of the first element covered by this thread's vector
+        let element_offset = (gid.x % (params.ne0 / {{BLOCK_SIZE}})) * {{BLOCK_SIZE}};
+        copy_elements(i_src_row, i_dst_row, element_offset);
+
+    } else {
+        // Non-vectorized: copy the row one element at a time (the loop counter shadows the outer i)
+        for (var i: u32 = 0; i < params.ne0; i++) {
+            copy_elements(i_src_row, i_dst_row, i);
+        }
+    }
+
+    
+}
+
+#end(SHADER)
+
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl