From 3beb73cd919c4b19b43afc945c1077888f9c95f1 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 1 Jun 2026 12:22:51 +0200
Subject: [PATCH 1/4] vulkan: add fwht support for Intel with shmem reduction

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          |  6 ++
 ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp | 56 +++++++++++++++++--
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |  1 +
 3 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 2a30fb95c61b..03b25d2893fc 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5047,6 +5047,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
             }
             ++idx;
         }
+    } else {
+        int idx = 0;
+        for (uint32_t n : {64, 128, 256, 512}) {
+            ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { n }, 1);
+            ++idx;
+        }
     }
 
     const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
index 72059d4afc2d..8dc388f1f4e9 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
@@ -1,13 +1,20 @@
 #version 450
 
 #extension GL_EXT_control_flow_attributes : require
+#ifndef FWHT_SHMEM
 #extension GL_KHR_shader_subgroup_basic : enable
 #extension GL_KHR_shader_subgroup_shuffle : enable
+#endif
 
-layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
-
+#ifdef FWHT_SHMEM
+layout(constant_id = 0) const uint N = 128;
+#else
 layout(constant_id = 0) const uint WARP_SIZE = 32;
 layout(constant_id = 1) const uint N = 128;
+const uint EL_W = N / WARP_SIZE;
+#endif
+
+layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
 
 layout(push_constant) uniform parameter
 {
@@ -20,15 +27,51 @@ layout(push_constant) uniform parameter
 layout(binding = 0, std430) readonly buffer A { float data_a[]; };
 layout(binding = 1, std430) writeonly buffer D { float data_d[]; };
 
-const uint EL_W = N / WARP_SIZE;
+#ifdef FWHT_SHMEM
+shared float shmem[4 * N];
+#endif
 
 void main() {
+#ifdef FWHT_SHMEM
+    const uint tid = gl_LocalInvocationID.x;
+    const uint shmem_base = gl_LocalInvocationID.y * N;
+    const uint row_id = gl_LocalInvocationID.y;
+#else
     const uint lane = gl_SubgroupInvocationID;
-    for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID;
-            row < n_rows;
-            row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) {
+    const uint row_id = gl_SubgroupID;
+#endif
+
+    for (uint base_row = gl_WorkGroupID.x * gl_WorkGroupSize.y;
+            base_row < n_rows;
+            base_row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) {
+        const uint row = base_row + row_id;
         const uint row_offset = row * N;
 
+#ifdef FWHT_SHMEM
+        if (row < n_rows) {
+            shmem[shmem_base + tid] = data_a[src_offset + row_offset + tid] * scale;
+        }
+        barrier();
+
+        [[unroll]]
+        for (uint h = 1; h < N; h <<= 1) {
+            const float val   = shmem[shmem_base + tid];
+            const float other = shmem[shmem_base + (tid ^ h)];
+            barrier();
+            shmem[shmem_base + tid] = (tid & h) == 0 ? val + other : other - val;
+            barrier();
+        }
+
+        if (row < n_rows) {
+            data_d[dst_offset + row_offset + tid] = shmem[shmem_base + tid];
+        }
+
+        barrier();
+#else
+        if (row >= n_rows) {
+            continue;
+        }
+
         float reg[EL_W];
 
         [[unroll]]
@@ -65,5 +108,6 @@ void main() {
         for (uint i = 0; i < EL_W; ++i) {
             data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i];
         }
+#endif
     }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index de7dbec2c639..d65cd12b2874 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -957,6 +957,7 @@ void process_shaders() {
     string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
     string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("fwht_f32", "fwht.comp", {});
+    string_to_spv("fwht_shmem_f32", "fwht.comp", {{"FWHT_SHMEM", "1"}});
     string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
     string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

From 479618c927891463e92fe8d692eabe45f8d57c96 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 1 Jun 2026 15:16:19 +0200
Subject: [PATCH 2/4] don't use N as workgroup size

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          |  3 +-
 ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp | 74 ++++++++++---------
 2 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 03b25d2893fc..4d59c4c8e76a 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5050,7 +5050,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
     } else {
         int idx = 0;
         for (uint32_t n : {64, 128, 256, 512}) {
-            ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { n }, 1);
+            const uint32_t block_size = std::min(device->subgroup_size, n);
+            ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1);
             ++idx;
         }
     }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
index 8dc388f1f4e9..a2069964adbe 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
@@ -6,13 +6,8 @@
 #extension GL_KHR_shader_subgroup_shuffle : enable
 #endif
 
-#ifdef FWHT_SHMEM
-layout(constant_id = 0) const uint N = 128;
-#else
-layout(constant_id = 0) const uint WARP_SIZE = 32;
+layout(constant_id = 0) const uint BLOCK_SIZE = 32;
 layout(constant_id = 1) const uint N = 128;
-const uint EL_W = N / WARP_SIZE;
-#endif
 
 layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
 
@@ -27,6 +22,8 @@ layout(push_constant) uniform parameter
 layout(binding = 0, std430) readonly buffer A { float data_a[]; };
 layout(binding = 1, std430) writeonly buffer D { float data_d[]; };
 
+const uint EL_W = N / BLOCK_SIZE;
+
 #ifdef FWHT_SHMEM
 shared float shmem[4 * N];
 #endif
@@ -37,7 +34,7 @@ void main() {
     const uint shmem_base = gl_LocalInvocationID.y * N;
     const uint row_id = gl_LocalInvocationID.y;
 #else
-    const uint lane = gl_SubgroupInvocationID;
+    const uint tid = gl_SubgroupInvocationID;
     const uint row_id = gl_SubgroupID;
 #endif
 
@@ -47,51 +44,50 @@ void main() {
         const uint row = base_row + row_id;
         const uint row_offset = row * N;
 
-#ifdef FWHT_SHMEM
-        if (row < n_rows) {
-            shmem[shmem_base + tid] = data_a[src_offset + row_offset + tid] * scale;
-        }
-        barrier();
-
-        [[unroll]]
-        for (uint h = 1; h < N; h <<= 1) {
-            const float val   = shmem[shmem_base + tid];
-            const float other = shmem[shmem_base + (tid ^ h)];
-            barrier();
-            shmem[shmem_base + tid] = (tid & h) == 0 ? val + other : other - val;
-            barrier();
-        }
-
-        if (row < n_rows) {
-            data_d[dst_offset + row_offset + tid] = shmem[shmem_base + tid];
-        }
-
-        barrier();
-#else
+#ifndef FWHT_SHMEM
         if (row >= n_rows) {
             continue;
         }
+#endif
 
         float reg[EL_W];
 
         [[unroll]]
         for (uint i = 0; i < EL_W; ++i) {
-            reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale;
+            reg[i] = row < n_rows ? data_a[src_offset + row_offset + i * BLOCK_SIZE + tid] * scale : 0.0;
         }
 
+#ifdef FWHT_SHMEM
         [[unroll]]
-        for (uint h = 1; h < WARP_SIZE; h <<= 1) {
+        for (uint h = 1; h < BLOCK_SIZE; h <<= 1) {
+            [[unroll]]
+            for (uint i = 0; i < EL_W; ++i) {
+                shmem[shmem_base + i * BLOCK_SIZE + tid] = reg[i];
+            }
+            barrier();
+            [[unroll]]
+            for (uint j = 0; j < EL_W; ++j) {
+                const float val = reg[j];
+                const float other = shmem[shmem_base + j * BLOCK_SIZE + (tid ^ h)];
+                reg[j] = (tid & h) == 0 ? val + other : other - val;
+            }
+            barrier();
+        }
+#else
+        [[unroll]]
+        for (uint h = 1; h < BLOCK_SIZE; h <<= 1) {
             [[unroll]]
             for (uint j = 0; j < EL_W; ++j) {
                 const float val = reg[j];
                 const float val2 = subgroupShuffleXor(val, h);
-                reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
+                reg[j] = (tid & h) == 0 ? val + val2 : val2 - val;
             }
         }
+#endif
 
         [[unroll]]
-        for (uint h = WARP_SIZE; h < N; h <<= 1) {
-            const uint step = h / WARP_SIZE;
+        for (uint h = BLOCK_SIZE; h < N; h <<= 1) {
+            const uint step = h / BLOCK_SIZE;
             [[unroll]]
             for (uint j = 0; j < EL_W; j += 2 * step) {
                 [[unroll]]
@@ -104,10 +100,16 @@ void main() {
             }
         }
 
-        [[unroll]]
-        for (uint i = 0; i < EL_W; ++i) {
-            data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i];
+#ifdef FWHT_SHMEM
+        if (row < n_rows) {
+#endif
+            [[unroll]]
+            for (uint i = 0; i < EL_W; ++i) {
+                data_d[dst_offset + row_offset + i * BLOCK_SIZE + tid] = reg[i];
+            }
+#ifdef FWHT_SHMEM
         }
+        barrier();
 #endif
     }
 }

From 6833f867d54b840b2b7f42c0b9029883dc5409ab Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Mon, 1 Jun 2026 17:00:26 +0200
Subject: [PATCH 3/4] disable subgroup shuffle on MoltenVK AMD

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 4d59c4c8e76a..dd0c017e87d1 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5583,6 +5583,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
 #endif
         device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
                                    (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle);
+#ifdef __APPLE__
+        if (device->vendor_id == VK_VENDOR_ID_AMD) {
+            device->subgroup_shuffle = false;
+        }
+#endif
         device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
                                      (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered);
 

From e6d79f12985859ff245069251fcb2300fed60e30 Mon Sep 17 00:00:00 2001
From: Ruben Ortlam <rortlam@redhat.com>
Date: Tue, 2 Jun 2026 08:40:17 +0200
Subject: [PATCH 4/4] disable fwht shader on Intel Windows due to driver bug

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index dd0c017e87d1..4cbfe245a019 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5047,7 +5047,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
             }
             ++idx;
         }
-    } else {
+    } else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) {
+        // The fwht_shmem_f32 pipelines are not created on the Intel proprietary Windows driver due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147
         int idx = 0;
         for (uint32_t n : {64, 128, 256, 512}) {
             const uint32_t block_size = std::min(device->subgroup_size, n);