From 3beb73cd919c4b19b43afc945c1077888f9c95f1 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 1 Jun 2026 12:22:51 +0200 Subject: [PATCH 1/4] vulkan: add fwht support for Intel with shmem reduction --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++ ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp | 56 +++++++++++++++++-- .../vulkan-shaders/vulkan-shaders-gen.cpp | 1 + 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2a30fb95c61b..03b25d2893fc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5047,6 +5047,12 @@ static void ggml_vk_load_shaders(vk_device& device) { } ++idx; } + } else { + int idx = 0; + for (uint32_t n : {64, 128, 256, 512}) { + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { n }, 1); + ++idx; + } } const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp index 72059d4afc2d..8dc388f1f4e9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp @@ -1,13 +1,20 @@ #version 450 #extension GL_EXT_control_flow_attributes : require +#ifndef FWHT_SHMEM #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_shuffle : enable +#endif -layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; - +#ifdef FWHT_SHMEM +layout(constant_id = 0) const uint N = 128; +#else layout(constant_id = 0) const uint WARP_SIZE = 32; layout(constant_id = 1) const uint N = 128; +const uint EL_W = N / WARP_SIZE; +#endif + +layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; layout(push_constant) uniform parameter { @@ -20,15 +27,51 @@ layout(push_constant) uniform parameter layout(binding = 0, std430) readonly buffer A { float data_a[]; }; layout(binding = 1, std430) writeonly buffer D { float data_d[]; }; -const uint EL_W = N / WARP_SIZE; +#ifdef FWHT_SHMEM +shared float shmem[4 * N]; +#endif void main() { +#ifdef FWHT_SHMEM + const uint tid = gl_LocalInvocationID.x; + const uint shmem_base = gl_LocalInvocationID.y * N; + const uint row_id = gl_LocalInvocationID.y; +#else const uint lane = gl_SubgroupInvocationID; - for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID; - row < n_rows; - row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { + const uint row_id = gl_SubgroupID; +#endif + + for (uint base_row = gl_WorkGroupID.x * gl_WorkGroupSize.y; + base_row < n_rows; + base_row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { + const uint row = base_row + row_id; const uint row_offset = row * N; +#ifdef FWHT_SHMEM + if (row < n_rows) { + shmem[shmem_base + tid] = data_a[src_offset + row_offset + tid] * scale; + } + barrier(); + + [[unroll]] + for (uint h = 1; h < N; h <<= 1) { + const float val = shmem[shmem_base + tid]; + const float other = shmem[shmem_base + (tid ^ h)]; + barrier(); + shmem[shmem_base + tid] = (tid & h) == 0 ? val + other : other - val; + barrier(); + } + + if (row < n_rows) { + data_d[dst_offset + row_offset + tid] = shmem[shmem_base + tid]; + } + + barrier(); +#else + if (row >= n_rows) { + continue; + } + float reg[EL_W]; [[unroll]] @@ -65,5 +108,6 @@ void main() { for (uint i = 0; i < EL_W; ++i) { data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i]; } +#endif } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index de7dbec2c639..d65cd12b2874 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -957,6 +957,7 @@ void process_shaders() { string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}})); string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("fwht_f32", "fwht.comp", {}); + string_to_spv("fwht_shmem_f32", "fwht.comp", {{"FWHT_SHMEM", "1"}}); string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}})); string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); From 479618c927891463e92fe8d692eabe45f8d57c96 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 1 Jun 2026 15:16:19 +0200 Subject: [PATCH 2/4] don't use N as workgroup size --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +- ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp | 74 ++++++++++--------- 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 03b25d2893fc..4d59c4c8e76a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5050,7 +5050,8 @@ static void ggml_vk_load_shaders(vk_device& device) { } else { int idx = 0; for (uint32_t n : {64, 128, 256, 512}) { - ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { n }, 1); + const uint32_t block_size = std::min(device->subgroup_size, n); + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_shmem_f32", fwht_shmem_f32_len, fwht_shmem_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { block_size, n }, 1); ++idx; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp index 8dc388f1f4e9..a2069964adbe 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp @@ -6,13 +6,8 @@ #extension GL_KHR_shader_subgroup_shuffle : enable #endif -#ifdef FWHT_SHMEM -layout(constant_id = 0) const uint N = 128; -#else -layout(constant_id = 0) const uint WARP_SIZE = 32; +layout(constant_id = 0) const uint BLOCK_SIZE = 32; layout(constant_id = 1) const uint N = 128; -const uint EL_W = N / WARP_SIZE; -#endif layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; @@ -27,6 +22,8 @@ layout(push_constant) uniform parameter layout(binding = 0, std430) readonly buffer A { float data_a[]; }; layout(binding = 1, std430) writeonly buffer D { float data_d[]; }; +const uint EL_W = N / BLOCK_SIZE; + #ifdef FWHT_SHMEM shared float shmem[4 * N]; #endif @@ -37,7 +34,7 @@ void main() { const uint shmem_base = gl_LocalInvocationID.y * N; const uint row_id = gl_LocalInvocationID.y; #else - const uint lane = gl_SubgroupInvocationID; + const uint tid = gl_SubgroupInvocationID; const uint row_id = gl_SubgroupID; #endif @@ -47,51 +44,50 @@ void main() { const uint row = base_row + row_id; const uint row_offset = row * N; -#ifdef FWHT_SHMEM - if (row < n_rows) { - shmem[shmem_base + tid] = data_a[src_offset + row_offset + tid] * scale; - } - barrier(); - - [[unroll]] - for (uint h = 1; h < N; h <<= 1) { - const float val = shmem[shmem_base + tid]; - const float other = shmem[shmem_base + (tid ^ h)]; - barrier(); - shmem[shmem_base + tid] = (tid & h) == 0 ? val + other : other - val; - barrier(); - } - - if (row < n_rows) { - data_d[dst_offset + row_offset + tid] = shmem[shmem_base + tid]; - } - - barrier(); -#else +#ifndef FWHT_SHMEM if (row >= n_rows) { continue; } +#endif float reg[EL_W]; [[unroll]] for (uint i = 0; i < EL_W; ++i) { - reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale; + reg[i] = row < n_rows ? data_a[src_offset + row_offset + i * BLOCK_SIZE + tid] * scale : 0.0; } +#ifdef FWHT_SHMEM [[unroll]] - for (uint h = 1; h < WARP_SIZE; h <<= 1) { + for (uint h = 1; h < BLOCK_SIZE; h <<= 1) { + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + shmem[shmem_base + i * BLOCK_SIZE + tid] = reg[i]; + } + barrier(); + [[unroll]] + for (uint j = 0; j < EL_W; ++j) { + const float val = reg[j]; + const float other = shmem[shmem_base + j * BLOCK_SIZE + (tid ^ h)]; + reg[j] = (tid & h) == 0 ? val + other : other - val; + } + barrier(); + } +#else + [[unroll]] + for (uint h = 1; h < BLOCK_SIZE; h <<= 1) { [[unroll]] for (uint j = 0; j < EL_W; ++j) { const float val = reg[j]; const float val2 = subgroupShuffleXor(val, h); - reg[j] = (lane & h) == 0 ? val + val2 : val2 - val; + reg[j] = (tid & h) == 0 ? val + val2 : val2 - val; } } +#endif [[unroll]] - for (uint h = WARP_SIZE; h < N; h <<= 1) { - const uint step = h / WARP_SIZE; + for (uint h = BLOCK_SIZE; h < N; h <<= 1) { + const uint step = h / BLOCK_SIZE; [[unroll]] for (uint j = 0; j < EL_W; j += 2 * step) { [[unroll]] @@ -104,10 +100,16 @@ void main() { } } - [[unroll]] - for (uint i = 0; i < EL_W; ++i) { - data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i]; +#ifdef FWHT_SHMEM + if (row < n_rows) { +#endif + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + data_d[dst_offset + row_offset + i * BLOCK_SIZE + tid] = reg[i]; + } +#ifdef FWHT_SHMEM } + barrier(); #endif } } From 6833f867d54b840b2b7f42c0b9029883dc5409ab Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 1 Jun 2026 17:00:26 +0200 Subject: [PATCH 3/4] disable subgroup shuffle on MoltenVK AMD --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4d59c4c8e76a..dd0c017e87d1 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5583,6 +5583,11 @@ static vk_device ggml_vk_get_device(size_t idx) { #endif device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); +#ifdef __APPLE__ + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_shuffle = false; + } +#endif device->subgroup_clustered = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eClustered); From e6d79f12985859ff245069251fcb2300fed60e30 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 2 Jun 2026 08:40:17 +0200 Subject: [PATCH 4/4] disable fwht shader on Intel Windows due to driver bug --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index dd0c017e87d1..4cbfe245a019 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5047,7 +5047,8 @@ static void ggml_vk_load_shaders(vk_device& device) { } ++idx; } - } else { + } else if (device->driver_id != vk::DriverId::eIntelProprietaryWindows) { + // Disabled on Intel Windows due to a driver bug: https://github.com/ggml-org/llama.cpp/pull/23964#issuecomment-4598226147 int idx = 0; for (uint32_t n : {64, 128, 256, 512}) { const uint32_t block_size = std::min(device->subgroup_size, n);