Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -624,8 +624,6 @@ struct vk_device_struct {
// floor(log2(maxComputeWorkGroupInvocations))
uint32_t max_workgroup_size_log2 {};

bool flash_attention_fp16;

bool coopmat_support;
bool coopmat_acc_f32_support {};
bool coopmat_acc_f16_support {};
Expand Down Expand Up @@ -2978,11 +2976,15 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
}
}

static vk_fa_pipeline_state get_fa_pipeline_state(const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc,
static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc,
bool use_mask, bool use_mask_opt, bool use_logit_softcap) {
const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary &&
(device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2);

uint32_t flags = (use_mask_opt ? 1 : 0) |
(use_mask ? 2 : 0) |
(use_logit_softcap ? 4 : 0);
(use_logit_softcap ? 4 : 0) |
(old_amd_windows ? 8 : 0);

const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size;

Expand Down Expand Up @@ -3384,7 +3386,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
} \
}

if (device->flash_attention_fp16) {
if (device->fp16) {
CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, )
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
Expand Down Expand Up @@ -5423,10 +5425,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->mmvq_mode = 1;
}

// Driver issues with older AMD GPUs on Windows, see https://github.com/ggml-org/llama.cpp/pull/19625#issuecomment-3940840613
const bool is_amd_proprietary_gcn = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture == AMD_GCN && device->driver_id == vk::DriverId::eAmdProprietary;
device->flash_attention_fp16 = device->fp16 && !is_amd_proprietary_gcn;

return device;
}

Expand Down Expand Up @@ -8567,7 +8565,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
const uint32_t Br = params.block_rows;
const uint32_t Bc = params.block_cols;

const uint32_t float_type_size = device->flash_attention_fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);

// tmpsh is overestimated slightly
const uint32_t tmpsh = wg_size * sizeof(float);
Expand Down Expand Up @@ -8690,7 +8688,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
uint32_t workgroups_y = (uint32_t)neq2;
uint32_t workgroups_z = (uint32_t)neq3;

const bool f32acc = !ctx->device->flash_attention_fp16 || dst->op_params[3] == GGML_PREC_F32;
const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32;

// For scalar/coopmat1 FA, we can use the "large" size to accommodate gqa.
// For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
Expand Down Expand Up @@ -8745,7 +8743,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

// Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768;
vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(tuning_params, HSK, HSV, aligned, f32acc,
vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
mask != nullptr, use_mask_opt, logit_softcap != 0);

vk_pipeline pipeline = nullptr;
Expand Down
9 changes: 8 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,14 @@ void main() {

if (SubGroupSize > 0) {
[[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) {
Of[r][d] += subgroupShuffleXor(Of[r][d], s);
if (!OLD_AMD_WINDOWS) {
Of[r][d] += subgroupShuffleXor(Of[r][d], s);
} else {
// Something about f16vec4 subgroupShuffleXor is broken on AMD Windows RDNA2 and below.
// Shuffle full vec4 as workaround.
// See https://github.com/ggml-org/llama.cpp/issues/19881#issuecomment-3958643697
Of[r][d] += FLOAT_TYPEV4(subgroupShuffleXor(vec4(Of[r][d]), s));
}
}
if (row_split == 1) {
barrier();
Expand Down
7 changes: 4 additions & 3 deletions ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ layout (constant_id = 9) const uint32_t SHMEM_STAGING = 0;
layout (constant_id = 10) const uint32_t Flags = 0;
layout (constant_id = 11) const uint32_t LIMIT_OCCUPANCY_SHMEM = 0;

const bool USE_MASK_OPT = (Flags & 1) != 0;
const bool MASK_ENABLE = (Flags & 2) != 0;
const bool LOGIT_SOFTCAP = (Flags & 4) != 0;
const bool USE_MASK_OPT = (Flags & 1) != 0;
const bool MASK_ENABLE = (Flags & 2) != 0;
const bool LOGIT_SOFTCAP = (Flags & 4) != 0;
const bool OLD_AMD_WINDOWS = (Flags & 8) != 0;

// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
const uint32_t HSK_pad = (HSK + 15) & ~15;
Expand Down
Loading