Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7844,8 +7844,10 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
return false;
}

// General performance issue with q3_k and q6_k due to 2-byte alignment
if (src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) {
// q6_k only has 2-byte alignment which makes it somewhat problematic,
// using MMVQ is only a win on Intel.
bool mmvq_q6 = device->vendor_id == VK_VENDOR_ID_INTEL;
if (src0_type == GGML_TYPE_Q6_K && !mmvq_q6) {
return false;
}

Expand All @@ -7857,7 +7859,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
// Quantization overhead is not worth it for small k
switch (device->vendor_id) {
case VK_VENDOR_ID_NVIDIA:
if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) {
if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) {
return true;
}

Expand All @@ -7884,9 +7886,16 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
return true;
}
case VK_VENDOR_ID_INTEL:
if (device->architecture == vk_device_architecture::INTEL_XE2) {
if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) {
Comment thread
0cc4m marked this conversation as resolved.
return true;
}
}

if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
// Intel Windows proprietary driver MMVQ performance is worse than fp16, see
// https://github.com/ggml-org/llama.cpp/issues/17628
// Intel Windows proprietary driver MMVQ performance for types other than
// Q2_K/Q3_K/Q6_K is worse than fp16,
// see https://github.com/ggml-org/llama.cpp/issues/17628 and
// https://github.com/ggml-org/llama.cpp/pull/23056
return false;
}

Expand Down
108 changes: 66 additions & 42 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -212,28 +212,40 @@ i32vec4 repack4(uint ib, uint iqs) {
const uint qs_shift = ((iqs_k % 32) / 8) * 2;
const uint hm_shift = iqs_k / 8;

const uvec4 qs = uvec4( uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 ]) |
(uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 1]) << 16),
uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 2]) |
(uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 3]) << 16),
uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 4]) |
(uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 5]) << 16),
uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 6]) |
(uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 7]) << 16));

const uvec4 hmask = uvec4( uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 ]) |
(uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 1]) << 16),
uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 2]) |
(uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 3]) << 16),
uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 4]) |
(uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 5]) << 16),
uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 6]) |
(uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 7]) << 16));

// bitwise OR to add 4 if hmask is set, subtract later
const i8vec2 vals00 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 ] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 ] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals01 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 1] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 1] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals10 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 2] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 2] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals11 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 3] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 3] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals20 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 4] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 4] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals21 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 5] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 5] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals30 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 6] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 6] >> hm_shift) & uint16_t(0x0101)) << 2));
const i8vec2 vals31 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 7] >> qs_shift) & uint16_t(0x0303))) |
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 7] >> hm_shift) & uint16_t(0x0101)) << 2));

return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y) - int8_t(4)),
pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y) - int8_t(4)),
pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y) - int8_t(4)),
pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y) - int8_t(4)));
const uint vals0 = (( qs.x >> qs_shift) & 0x03030303) |
(((hmask.x >> hm_shift) & 0x01010101) << 2);
const uint vals1 = (( qs.y >> qs_shift) & 0x03030303) |
(((hmask.y >> hm_shift) & 0x01010101) << 2);
const uint vals2 = (( qs.z >> qs_shift) & 0x03030303) |
(((hmask.z >> hm_shift) & 0x01010101) << 2);
const uint vals3 = (( qs.w >> qs_shift) & 0x03030303) |
(((hmask.w >> hm_shift) & 0x01010101) << 2);

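// Subtract 4 from each byte by twiddling bits rather than re-packing, as Mesa
// compiles re-packing poorly. Each byte holds 0..7, so XORing with 0x80 first
// sets its top bit and keeps the subtraction from borrowing across bytes; the
// second XOR flips that bit back, leaving the signed value -4..3 in each byte.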
return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x04040404) ^ 0x80808080),
int32_t(((vals1 ^ 0x80808080) - 0x04040404) ^ 0x80808080),
int32_t(((vals2 ^ 0x80808080) - 0x04040404) ^ 0x80808080),
int32_t(((vals3 ^ 0x80808080) - 0x04040404) ^ 0x80808080));
}

float get_d_scale(uint ib, uint iqs) {
Expand Down Expand Up @@ -343,27 +355,39 @@ i32vec4 repack4(uint ib, uint iqs) {
const uint qh_idx = (iqs_k / 32) * 8 + iqs;
const uint qh_shift = ((iqs_k % 32) / 8) * 2;

const i8vec2 vals00 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 ] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 ] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals01 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 1] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 1] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals10 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 2] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 2] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals11 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 3] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 3] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals20 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 4] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 4] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals21 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 5] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 5] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals30 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 6] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 6] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);
const i8vec2 vals31 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 7] >> ql_shift) & uint16_t(0x0F0F))) |
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 7] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32);

return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y)),
pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y)),
pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y)),
pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y)));
const uvec4 ql = uvec4( uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 ]) |
(uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 1]) << 16),
uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 2]) |
(uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 3]) << 16),
uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 4]) |
(uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 5]) << 16),
uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 6]) |
(uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 7]) << 16));

const uvec4 qh = uvec4( uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 ]) |
(uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 1]) << 16),
uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 2]) |
(uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 3]) << 16),
uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 4]) |
(uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 5]) << 16),
uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 6]) |
(uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 7]) << 16));

const uint vals0 = (( ql.x >> ql_shift) & 0x0F0F0F0F) |
(((qh.x >> qh_shift) & 0x03030303) << 4);
const uint vals1 = (( ql.y >> ql_shift) & 0x0F0F0F0F) |
(((qh.y >> qh_shift) & 0x03030303) << 4);
const uint vals2 = (( ql.z >> ql_shift) & 0x0F0F0F0F) |
(((qh.z >> qh_shift) & 0x03030303) << 4);
const uint vals3 = (( ql.w >> ql_shift) & 0x0F0F0F0F) |
(((qh.w >> qh_shift) & 0x03030303) << 4);

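// Subtract 32 from each byte by twiddling bits rather than re-packing, as Mesa
// compiles re-packing poorly. Each byte holds 0..63, so XORing with 0x80 first
// sets its top bit and keeps the subtraction from borrowing across bytes; the
// second XOR flips that bit back, leaving the signed value -32..31 in each byte.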
return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x20202020) ^ 0x80808080),
int32_t(((vals1 ^ 0x80808080) - 0x20202020) ^ 0x80808080),
int32_t(((vals2 ^ 0x80808080) - 0x20202020) ^ 0x80808080),
int32_t(((vals3 ^ 0x80808080) - 0x20202020) ^ 0x80808080));
}

float get_d_scale(uint ib, uint iqs) {
Expand Down
Loading