Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 207 additions & 23 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

69 changes: 69 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
#endif

#if defined(DATA_A_Q4_0)
#if defined(A_TYPE_REPACKED)
// Q4_0, repacked layout: decode the two 4-bit quants packed in byte `iqs` of block `ib`.
// Blocks are addressed with a stride of 16 bytes in data_a_quants. The low nibble lands in
// .x and the high nibble in .y; both are re-centred by subtracting 8. No scale is applied.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint byte_idx = (a_offset + ib) * 16 + iqs;
    const uint packed = uint(data_a_quants[byte_idx]);
    const uint lo = packed & 0xF;
    const uint hi = packed >> 4;
    return vec2(lo, hi) - 8.0f;
}
// Q4_0, repacked layout: decode four 4-bit quants from one 16-bit load.
// Blocks are addressed with a stride of 8 entries in data_a_quants16; `iqs/2` picks the
// 16-bit entry. Nibbles are returned lowest-first, each re-centred by subtracting 8.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint half_idx = (a_offset + ib) * 8 + iqs/2;
    const uint packed = uint(data_a_quants16[half_idx]);
    const vec4 nibbles = vec4(packed & 0xF, (packed >> 4) & 0xF, (packed >> 8) & 0xF, packed >> 12);
    return nibbles - 8.0f;
}
#else
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return (vec2(vui & 0xF, vui >> 4) - 8.0f);
Expand All @@ -32,8 +42,19 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
}
#endif
#endif

#if defined(DATA_A_Q4_1)
#if defined(A_TYPE_REPACKED)
// Q4_1, repacked layout: decode the two 4-bit quants packed in byte `iqs` of block `ib`.
// Blocks are addressed with a stride of 16 bytes in data_a_quants. Returns the raw nibble
// values (low nibble in .x, high nibble in .y) with no offset or scale applied.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint byte_idx = (a_offset + ib) * 16 + iqs;
    const uint packed = uint(data_a_quants[byte_idx]);
    const uint lo = packed & 0xF;
    const uint hi = packed >> 4;
    return vec2(lo, hi);
}
// Q4_1, repacked layout: decode four 4-bit quants from one 16-bit load.
// Blocks are addressed with a stride of 8 entries in data_a_quants16; `iqs/2` picks the
// 16-bit entry. Returns the raw nibble values, lowest nibble first.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint half_idx = (a_offset + ib) * 8 + iqs/2;
    const uint packed = uint(data_a_quants16[half_idx]);
    const uint n0 = packed & 0xF;
    const uint n1 = (packed >> 4) & 0xF;
    const uint n2 = (packed >> 8) & 0xF;
    const uint n3 = packed >> 12;
    return vec4(n0, n1, n2, n3);
}
#else
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(vui & 0xF, vui >> 4);
Expand All @@ -43,6 +64,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12);
}
#endif
#endif

#if defined(DATA_A_Q5_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
Expand Down Expand Up @@ -77,6 +99,16 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
#endif

#if defined(DATA_A_Q8_0)
#if defined(A_TYPE_REPACKED)
// Q8_0, repacked layout: decode two signed 8-bit quants from one 16-bit load.
// Blocks are addressed with a stride of 16 entries in data_a_quants16; `iqs/2` picks the
// entry. The two low bytes of the unpacked word are returned as floats, unscaled.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint half_idx = (a_offset + ib) * 16 + iqs/2;
    const int32_t packed = int32_t(data_a_quants16[half_idx]);
    const i8vec2 pair = unpack8(packed).xy;
    return vec2(pair);
}
// Q8_0, repacked layout: decode four signed 8-bit quants from one 32-bit load.
// Blocks are addressed with a stride of 8 entries in data_a_quants32; `iqs/4` picks the
// entry. The four bytes are returned as floats, unscaled.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint word_idx = (a_offset + ib) * 8 + iqs/4;
    const int32_t packed = int32_t(data_a_quants32[word_idx]);
    const i8vec4 quad = unpack8(packed);
    return vec4(quad);
}
#else
// Q8_0, block-struct layout: read quants `iqs` and `iqs + 1` of block `ib` and return
// them as an unscaled float pair.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const int q0 = int(data_a[blk].qs[iqs]);
    const int q1 = int(data_a[blk].qs[iqs + 1]);
    return vec2(q0, q1);
}
Expand All @@ -86,6 +118,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
return vec4(v0.x, v0.y, v1.x, v1.y);
}
#endif
#endif

#if defined(DATA_A_Q1_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
Expand Down Expand Up @@ -428,6 +461,16 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
#endif

#if defined(DATA_A_IQ4_NL)
#if defined(A_TYPE_REPACKED)
// IQ4_NL, repacked layout: decode the two 4-bit indices packed in byte `iqs` of block `ib`
// and map each through the kvalues_iq4nl lookup table. Blocks are addressed with a stride
// of 16 bytes in data_a_quants. Low nibble -> .x, high nibble -> .y; no scale is applied.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint byte_idx = (a_offset + ib) * 16 + iqs;
    const uint packed = uint(data_a_quants[byte_idx]);
    const uint lo = packed & 0xF;
    const uint hi = packed >> 4;
    return vec2(kvalues_iq4nl[lo], kvalues_iq4nl[hi]);
}
// IQ4_NL, repacked layout: decode four 4-bit indices from one 16-bit load and map each
// through kvalues_iq4nl. Blocks are addressed with a stride of 8 entries in
// data_a_quants16; `iqs/2` picks the entry. Results are ordered lowest nibble first.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint half_idx = (a_offset + ib) * 8 + iqs/2;
    const uint packed = uint(data_a_quants16[half_idx]);
    const uint n0 = packed & 0xF;
    const uint n1 = (packed >> 4) & 0xF;
    const uint n2 = (packed >> 8) & 0xF;
    const uint n3 = packed >> 12;
    return vec4(kvalues_iq4nl[n0], kvalues_iq4nl[n1], kvalues_iq4nl[n2], kvalues_iq4nl[n3]);
}
#else
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
Expand All @@ -437,8 +480,20 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
}
#endif
#endif

#if defined(DATA_A_MXFP4)
#if defined(A_TYPE_REPACKED)
// MXFP4, repacked layout: decode the two 4-bit indices packed in byte `iqs` of block `ib`,
// map each through kvalues_mxfp4 and halve the result. Blocks are addressed with a stride
// of 16 bytes in data_a_quants. Low nibble -> .x, high nibble -> .y.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint byte_idx = (a_offset + ib) * 16 + iqs;
    const uint packed = uint(data_a_quants[byte_idx]);
    const uint lo = packed & 0xF;
    const uint hi = packed >> 4;
    return vec2(kvalues_mxfp4[lo], kvalues_mxfp4[hi]) * 0.5;
}
// MXFP4, repacked layout: decode four 4-bit indices from one 16-bit load, map each through
// kvalues_mxfp4 and halve the result. Blocks are addressed with a stride of 8 entries in
// data_a_quants16; `iqs/2` picks the entry. Results are ordered lowest nibble first.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint half_idx = (a_offset + ib) * 8 + iqs/2;
    const uint packed = uint(data_a_quants16[half_idx]);
    const uint n0 = packed & 0xF;
    const uint n1 = (packed >> 4) & 0xF;
    const uint n2 = (packed >> 8) & 0xF;
    const uint n3 = packed >> 12;
    return vec4(kvalues_mxfp4[n0], kvalues_mxfp4[n1], kvalues_mxfp4[n2], kvalues_mxfp4[n3]) * 0.5;
}
#else
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_mxfp4[vui & 0xF], kvalues_mxfp4[vui >> 4]) * 0.5;
Expand All @@ -449,6 +504,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
return vec4(v0.x, v0.y, v1.x, v1.y);
}
#endif
#endif

#if defined(DATA_A_NVFP4)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
Expand Down Expand Up @@ -486,7 +542,11 @@ vec2 get_dm(uint ib, uint a_offset) {

#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
// Return the per-block scale of block `ib` in .x; .y is always 0 for these formats.
// For repacked Q4_0 / Q8_0 / IQ4_NL the scale is read from data_a_deltas at
// p.deltas_offset; otherwise it comes from the `d` field of the block struct.
vec2 get_dm(uint ib, uint a_offset) {
#if (defined(DATA_A_Q4_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)) && defined(A_TYPE_REPACKED)
    const float scale = float(data_a_deltas[a_offset + p.deltas_offset + ib]);
#else
    const float scale = float(data_a[a_offset + ib].d);
#endif
    return vec2(scale, 0);
}
#endif

Expand All @@ -499,7 +559,11 @@ vec2 get_dm(uint ib, uint a_offset) {

#if defined(DATA_A_MXFP4)
// Return the MXFP4 per-block scale of block `ib` in .x (converted with e8m0_to_fp32);
// .y is always 0. In the repacked layout the scale byte is read from data_a_quants at
// p.deltas_offset; otherwise it comes from the `e` field of the block struct.
vec2 get_dm(uint ib, uint a_offset) {
#if defined(A_TYPE_REPACKED)
    const uint scale_idx = p.deltas_offset + a_offset + ib;
    const float scale = e8m0_to_fp32(uint8_t(data_a_quants[scale_idx]));
#else
    const float scale = e8m0_to_fp32(data_a[a_offset + ib].e);
#endif
    return vec2(scale, 0);
}
#endif

Expand All @@ -511,8 +575,13 @@ vec2 get_dm(uint ib, uint a_offset) {

#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
// Return the two per-block float16 values of block `ib` as a vec2.
// For repacked Q4_1 they are stored as two consecutive entries per block in
// data_a_deltas, starting at p.deltas_offset; otherwise the packed `dm` field of the
// block struct is converted directly.
vec2 get_dm(uint ib, uint a_offset) {
#if defined(DATA_A_Q4_1) && defined(A_TYPE_REPACKED)
    const uint base = p.deltas_offset + (a_offset + ib) * 2;
    const float first = float(data_a_deltas[base]);
    const float second = float(data_a_deltas[base + 1]);
    return vec2(first, second);
#else
    return vec2(data_a_packed32[a_offset + ib].dm);
#endif
}
#endif

Expand Down
82 changes: 74 additions & 8 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -25,36 +25,65 @@ float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2
return bit != 0u ? d : -d;
}

// Buffer-reference view of one Q4_0 block, as passed to dequantFuncQ4_0.
#ifdef A_TYPE_REPACKED
// Repacked layout: the view holds only four 32-bit words of quants (16-byte aligned);
// there is no scale member in this view.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_0 {
uint32_t qs[4];
};
#else
// Default layout: the view wraps a whole block_q4_0_packed16 struct (2-byte aligned).
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
block_q4_0_packed16 block;
};
#endif

// Decode a single Q4_0 element.
//   bl            - buffer reference to the block holding the element
//   blockCoords   - block coordinates; only used in the repacked path to locate the scale
//   coordInBlock  - coordInBlock[1] is the element index inside the block
// Bit 4 of the element index selects the high nibble (shift by 4); the lower bits select
// the byte. Returns (nibble - 8) * d as float16.
float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // NOTE: `d` is declared once per preprocessor branch only. A leading unconditional
    // `bl.block.d` read would redeclare `d` and reference a member that does not exist
    // in the repacked buffer view.
    const uint idx = coordInBlock[1];
#ifdef A_TYPE_REPACKED
    // Scales are stored apart from the quants, so compute the linear block index.
    const uint ib = pos_a + blockCoords[0] * (p.stride_a / QUANT_K) + blockCoords[1];
    const float16_t d = data_a_deltas[p.deltas_offset + ib];
    // idx bits 2..3 pick the 32-bit word, bits 0..1 the byte inside it.
    uint32_t qs = bl.qs[(idx & 0xC) >> 2];
    const uint shift = (idx & 0x10) >> 2;
    qs >>= ((idx & 3) * 8 + shift);
#else
    const float16_t d = bl.block.d;
    // idx bits 1..3 pick the 16-bit word, bit 0 the byte inside it.
    uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
    const uint shift = (idx & 0x10) >> 2;
    qs >>= shift;
    qs &= 0x0F0F;
    qs = unpack8(qs)[idx & 1];
#endif
    qs &= 0xF;
    float16_t ret = (float16_t(qs) - float16_t(8)) * d;
    return ret;
}

// Buffer-reference view of one Q4_1 block, as passed to dequantFuncQ4_1.
#ifdef A_TYPE_REPACKED
// Repacked layout: the view holds only four 32-bit words of quants (16-byte aligned);
// there are no d/m members in this view.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_1 {
uint32_t qs[4];
};
#else
// Default layout: the view wraps a whole block_q4_1 struct (4-byte aligned).
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
block_q4_1 block;
};
#endif

float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const float16_t m = bl.block.m;
const uint idx = coordInBlock[1];
const uint iqs = idx & 0xF;
const uint shift = (idx & 0x10) >> 2;
#ifdef A_TYPE_REPACKED
const uint ib = pos_a + blockCoords[0] * (p.stride_a / QUANT_K) + blockCoords[1];
const float16_t d = data_a_deltas[p.deltas_offset + ib * 2];
const float16_t m = data_a_deltas[p.deltas_offset + ib * 2 + 1];
uint32_t qs = bl.qs[(idx & 0xC) >> 2];
qs >>= ((iqs & 3) * 8 + shift);
#else
const float16_t d = bl.block.d;
const float16_t m = bl.block.m;
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
#endif
qs &= 0xF;
float16_t ret = float16_t(qs) * d + m;
return ret;
Expand Down Expand Up @@ -105,18 +134,28 @@ float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2
return ret;
}

// Buffer-reference view of one Q8_0 block, as passed to dequantFuncQ8_0.
#ifdef A_TYPE_REPACKED
// Repacked layout: the view holds only eight 32-bit words of quants (16-byte aligned);
// there is no scale member in this view.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ8_0 {
int32_t qs[8];
};
#else
// Default layout: the view wraps a whole block_q8_0_packed16 struct (2-byte aligned).
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
block_q8_0_packed16 block;
};
#endif

// Decode a single Q8_0 element.
//   bl            - buffer reference to the block holding the element
//   blockCoords   - block coordinates; only used in the repacked path to locate the scale
//   coordInBlock  - coordInBlock[1] is the element index inside the block
// Returns quant * d as float16.
float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // NOTE: `d` is declared once per preprocessor branch only. A leading unconditional
    // `bl.block.d` read would redeclare `d` and reference a member that does not exist
    // in the repacked buffer view.
    const uint idx = coordInBlock[1];
    const uint iqs = idx;

    // Load the packed word containing this element and select its byte
    // (32-bit words in the repacked layout, 16-bit words otherwise).
#ifdef A_TYPE_REPACKED
    // Scales are stored apart from the quants, so compute the linear block index.
    const uint ib = pos_a + blockCoords[0] * (p.stride_a / QUANT_K) + blockCoords[1];
    const float16_t d = data_a_deltas[p.deltas_offset + ib];
    int32_t qs = unpack8(bl.qs[(iqs & 0x1C) >> 2])[iqs & 3];
#else
    const float16_t d = bl.block.d;
    int32_t qs = unpack8(bl.block.qs[(iqs & 0x1E) >> 1])[iqs & 1];
#endif
    float16_t ret = float16_t(qs) * d;
    return ret;
}
Expand Down Expand Up @@ -660,37 +699,64 @@ float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoor
#endif

#if defined(DATA_A_IQ4_NL)
// Buffer-reference view of one IQ4_NL block, as passed to dequantFuncIQ4_NL.
#ifdef A_TYPE_REPACKED
// Repacked layout: the view holds only four 32-bit words of quants (16-byte aligned);
// there is no scale member in this view.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufIQ4_NL {
uint32_t qs[4];
};
#else
// Default layout: the view wraps a whole block_iq4_nl struct (2-byte aligned).
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
block_iq4_nl block;
};
#endif

// Decode a single IQ4_NL element.
//   bl            - buffer reference to the block holding the element
//   blockCoords   - block coordinates; only used in the repacked path to locate the scale
//   coordInBlock  - coordInBlock[1] is the element index inside the block
// Bit 4 of the element index selects the high nibble (shift by 4); the lower bits select
// the byte. The 4-bit value indexes kvalues_iq4nl and the result is multiplied by d.
float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    // NOTE: `d` is declared once per preprocessor branch only. A leading unconditional
    // `bl.block.d` read would redeclare `d` and reference a member that does not exist
    // in the repacked buffer view.
    const uint idx = coordInBlock[1];
#ifdef A_TYPE_REPACKED
    // Scales are stored apart from the quants, so compute the linear block index.
    const uint ib = pos_a + blockCoords[0] * (p.stride_a / QUANT_K) + blockCoords[1];
    const float16_t d = data_a_deltas[p.deltas_offset + ib];
    // idx bits 2..3 pick the 32-bit word, bits 0..1 the byte inside it.
    uint32_t qs = bl.qs[(idx & 0xC) >> 2];
    const uint shift = (idx & 0x10) >> 2;
    qs >>= ((idx & 3) * 8 + shift);
#else
    const float16_t d = bl.block.d;
    const uint iqs = idx & 0xF;
    const uint shift = (idx & 0x10) >> 2;
    uint32_t qs = bl.block.qs[iqs];
    qs >>= shift;
#endif
    qs &= 0xF;
    float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
    return ret;
}
#endif

#if defined(DATA_A_MXFP4)
// Buffer-reference view of one MXFP4 block, as passed to dequantFuncMXFP4.
#ifdef A_TYPE_REPACKED
// Repacked layout: the view holds only four 32-bit words of quants (16-byte aligned);
// there is no exponent/scale member in this view.
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufMXFP4 {
uint32_t qs[4];
};
#else
// Default layout: the view wraps a whole block_mxfp4 struct (2-byte aligned).
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufMXFP4 {
block_mxfp4 block;
};
#endif

float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float d = e8m0_to_fp32(bl.block.e);
const uint idx = coordInBlock[1];
const uint iqs = idx & 0xF;
const uint shift = (idx & 0x10) >> 2;
#ifdef A_TYPE_REPACKED
const uint ib = pos_a + blockCoords[0] * (p.stride_a / QUANT_K) + blockCoords[1];
const float d = e8m0_to_fp32(data_a_scales[p.deltas_offset + ib]);
uint32_t qs = bl.qs[(iqs & 0xC) >> 2];
qs >>= ((iqs & 3) * 8 + shift);
#else
const float d = e8m0_to_fp32(bl.block.e);
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
#endif
qs &= 0xF;
float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
return ret;
Expand Down
2 changes: 2 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ layout (push_constant) uniform parameter
uint broadcast2;
uint broadcast3;
#endif

uint deltas_offset;
} p;

#ifdef MUL_MAT_ID
Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16
#if defined(A_TYPE_PACKED32)
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
#endif
// Views used by the repacked A-matrix layout. All four blocks are declared on binding 0,
// so they expose the same buffer as 8-bit, 16-bit, 32-bit and float16 arrays respectively.
#if defined(A_TYPE_REPACKED)
layout (binding = 0) readonly buffer A_QUANTS {uint8_t data_a_quants[];};
layout (binding = 0) readonly buffer A_QUANTS16 {uint16_t data_a_quants16[];};
layout (binding = 0) readonly buffer A_QUANTS32 {uint32_t data_a_quants32[];};
layout (binding = 0) readonly buffer A_DELTAS {float16_t data_a_deltas[];};
#endif

layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
#ifdef B_TYPEV2
Expand Down
Loading
Loading