From 91a131b5f8f844bacad4533f5dd5e1c414299b6a Mon Sep 17 00:00:00 2001 From: 007jbks Date: Tue, 21 Apr 2026 16:59:08 +0530 Subject: [PATCH 1/3] ggml: vectorize ggml_vec_dot_q4_1_q8_1 with WASM SIMD128 Optimize the inner loop of ggml_vec_dot_q4_1_q8_1_generic using WASM SIMD128 intrinsics, gated behind #ifdef __wasm_simd128__ so non-wasm builds are completely unaffected. Approach: - single wasm_v128_load covers all 32 packed 4-bit weights - nibbles unpacked via AND/SHR into two u8x16 registers - widened to i16 before multiply (WASM SIMD has no i8*i8 instruction) - 4x wasm_i32x4_dot_i16x8 calls accumulate all 32 element pairs - horizontal reduce via 4x wasm_i32x4_extract_lane Benchmark (node v25, emcc -O3 -msimd128, 64 blocks x QK8_1=32, 200k iterations): | impl | ns/call | speedup | |--------|---------|---------| | scalar | 880.7 | 1.00x | | simd | 257.8 | 3.42x | Correctness verified against scalar reference across 10 random seeds with exact output match. --- ggml/src/ggml-cpu/quants.c | 40 +++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index e5f9a4083f9c..012fce58aa96 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -14,6 +14,8 @@ #include // for qsort #include // for GGML_ASSERT +#include // for wasm simd + #define GROUP_MAX_EPS 1e-15f #define GROUP_MAX_EPS_IQ3_XXS 1e-8f #define GROUP_MAX_EPS_IQ2_S 1e-8f @@ -170,7 +172,6 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } - void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -207,7 +208,6 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -// TODO: add WASM SIMD void ggml_vec_dot_q4_1_q8_1_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; @@ -225,6 +225,40 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c int ib = 0; float sumf = 0; +#ifdef __wasm_simd128__ + for (; ib < nb; ++ib) { + v128_t raw = wasm_v128_load(x[ib].qs); + v128_t v0s = wasm_v128_and(raw, wasm_i8x16_splat(0x0F)); + v128_t v1s = wasm_u8x16_shr(raw, 4); + + v128_t ys_lo = wasm_v128_load(&y[ib].qs[0]); + v128_t ys_hi = wasm_v128_load(&y[ib].qs[16]); + + v128_t v0s_l = wasm_u16x8_extend_low_u8x16(v0s); + v128_t v0s_h = wasm_u16x8_extend_high_u8x16(v0s); + v128_t ylo_l = wasm_i16x8_extend_low_i8x16(ys_lo); + v128_t ylo_h = wasm_i16x8_extend_high_i8x16(ys_lo); + + v128_t acc = wasm_i32x4_dot_i16x8(v0s_l, ylo_l); + acc = wasm_i32x4_add(acc, wasm_i32x4_dot_i16x8(v0s_h, ylo_h)); + + v128_t v1s_l = wasm_u16x8_extend_low_u8x16(v1s); + v128_t v1s_h = wasm_u16x8_extend_high_u8x16(v1s); + v128_t yhi_l = wasm_i16x8_extend_low_i8x16(ys_hi); + v128_t yhi_h = wasm_i16x8_extend_high_i8x16(ys_hi); + + acc = wasm_i32x4_add(acc, wasm_i32x4_dot_i16x8(v1s_l, yhi_l)); + acc = wasm_i32x4_add(acc, wasm_i32x4_dot_i16x8(v1s_h, yhi_h)); + + int sumi = wasm_i32x4_extract_lane(acc, 0) + + wasm_i32x4_extract_lane(acc, 1) + + wasm_i32x4_extract_lane(acc, 2) + + wasm_i32x4_extract_lane(acc, 3); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi + + GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + } +#else for (; ib < nb; ++ib) { int sumi0 = 0; int sumi1 = 0; @@ -240,10 +274,10 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c int sumi = sumi0 + sumi1; sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } +#endif *s = sumf; } - void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, 
size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); From 799539b4af5049e4cfd0f76b7b4ccb2f618dcdf1 Mon Sep 17 00:00:00 2001 From: 007jbks Date: Tue, 21 Apr 2026 17:20:56 +0530 Subject: [PATCH 2/3] ggml: move q4_1_q8_1 WASM SIMD implementation to wasm backend Relocate the SIMD128 implementation of ggml_vec_dot_q4_1_q8_1 to ggml/src/ggml-cpu/arch/wasm/quants.c to follow the architecture-specific layout. Restore the generic implementation in ggml/src/ggml-cpu/quants.c. Move the scalar for loop into the #else block of the wasm implementation. --- ggml/src/ggml-cpu/arch/wasm/quants.c | 78 ++++++++++++++++++++++++++++ ggml/src/ggml-cpu/quants.c | 40 ++------------ 2 files changed, 81 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 648c6fcaba76..513c415e1dd9 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -355,6 +355,84 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + float sumf = 0; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + float summs = 0.0f; + + for (int ib = 0; ib < nb; ++ib) { + const block_q4_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + + const v128_t raw = wasm_v128_load(x0->qs); + const v128_t v0s = wasm_v128_and(raw, wasm_i8x16_splat(0x0F)); + const v128_t v1s = wasm_u8x16_shr(raw, 4); + + const v128_t ys_lo = 
wasm_v128_load(y0->qs); + const v128_t ys_hi = wasm_v128_load(y0->qs + 16); + + const v128_t v0s_l = wasm_u16x8_extend_low_u8x16(v0s); + const v128_t v0s_h = wasm_u16x8_extend_high_u8x16(v0s); + const v128_t ylo_l = wasm_i16x8_extend_low_i8x16(ys_lo); + const v128_t ylo_h = wasm_i16x8_extend_high_i8x16(ys_lo); + const v128_t v1s_l = wasm_u16x8_extend_low_u8x16(v1s); + const v128_t v1s_h = wasm_u16x8_extend_high_u8x16(v1s); + const v128_t yhi_l = wasm_i16x8_extend_low_i8x16(ys_hi); + const v128_t yhi_h = wasm_i16x8_extend_high_i8x16(ys_hi); + + const v128_t acc = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(v0s_l, ylo_l), + wasm_i32x4_dot_i16x8(v0s_h, ylo_h)), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(v1s_l, yhi_l), + wasm_i32x4_dot_i16x8(v1s_h, yhi_h))); + + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul( + wasm_f32x4_convert_i32x4(acc), + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; + +#else + for (int ib = 0; ib < nb; ++ib) { + int sumi0 = 0, sumi1 = 0; + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + sumi0 += v0 * y[ib].qs[j]; + sumi1 += v1 * y[ib].qs[j + qk/2]; + } + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi + + GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + } + +#endif + + *s = sumf; +} + void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 012fce58aa96..e5f9a4083f9c 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -14,8 +14,6 @@ #include // for qsort 
#include // for GGML_ASSERT -#include // for wasm simd - #define GROUP_MAX_EPS 1e-15f #define GROUP_MAX_EPS_IQ3_XXS 1e-8f #define GROUP_MAX_EPS_IQ2_S 1e-8f @@ -172,6 +170,7 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } + void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -208,6 +207,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } +// TODO: add WASM SIMD void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; @@ -225,40 +225,6 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c int ib = 0; float sumf = 0; -#ifdef __wasm_simd128__ - for (; ib < nb; ++ib) { - v128_t raw = wasm_v128_load(x[ib].qs); - v128_t v0s = wasm_v128_and(raw, wasm_i8x16_splat(0x0F)); - v128_t v1s = wasm_u8x16_shr(raw, 4); - - v128_t ys_lo = wasm_v128_load(&y[ib].qs[0]); - v128_t ys_hi = wasm_v128_load(&y[ib].qs[16]); - - v128_t v0s_l = wasm_u16x8_extend_low_u8x16(v0s); - v128_t v0s_h = wasm_u16x8_extend_high_u8x16(v0s); - v128_t ylo_l = wasm_i16x8_extend_low_i8x16(ys_lo); - v128_t ylo_h = wasm_i16x8_extend_high_i8x16(ys_lo); - - v128_t acc = wasm_i32x4_dot_i16x8(v0s_l, ylo_l); - acc = wasm_i32x4_add(acc, wasm_i32x4_dot_i16x8(v0s_h, ylo_h)); - - v128_t v1s_l = wasm_u16x8_extend_low_u8x16(v1s); - v128_t v1s_h = wasm_u16x8_extend_high_u8x16(v1s); - v128_t yhi_l = wasm_i16x8_extend_low_i8x16(ys_hi); - v128_t yhi_h = wasm_i16x8_extend_high_i8x16(ys_hi); - - acc = wasm_i32x4_add(acc, wasm_i32x4_dot_i16x8(v1s_l, yhi_l)); - acc = wasm_i32x4_add(acc, wasm_i32x4_dot_i16x8(v1s_h, yhi_h)); - - int sumi = wasm_i32x4_extract_lane(acc, 0) - + 
wasm_i32x4_extract_lane(acc, 1) - + wasm_i32x4_extract_lane(acc, 2) - + wasm_i32x4_extract_lane(acc, 3); - - sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi - + GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); - } -#else for (; ib < nb; ++ib) { int sumi0 = 0; int sumi1 = 0; @@ -274,10 +240,10 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c int sumi = sumi0 + sumi1; sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } -#endif *s = sumf; } + void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); From e807882ec9e44338df37a36cd80c4f04b70d1a1c Mon Sep 17 00:00:00 2001 From: 007jbks Date: Thu, 4 Jun 2026 10:09:05 +0530 Subject: [PATCH 3/3] ggml: use generic q4_1_q8_1 fallback in wasm backend --- ggml/src/ggml-cpu/arch/wasm/quants.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 513c415e1dd9..0a7119b4e1fb 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -414,23 +414,17 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; + *s = sumf; + #else - for (int ib = 0; ib < nb; ++ib) { - int sumi0 = 0, sumi1 = 0; - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[ib].qs[j] & 0x0F); - const int v1 = (x[ib].qs[j] >> 4); - sumi0 += v0 * y[ib].qs[j]; - sumi1 += v1 * y[ib].qs[j + qk/2]; - } - int sumi = sumi0 + sumi1; - sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi - + 
GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); - } + UNUSED(nb); + UNUSED(x); + UNUSED(y); + UNUSED(sumf); + ggml_vec_dot_q4_1_q8_1_generic( + n, s, bs, vx, bx, vy, by, nrc); #endif - - *s = sumf; } void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {