From 4f38d77685769b2511b1e4b4a6d39b925347de9d Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Mon, 10 Nov 2025 17:28:52 +0800 Subject: [PATCH 1/5] feat(ios): build metallib with metal4 for enable tensor api --- cpp/ggml-metal/ggml-metal-device.m | 4 ++++ scripts/bootstrap.sh | 10 ++++++++++ scripts/build-ios.sh | 2 ++ scripts/patches/ggml-metal-device.m.patch | 10 +++++++--- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/cpp/ggml-metal/ggml-metal-device.m b/cpp/ggml-metal/ggml-metal-device.m index b6c6b6a7..29baaf47 100644 --- a/cpp/ggml-metal/ggml-metal-device.m +++ b/cpp/ggml-metal/ggml-metal-device.m @@ -182,8 +182,12 @@ lm_ggml_metal_library_t lm_ggml_metal_library_init(lm_ggml_metal_device_t dev) { #endif #if TARGET_OS_SIMULATOR + NSString *lib_name = @"ggml-llama-sim"; + if (dev->props.has_tensor) lib_name = @"ggml-llama-sim-metal4"; NSString * path_lib = [bundle pathForResource:@"ggml-llama-sim" ofType:@"metallib"]; #else + NSString *lib_name = @"ggml-llama"; + if (dev->props.has_tensor) lib_name = @"ggml-llama-metal4"; NSString * path_lib = [bundle pathForResource:@"ggml-llama" ofType:@"metallib"]; #endif if (path_lib == nil) { diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 7ff6396a..3db0d723 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -292,11 +292,21 @@ if [ "$OS" = "Darwin" ]; then xcrun --sdk iphoneos metallib ggml-metal.air -o ggml-llama.metallib rm ggml-metal.air mv ./ggml-llama.metallib "$CPP_DIR/ggml-metal/ggml-llama.metallib" + # metal4 + xcrun --sdk iphoneos metal -O3 -std=metal4.0 -mios-version-min=26.0 -c ggml-metal.metal -o ggml-metal.air -DGGML_METAL_HAS_BF16=1 -DGGML_METAL_HAS_TENSOR=1 + xcrun --sdk iphoneos metallib ggml-metal.air -o ggml-llama-metal4.metallib + rm ggml-metal.air + mv ./ggml-llama-metal4.metallib "$CPP_DIR/ggml-metal/ggml-llama-metal4.metallib" xcrun --sdk iphonesimulator metal -O3 -std=metal3.2 -mios-version-min=16.0 -c ggml-metal.metal -o ggml-metal.air -DGGML_METAL_HAS_BF16=1 xcrun --sdk iphonesimulator metallib ggml-metal.air -o ggml-llama.metallib rm ggml-metal.air mv ./ggml-llama.metallib "$CPP_DIR/ggml-metal/ggml-llama-sim.metallib" + # metal4 + xcrun --sdk iphonesimulator metal -O3 -std=metal4.0 -mios-version-min=26.0 -c ggml-metal.metal -o ggml-metal.air -DGGML_METAL_HAS_BF16=1 -DGGML_METAL_HAS_TENSOR=1 + xcrun --sdk iphonesimulator metallib ggml-metal.air -o ggml-llama-metal4.metallib + rm ggml-metal.air + mv ./ggml-llama-metal4.metallib "$CPP_DIR/ggml-metal/ggml-llama-sim-metal4.metallib" # Remove the symbolic link rm ggml-common.h diff --git a/scripts/build-ios.sh b/scripts/build-ios.sh index 31fdabec..ebeb2a28 100755 --- a/scripts/build-ios.sh +++ b/scripts/build-ios.sh @@ -50,8 +50,10 @@ function build_framework() { # TODO: May need to re-build metallib for tvOS if [ "$4" == "ios-arm64_x86_64-simulator" ] || [ "$4" == "tvos-arm64_x86_64-simulator" ]; then cp ../cpp/ggml-metal/ggml-llama-sim.metallib ../ios/rnllama.xcframework/$4/rnllama.framework/ggml-llama-sim.metallib + cp ../cpp/ggml-metal/ggml-llama-sim-metal4.metallib ../ios/rnllama.xcframework/$4/rnllama.framework/ggml-llama-sim-metal4.metallib else cp ../cpp/ggml-metal/ggml-llama.metallib ../ios/rnllama.xcframework/$4/rnllama.framework/ggml-llama.metallib + cp ../cpp/ggml-metal/ggml-llama-metal4.metallib ../ios/rnllama.xcframework/$4/rnllama.framework/ggml-llama-metal4.metallib fi rm -rf ./* diff --git a/scripts/patches/ggml-metal-device.m.patch b/scripts/patches/ggml-metal-device.m.patch index c2729178..7d72cd78 100644 --- a/scripts/patches/ggml-metal-device.m.patch +++ b/scripts/patches/ggml-metal-device.m.patch @@ -1,13 +1,17 @@ ---- ggml-metal-device.m.orig 2025-09-18 14:50:43 -+++ ggml-metal-device.m 2025-09-18 14:50:16 -@@ -171,7 +171,11 @@ +--- ggml-metal/ggml-metal-device.m.orig 2025-11-10 17:26:40 ++++ ggml-metal/ggml-metal-device.m 2025-11-10 17:22:40 +@@ -181,7 +181,15 @@ NSBundle * bundle = [NSBundle bundleForClass:[LMGGMLMetalClass class]]; #endif - NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; +#if TARGET_OS_SIMULATOR ++ NSString *lib_name = @"ggml-llama-sim"; ++ if (dev->props.has_tensor) lib_name = @"ggml-llama-sim-metal4"; + NSString * path_lib = [bundle pathForResource:@"ggml-llama-sim" ofType:@"metallib"]; +#else ++ NSString *lib_name = @"ggml-llama"; ++ if (dev->props.has_tensor) lib_name = @"ggml-llama-metal4"; + NSString * path_lib = [bundle pathForResource:@"ggml-llama" ofType:@"metallib"]; +#endif if (path_lib == nil) { From 2b102bf5dec6a311dee5a70478fbeab9007f594f Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Tue, 11 Nov 2025 10:28:03 +0800 Subject: [PATCH 2/5] fix: patch --- scripts/patches/ggml-metal-device.m.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/patches/ggml-metal-device.m.patch b/scripts/patches/ggml-metal-device.m.patch index 7d72cd78..6c01ea2e 100644 --- a/scripts/patches/ggml-metal-device.m.patch +++ b/scripts/patches/ggml-metal-device.m.patch @@ -1,5 +1,5 @@ ---- ggml-metal/ggml-metal-device.m.orig 2025-11-10 17:26:40 -+++ ggml-metal/ggml-metal-device.m 2025-11-10 17:22:40 +--- ggml-metal-device.m.orig 2025-11-10 17:26:40 ++++ ggml-metal-device.m 2025-11-10 17:22:40 @@ -181,7 +181,15 @@ NSBundle * bundle = [NSBundle bundleForClass:[LMGGMLMetalClass class]]; #endif From 11a978cebe91c531d5c2267d47c4933ff36e365c Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Tue, 11 Nov 2025 10:28:52 +0800 Subject: [PATCH 3/5] feat: sync llama.cpp to b7017 --- cpp/common.h | 3 +- cpp/ggml-cpu/arch/arm/quants.c | 454 +++++++++++++++++++++++++++-- cpp/ggml-cpu/ggml-cpu.c | 37 ++- cpp/ggml-cpu/ops.cpp | 40 --- cpp/ggml-cpu/ops.h | 4 - cpp/ggml-metal/ggml-metal-device.m | 6 +- cpp/ggml-metal/ggml-metal-ops.cpp | 5 + cpp/ggml-opencl/ggml-opencl.cpp | 7 +- cpp/llama-memory-recurrent.cpp | 7 +- cpp/models/ernie4-5.cpp | 9 +- cpp/models/openai-moe-iswa.cpp | 3 +- cpp/tools/mtmd/clip.cpp | 17 +- src/version.ts | 4 +- third_party/llama.cpp | 2 +- 14 files changed, 488 insertions(+), 110 deletions(-) diff --git a/cpp/common.h b/cpp/common.h index cb7d22f3..7d99f858 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -464,7 +464,8 @@ struct common_params { float slot_prompt_similarity = 0.1f; // batched-bench params - bool is_pp_shared = false; + bool is_pp_shared = false; + bool is_tg_separate = false; std::vector n_pp; std::vector n_tg; diff --git a/cpp/ggml-cpu/arch/arm/quants.c b/cpp/ggml-cpu/arch/arm/quants.c index 42095574..b55e2c2c 100644 --- a/cpp/ggml-cpu/arch/arm/quants.c +++ b/cpp/ggml-cpu/arch/arm/quants.c @@ -2044,6 +2044,26 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con } +#ifdef __ARM_FEATURE_SVE +static inline svuint32_t lm_ggml_decode_q4scales_and_mins_for_mmla(const uint32_t * vx_scales) { + const svbool_t pg_all = svptrue_pat_b32(SV_VL4); + const svbool_t pg_false = svpfalse_b(); // 0x0000 + const svbool_t pg_lo_8 = svwhilelt_b8_s32(0, 8); // 0x00ff + const svbool_t pg_odd = svzip1_b32(pg_false, pg_lo_8); + + svuint32_t vutmp_hi, vutmp_lo; + svuint32_t vx01 = svld1_u32(pg_lo_8, vx_scales); + vutmp_hi = svzip1_u32(vx01, vx01); + vutmp_hi = svlsr_n_u32_m(pg_odd, vutmp_hi, 2); + vutmp_hi = svreinterpret_u32_u64(svand_n_u64_x(pg_all, svreinterpret_u64_u32(vutmp_hi), UINT64_C(0x303030303f3f3f3f))); + const svuint32_t vx2 = svdup_u32(vx_scales[2]); + vutmp_lo = svlsr_u32_x(pg_all, vx2, svreinterpret_u32_s32(svindex_s32(-2, 2))); + vutmp_lo = svand_n_u32_z(pg_odd, vutmp_lo, UINT32_C(0x0f0f0f0f)); + svuint32_t vutmp = svorr_u32_z(pg_all, vutmp_hi, vutmp_lo); + return vutmp; +} +#endif + void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); #ifdef __ARM_FEATURE_MATMUL_INT8 @@ -2066,8 +2086,220 @@ void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con static const uint32_t kmask3 = 0x03030303; uint32_t utmp[4]; +#ifdef __ARM_FEATURE_SVE + const int vector_length = lm_ggml_cpu_get_sve_cnt()*8; +#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + svbool_t pg32_2 = svptrue_pat_b32(SV_VL2); + + const block_q4_K * LM_GGML_RESTRICT vx0 = vx; + const block_q8_K * LM_GGML_RESTRICT vy0 = vy; + const block_q4_K * LM_GGML_RESTRICT vx1 = (const block_q4_K *) ((const uint8_t*)vx + bx); + const block_q8_K * LM_GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by); + + union { + uint32_t u32[8]; + uint64_t u64[4]; + } new_utmp; + + svfloat32_t sumf1 = svdup_n_f32(0); + + switch (vector_length) { + case 128: + { + svbool_t pg_false = svpfalse_b(); + svbool_t pg_lo_8 = svwhilelt_b8_s32(0, 8); + svbool_t vmins_mask1= svzip1_b32(pg_lo_8, pg_false); + svbool_t vmins_mask2 = svzip1_b32(pg_false, pg_lo_8); + svbool_t pg128_all = svptrue_pat_b8(SV_VL16); + for (int i = 0; i < nb; ++i) { + svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)); + svfloat32_t vx_d = svzip1_f32(svdup_n_f32(LM_GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(LM_GGML_FP16_TO_FP32(vx1[i].d))); + svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d); + svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(LM_GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(LM_GGML_FP16_TO_FP32(vx1[i].dmin))); + svfloat32_t vy_dmins = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)); + svfloat32_t svdmins = svmul_n_f32_x(pg128_all, svmul_f32_x(pg128_all, vy_dmins, vx_dmins), -1); + const uint8_t * LM_GGML_RESTRICT q4_0 = vx0[i].qs; + const int8_t * LM_GGML_RESTRICT q8_0 = vy0[i].qs; + const uint8_t * LM_GGML_RESTRICT q4_1 = vx1[i].qs; + const int8_t * LM_GGML_RESTRICT q8_1 = vy1[i].qs; + svint16_t lo = svld1_s16(pg128_all, vy0[i].bsums + 0); + svint16_t hi = svld1_s16(pg128_all, vy0[i].bsums + 8); + svint16_t sum_tmp1 = svuzp1_s16(lo, hi); + svint16_t sum_tmp2 = svuzp2_s16(lo, hi); + svint16_t svq8sums_0 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2); + lo = svld1_s16(pg128_all, vy1[i].bsums + 0); + hi = svld1_s16(pg128_all, vy1[i].bsums + 8); + sum_tmp1 = svuzp1(lo, hi); + sum_tmp2 = svuzp2(lo, hi); + svint16_t svq8sums_1 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2); + svuint32_t decoded_scales0 = lm_ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales); + svuint32_t decoded_scales1 = lm_ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales); + svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1); + svst2_u32(pg128_all, new_utmp.u32, decoded_scales); + svint16_t svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp1_u32(svld1_u32(vmins_mask1, new_utmp.u32+4), svdup_n_u32(0))))); + svint16_t svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp2_u32(svld1_u32(vmins_mask2, new_utmp.u32+4), svdup_n_u32(0))))); + svint32_t svsumfs_tmp1 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_0)); + svint32_t svsumfs_tmp2 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_1)); + svint32_t svsumfs_tmp3 = svtrn1_s32(svsumfs_tmp1, svsumfs_tmp2); + svint32_t svsumfs_tmp4 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_0)); + svint32_t svsumfs_tmp5 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_1)); + svint32_t svsumfs_tmp6 = svtrn1_s32(svsumfs_tmp4, svsumfs_tmp5); + svint32_t svsumfs_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6))); + svint32_t svsumfs_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6))); + svint32_t svsumfs_tmp = svadd_s32_x(pg128_all, svsumfs_tmp7, svsumfs_tmp8); + svint32_t svscales, sumi1, sumi2; + svint32_t acc_sumif1 = svdup_n_s32(0); + svint32_t acc_sumif2 = svdup_n_s32(0); + svint8_t q4bytes_0_l, q4bytes_0_h, q4bytes_1_l, q4bytes_1_h, l0, l1, l2, l3, + q8bytes_0_h, q8bytes_0_l, q8bytes_1_h, q8bytes_1_l, r0, r1, r2, r3; +#pragma GCC unroll 1 + for (int j = 0; j < QK_K/64; ++j) { + q4bytes_0_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 0xf)); + q4bytes_1_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 0xf)); + q4bytes_0_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 0xf)); + q4bytes_1_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 0xf)); + l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l))); + l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l))); + l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h))); + l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h))); + q8bytes_0_h = svld1_s8(pg128_all, q8_0); + q8bytes_1_h = svld1_s8(pg128_all, q8_1); + q8bytes_0_l = svld1_s8(pg128_all, q8_0+16); + q8bytes_1_l = svld1_s8(pg128_all, q8_1+16); + r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h))); + r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h))); + r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l))); + r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l))); + sumi1 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3); + svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24)); + acc_sumif1 = svmla_s32_x(pg128_all, acc_sumif1, svscales, sumi1); + + q4bytes_0_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 4)); + q4bytes_1_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 4)); + q4bytes_0_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 4)); + q4bytes_1_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 4)); + l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l))); + l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l))); + l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h))); + l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h))); + q8bytes_0_h = svld1_s8(pg128_all, q8_0+32); + q8bytes_1_h = svld1_s8(pg128_all, q8_1+32); + q8bytes_0_l = svld1_s8(pg128_all, q8_0+48); + q8bytes_1_l = svld1_s8(pg128_all, q8_1+48); + r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h))); + r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h))); + r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l))); + r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l))); + sumi2 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3); + svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24)); + acc_sumif2 = svmla_s32_x(pg128_all, acc_sumif2, svscales, sumi2); + q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64; + } + sumf1 = svmla_f32_x(pg128_all, + svmla_f32_x(pg128_all, + sumf1, + svcvt_f32_x(pg128_all, + svadd_s32_x(pg128_all, acc_sumif1, acc_sumif2)), + svsuper_block_scales), + svdmins, + svcvt_f32_s32_x(pg128_all, svsumfs_tmp)); + } //end of for nb + } // end of case 128 + break; + case 256: + case 512: + { + const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4); + const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16); + const svbool_t pg256_all = svptrue_pat_b8(SV_ALL); + for (int i = 0; i < nb; ++i) { + const uint8_t * LM_GGML_RESTRICT q4_0 = vx0[i].qs; + const int8_t * LM_GGML_RESTRICT q8_0 = vy0[i].qs; + const uint8_t * LM_GGML_RESTRICT q4_1 = vx1[i].qs; + const int8_t * LM_GGML_RESTRICT q8_1 = vy1[i].qs; + svint32_t svscales, sumi1, sumi2; + svint32_t acc_sumif1 = svdup_n_s32(0); + svint32_t acc_sumif2 = svdup_n_s32(0); + svint8_t l0, l1, l2, l3, r0, r1, r2, r3; + svfloat32_t vx_d = svzip1_f32(svdup_n_f32(LM_GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(LM_GGML_FP16_TO_FP32(vx1[i].d))); + svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d))); + svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp)); + svfloat32_t svsuper_block_scales = svmul_f32_z(pg32_4, vy_d, vx_d); + svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(LM_GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(LM_GGML_FP16_TO_FP32(vx1[i].dmin))); + svfloat64_t vy_dmins_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d))); + svfloat32_t vy_dmins = svreinterpret_f32_f64(svuzp1_f64(vy_dmins_tmp, vy_dmins_tmp)); + svfloat32_t svdmins = svmul_n_f32_x(pg32_4, svmul_f32_x(pg32_4, vx_dmins, vy_dmins), -1); + svint16_t rc1 = svuzp1_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums)); + svint16_t rc2 = svuzp2_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums)); + svint16_t svq8sums = svadd_s16_x(pg256_all, rc1, rc2); + svuint32_t decoded_scales0 = lm_ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales); + svuint32_t decoded_scales1 = lm_ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales); + svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1); + svst2_u32(pg8_16, new_utmp.u32, decoded_scales); + svint16_t new_svq8sums_0 = svreinterpret_s16_u64(svtrn1_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums))); + svint16_t new_svq8sums_1 = svreinterpret_s16_u64(svtrn2_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums))); + svuint64_t new_mins_0 = svdup_u64(new_utmp.u64[2]); + svuint64_t new_mins_1 = svdup_u64(new_utmp.u64[3]); + svint16_t new_svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_0))); + svint16_t new_svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_1))); + svint64_t dot_prod_0 = svdot_s64(svdup_s64(0), new_svmins8_0, new_svq8sums_0); + svint64_t dot_prod_1 = svdot_s64(dot_prod_0, new_svmins8_1, new_svq8sums_1); + svfloat32_t converted_dot_prod_1 = svcvt_f32_s64_x(pg256_all, dot_prod_1); + svfloat32_t svsumfs_tmp = svuzp1_f32(converted_dot_prod_1, converted_dot_prod_1); + +#pragma GCC unroll 1 + for (int j = 0; j < QK_K/64; ++j) { + svuint8_t q4bytes_0 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 0xf); + svuint8_t q4bytes_1 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 0xf); + svuint8_t q4bytes_2 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 4); + svuint8_t q4bytes_3 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 4); + l0 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1))); + l1 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1))); + l2 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3))); + l3 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3))); + svint8_t q8bytes_0 = svld1_s8(pg256_all, q8_0); + svint8_t q8bytes_1 = svld1_s8(pg256_all, q8_1); + svint8_t q8bytes_2 = svld1_s8(pg256_all, q8_0+32); + svint8_t q8bytes_3 = svld1_s8(pg256_all, q8_1+32); + r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1))); + r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1))); + r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3))); + r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3))); + sumi1 = svmmla(svmmla(svdup_n_s32(0), r0, l0), r1, l1); + svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24)); + acc_sumif1 = svmla_s32_x(pg256_all, acc_sumif1, svscales, sumi1); + sumi2 = svmmla(svmmla(svdup_n_s32(0), r2, l2), r3, l3); + svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24)); + acc_sumif2 = svmla_s32_x(pg256_all, acc_sumif2, svscales, sumi2); + q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64; + } + svint32_t acc_sumif = svadd_s32_x(pg256_all, acc_sumif1, acc_sumif2); + svint32_t swap_acc_sumif = svext_s32(acc_sumif, acc_sumif, 4); + acc_sumif = svadd_s32_x(pg32_4, acc_sumif, swap_acc_sumif); + sumf1 = svmla_f32_x(pg32_4, + svmla_f32_x(pg32_4, + sumf1, + svcvt_f32_x(pg32_4, acc_sumif), + svsuper_block_scales), + svdmins, + svsumfs_tmp); + } // end of for nb + } // end of case 256-512 + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + svst1_f32(pg32_2, s, sumf1); + svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sumf1), svdup_n_u8(0), 8))); + + return; + } +#elif defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { const block_q4_K * LM_GGML_RESTRICT x0 = x; const block_q4_K * LM_GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx); @@ -2235,7 +2467,6 @@ void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con const uint8_t * LM_GGML_RESTRICT q4 = x[i].qs; const int8_t * LM_GGML_RESTRICT q8 = y[i].qs; - const int vector_length = lm_ggml_cpu_get_sve_cnt()*8; const svuint8_t m4b = svdup_n_u8(0xf); const svint32_t mzero = svdup_n_s32(0); svint32_t sumi1 = svdup_n_s32(0); @@ -2480,7 +2711,201 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con const int nb = n / QK_K; -#if defined(__ARM_FEATURE_MATMUL_INT8) +#ifdef __ARM_FEATURE_SVE + const int vector_length = lm_ggml_cpu_get_sve_cnt()*8; +#endif +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const svbool_t pg32_2 = svptrue_pat_b32(SV_VL2); + + svfloat32_t sum = svdup_n_f32(0); + + const block_q6_K * LM_GGML_RESTRICT vx0 = vx; + const block_q8_K * LM_GGML_RESTRICT vy0 = vy; + const block_q6_K * LM_GGML_RESTRICT vx1 = (const block_q6_K *) ((const uint8_t*)vx + bx); + const block_q8_K * LM_GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by); + + switch (vector_length) { + case 128: + { + const svbool_t pg128_all = svptrue_pat_b8(SV_ALL); + for (int i = 0; i < nb; ++i) { + const uint8_t * LM_GGML_RESTRICT ql0 = vx0[i].ql; + const uint8_t * LM_GGML_RESTRICT qh0 = vx0[i].qh; + const uint8_t * LM_GGML_RESTRICT ql1 = vx1[i].ql; + const uint8_t * LM_GGML_RESTRICT qh1 = vx1[i].qh; + const int8_t * LM_GGML_RESTRICT q80 = vy0[i].qs; + const int8_t * LM_GGML_RESTRICT q81 = vy1[i].qs; + + const int8_t * LM_GGML_RESTRICT scale0 = vx0[i].scales; + const int8_t * LM_GGML_RESTRICT scale1 = vx1[i].scales; + + svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)); + svfloat32_t vx_d = svzip1_f32(svdup_n_f32(LM_GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(LM_GGML_FP16_TO_FP32(vx1[i].d))); + svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d); + // process q8sum summation 128 bit route + const svint16_t q8sums_01 = svld1_s16(pg128_all, vy0[i].bsums); + const svint16_t q8sums_02 = svld1_s16(pg128_all, vy0[i].bsums + 8); + const svint16_t q8sums_11 = svld1_s16(pg128_all, vy1[i].bsums); + const svint16_t q8sums_12 = svld1_s16(pg128_all, vy1[i].bsums + 8); + const svint64x2_t q6scales_0_tmp = svld2_s64(pg128_all, (const int64_t *)scale0); + const svint16_t q6scales_01 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 0))); + const svint16_t q6scales_02 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 1))); + const svint64x2_t q6scales_1_tmp = svld2_s64(pg128_all, (const int64_t *)scale1); + const svint16_t q6scales_11 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 0))); + const svint16_t q6scales_12 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 1))); + const svint64_t prod = svdup_n_s64(0); + + svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_01), q8sums_02, q6scales_02)); + svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_11), q8sums_02, q6scales_12)); + svint32_t isum_tmp3 = svtrn1_s32(isum_tmp1, isum_tmp2); + svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_01), q8sums_12, q6scales_02)); + svint32_t isum_tmp5 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_11), q8sums_12, q6scales_12)); + svint32_t isum_tmp6 = svtrn1_s32(isum_tmp4, isum_tmp5); + svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6))); + svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6))); + svint32_t svisum_mins = svadd_s32_x(pg128_all, isum_tmp7, isum_tmp8); + + // process mmla + svint8_t l0, l1, r0, r1; + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; ++j) { + for (int k = 0; k < 8; ++k) { + svuint8_t qhbits_0 = svld1_u8(pg128_all, qh0+16*(k%2)); + svuint8_t qhbits_1 = svld1_u8(pg128_all, qh1+16*(k%2)); + svuint8_t q6bits_0 = svld1_u8(pg128_all, ql0+16*(k%4)); + svuint8_t q6bits_1 = svld1_u8(pg128_all, ql1+16*(k%4)); + const int ql_pos = (k/4)*4; + svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_0, 4); + svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_1, 4); + const int qh_pos = (k/2)*2; + svuint8_t q6bytes_0_hi = svand_n_u8_x(pg128_all, qhbits_0, 0x3 << qh_pos); + svuint8_t q6bytes_1_hi = svand_n_u8_x(pg128_all, qhbits_1, 0x3 << qh_pos); + svint8_t q6bytes_0, q6bytes_1; + if (qh_pos <= 4) { + q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos))); + q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos))); + } else { + q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_0_lo, svlsr_n_u8_x(pg128_all, q6bytes_0_hi, (qh_pos - 4)))); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_1_lo, svlsr_n_u8_x(pg128_all, q6bytes_1_hi, (qh_pos - 4)))); + } + svint8_t q8bytes_0 = svld1_s8(pg128_all, q80+16*(k%8)); + svint8_t q8bytes_1 = svld1_s8(pg128_all, q81+16*(k%8)); + l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1))); + l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1))); + r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1))); + r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1))); + svint32_t svscale = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k])); + isum_tmp = svmla_s32_x(pg128_all, isum_tmp, svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), svscale); + } + qh0 += 32; qh1 += 32; + ql0 += 64; ql1 += 64; + q80 += 128; q81 += 128; + scale0 += 8; scale1 += 8; + } + sum = svmla_f32_x(pg128_all, sum, + svcvt_f32_x(pg128_all, svmla_s32_x(pg128_all, isum_tmp, + svisum_mins, svdup_n_s32(-32))), + svsuper_block_scales); + } + } // end of case 128 + break; + case 256: + case 512: + { + const svbool_t pg256_all = svptrue_pat_b8(SV_ALL); + const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4); + for (int i = 0; i < nb; ++i) { + const uint8_t * LM_GGML_RESTRICT ql0 = vx0[i].ql; + const uint8_t * LM_GGML_RESTRICT qh0 = vx0[i].qh; + const uint8_t * LM_GGML_RESTRICT ql1 = vx1[i].ql; + const uint8_t * LM_GGML_RESTRICT qh1 = vx1[i].qh; + const int8_t * LM_GGML_RESTRICT q80 = vy0[i].qs; + const int8_t * LM_GGML_RESTRICT q81 = vy1[i].qs; + + const int8_t * LM_GGML_RESTRICT scale0 = vx0[i].scales; + const int8_t * LM_GGML_RESTRICT scale1 = vx1[i].scales; + svfloat32_t vx_d = svzip1_f32(svdup_n_f32(LM_GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(LM_GGML_FP16_TO_FP32(vx1[i].d))); + svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d))); + svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp)); + svfloat32_t svsuper_block_scales = svmul_f32_x(pg32_4, vy_d, vx_d); + // process q8sum summation 256 bit route + const svint16_t q8sums_0 = svld1_s16(pg256_all, vy0[i].bsums); + const svint16_t q8sums_1 = svld1_s16(pg256_all, vy1[i].bsums); + const svint16_t q6scales_0 = svunpklo_s16(svld1_s8(pg256_all, scale0)); + const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(pg256_all, scale1)); + const svint64_t prod = svdup_n_s64(0); + svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_0)); + svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_1)); + svint32_t isum_tmp3 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_0)); + svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_1)); + svint32_t isum_tmp5 = svtrn1_s32(isum_tmp1, isum_tmp2); + svint32_t isum_tmp6 = svtrn1_s32(isum_tmp3, isum_tmp4); + svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6))); + svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6))); + svint32_t isum_tmp9 = svadd_s32_x(pg256_all, isum_tmp7, isum_tmp8); + svint32_t isum_tmp10 = svreinterpret_s32_u8(svext_u8(svreinterpret_u8_s32(isum_tmp9), svreinterpret_u8_s32(isum_tmp9), 16)); + svint32_t svisum_mins = svadd_s32_z(pg32_4, isum_tmp9, isum_tmp10); + + // process mmla + svint8_t l0, l1, r0, r1; + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; ++j) { + for (int k = 0; k < 8; k+=2) { // process 2 block + svuint8_t qhbits_0 = svld1_u8(pg256_all, qh0); + svuint8_t qhbits_1 = svld1_u8(pg256_all, qh1); + svuint8_t q6bits_0 = svld1_u8(pg256_all, ql0+32*((k%4)/2)); + svuint8_t q6bits_1 = svld1_u8(pg256_all, ql1+32*((k%4)/2)); + const int ql_pos = (k/4)*4; + svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_0, 4); + svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_1, 4); + const int qh_pos = (k/2)*2; + svuint8_t q6bytes_0_hi = svand_n_u8_x(pg256_all, qhbits_0, 0x3 << qh_pos); + svuint8_t q6bytes_1_hi = svand_n_u8_x(pg256_all, qhbits_1, 0x3 << qh_pos); + svint8_t q6bytes_0, q6bytes_1; + if (qh_pos <= 4) { + q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos))); + q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos))); + } else { + q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_0_lo, svlsr_n_u8_x(pg256_all, q6bytes_0_hi, (qh_pos - 4)))); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_1_lo, svlsr_n_u8_x(pg256_all, q6bytes_1_hi, (qh_pos - 4)))); + } + svint8_t q8bytes_0 = svld1_s8(pg256_all, q80+32*(k/2)); + svint8_t q8bytes_1 = svld1_s8(pg256_all, q81+32*(k/2)); + l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1))); + l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1))); + r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1))); + r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1))); + svint32_t svscale0 = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k])); + svint32_t svscale1 = svzip1_s32(svdup_n_s32(scale0[k+1]), svdup_n_s32(scale1[k+1])); + isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r0, l0), svscale0); + isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r1, l1), svscale1); + } + qh0 += 32; qh1 += 32; + ql0 += 64; ql1 += 64; + q80 += 128; q81 += 128; + scale0 += 8; scale1 += 8; + } // end of for + svint32_t swap_isum_tmp = svext_s32(isum_tmp, isum_tmp, 4); + isum_tmp = svadd_s32_x(pg32_4, isum_tmp, swap_isum_tmp); + sum = svmla_f32_x(pg32_4, sum, + svcvt_f32_x(pg32_4, svmla_s32_x(pg32_4, isum_tmp, + svisum_mins, svdup_n_s32(-32))), + svsuper_block_scales); + } + } // end of case 256 + break; + default: + assert(false && "Unsupported vector length"); + break; + } // end of switch + + svst1_f32(pg32_2, s, sum); + svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sum), svdup_n_u8(0), 8))); + + return; + } +#elif defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { const block_q6_K * LM_GGML_RESTRICT x0 = x; const block_q6_K * LM_GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx); @@ -2594,27 +3019,6 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con // adjust bias, apply superblock scale { int32_t bias[4]; -#ifdef __ARM_FEATURE_SVE - const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8); - const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8); - const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums); - const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8); - const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums); - const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8); - const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales)); - const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8)); - const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales)); - const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8)); - const svint64_t zero = svdup_n_s64(0); - bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0), - svdot_s64(zero, y0_q8sums_1, x0_q6scales_1))); - bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0), - svdot_s64(zero, y1_q8sums_1, x0_q6scales_1))); - bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0), - svdot_s64(zero, y0_q8sums_1, x1_q6scales_1))); - bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0), - svdot_s64(zero, y1_q8sums_1, x1_q6scales_1))); -#else // NEON doesn't support int16 dot product, fallback to separated mul and add const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums); const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums); @@ -2646,7 +3050,6 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1])))); bias[3] = vaddvq_s32(prod); -#endif const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32); const float32x4_t superblock_scale = { @@ -2672,7 +3075,6 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con #endif #ifdef __ARM_FEATURE_SVE - const int vector_length = lm_ggml_cpu_get_sve_cnt()*8; float sum = 0; svuint8_t m4b = svdup_n_u8(0xf); svint32_t vzero = svdup_n_s32(0); diff --git a/cpp/ggml-cpu/ggml-cpu.c b/cpp/ggml-cpu/ggml-cpu.c index a2f1b58e..754877d3 100644 --- a/cpp/ggml-cpu/ggml-cpu.c +++ b/cpp/ggml-cpu/ggml-cpu.c @@ -1807,22 +1807,6 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_cont(params, tensor); } break; - case LM_GGML_OP_RESHAPE: - { - lm_ggml_compute_forward_reshape(params, tensor); - } break; - case LM_GGML_OP_VIEW: - { - lm_ggml_compute_forward_view(params, tensor); - } break; - case LM_GGML_OP_PERMUTE: - { - lm_ggml_compute_forward_permute(params, tensor); - } break; - case LM_GGML_OP_TRANSPOSE: - { - lm_ggml_compute_forward_transpose(params, tensor); - } break; case LM_GGML_OP_GET_ROWS: { lm_ggml_compute_forward_get_rows(params, tensor); @@ -2042,6 +2026,22 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { // nop } break; + case LM_GGML_OP_RESHAPE: + { + // nop + } break; + case LM_GGML_OP_PERMUTE: + { + // nop + } break; + case LM_GGML_OP_VIEW: + { + // nop + } break; + case LM_GGML_OP_TRANSPOSE: + { + // nop + } break; case LM_GGML_OP_COUNT: { LM_GGML_ABORT("fatal error"); @@ -2884,6 +2884,11 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct lm_ggml_tensor * node = cgraph->nodes[node_n]; + if (lm_ggml_op_is_empty(node->op)) { + // skip NOPs + continue; + } + lm_ggml_compute_forward(¶ms, node); if (state->ith == 0 && cplan->abort_callback && diff --git a/cpp/ggml-cpu/ops.cpp b/cpp/ggml-cpu/ops.cpp index 13c01a45..0080090b 100644 --- a/cpp/ggml-cpu/ops.cpp +++ b/cpp/ggml-cpu/ops.cpp @@ -4455,46 +4455,6 @@ void lm_ggml_compute_forward_cont( lm_ggml_compute_forward_dup(params, dst); } -// lm_ggml_compute_forward_reshape - -void lm_ggml_compute_forward_reshape( - const lm_ggml_compute_params * params, - lm_ggml_tensor * dst) { - // NOP - LM_GGML_UNUSED(params); - LM_GGML_UNUSED(dst); -} - -// lm_ggml_compute_forward_view - -void lm_ggml_compute_forward_view( - const lm_ggml_compute_params * params, - lm_ggml_tensor * dst) { - // NOP - LM_GGML_UNUSED(params); - LM_GGML_UNUSED(dst); -} - -// lm_ggml_compute_forward_permute - -void lm_ggml_compute_forward_permute( - const lm_ggml_compute_params * params, - lm_ggml_tensor * dst) { - // NOP - LM_GGML_UNUSED(params); - LM_GGML_UNUSED(dst); -} - -// lm_ggml_compute_forward_transpose - -void lm_ggml_compute_forward_transpose( - const lm_ggml_compute_params * params, - lm_ggml_tensor * dst) { - // NOP - LM_GGML_UNUSED(params); - LM_GGML_UNUSED(dst); -} - // lm_ggml_compute_forward_get_rows static void lm_ggml_compute_forward_get_rows_q( diff --git a/cpp/ggml-cpu/ops.h b/cpp/ggml-cpu/ops.h index 3a62a1a7..55751c0b 100644 --- a/cpp/ggml-cpu/ops.h +++ b/cpp/ggml-cpu/ops.h @@ -51,10 +51,6 @@ void lm_ggml_compute_forward_scale(const struct lm_ggml_compute_params * params, void lm_ggml_compute_forward_set(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); void lm_ggml_compute_forward_cpy(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); void lm_ggml_compute_forward_cont(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); -void lm_ggml_compute_forward_reshape(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); -void lm_ggml_compute_forward_view(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); -void lm_ggml_compute_forward_permute(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); -void lm_ggml_compute_forward_transpose(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); void lm_ggml_compute_forward_get_rows(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); void lm_ggml_compute_forward_get_rows_back(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); void lm_ggml_compute_forward_set_rows(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst); diff --git a/cpp/ggml-metal/ggml-metal-device.m b/cpp/ggml-metal/ggml-metal-device.m index 29baaf47..504058d4 100644 --- a/cpp/ggml-metal/ggml-metal-device.m +++ b/cpp/ggml-metal/ggml-metal-device.m @@ -572,8 +572,10 @@ lm_ggml_metal_device_t lm_ggml_metal_device_init(void) { // TODO: try to update the tensor API kernels to at least match the simdgroup performance if (getenv("LM_GGML_METAL_TENSOR_ENABLE") == NULL && ![[dev->mtl_device name] containsString:@"M5"] && - ![[dev->mtl_device name] containsString:@"M6"]) { - LM_GGML_LOG_WARN("%s: tensor API disabled for pre-M5 device\n", __func__); + ![[dev->mtl_device name] containsString:@"M6"] && + ![[dev->mtl_device name] containsString:@"A19"] && + ![[dev->mtl_device name] containsString:@"A20"]) { + LM_GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__); dev->props.has_tensor = false; } diff --git a/cpp/ggml-metal/ggml-metal-ops.cpp b/cpp/ggml-metal/ggml-metal-ops.cpp index fdffdf56..e1bed93b 100644 --- a/cpp/ggml-metal/ggml-metal-ops.cpp +++ b/cpp/ggml-metal/ggml-metal-ops.cpp @@ -1036,6 +1036,11 @@ int lm_ggml_metal_op_set_rows(lm_ggml_metal_op_t ctx, int idx) { nth = std::min(nth, nk0); + if (nth*nrptg > lm_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + nth = lm_ggml_metal_pipeline_max_theads_per_threadgroup(pipeline); + nrptg = 1; + } + lm_ggml_metal_kargs_set_rows args = { /*.nk0 =*/ nk0, /*.ne01 =*/ ne01, diff --git a/cpp/ggml-opencl/ggml-opencl.cpp b/cpp/ggml-opencl/ggml-opencl.cpp index 79944700..593173bc 100644 --- a/cpp/ggml-opencl/ggml-opencl.cpp +++ b/cpp/ggml-opencl/ggml-opencl.cpp @@ -2944,8 +2944,11 @@ static bool lm_ggml_opencl_supports_op(lm_ggml_backend_dev_t dev, const struct l return op->src[0]->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32; // Assuming F32 for now, can be expanded case LM_GGML_OP_PAD: return op->src[0]->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32; - case LM_GGML_OP_UPSCALE: - return op->src[0]->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32; + case LM_GGML_OP_UPSCALE: { + lm_ggml_scale_mode mode = (lm_ggml_scale_mode)(lm_ggml_get_op_params_i32(op, 0) & 0xFF); + return op->src[0]->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32 && + (mode == LM_GGML_SCALE_MODE_NEAREST || mode == LM_GGML_SCALE_MODE_BILINEAR); + } case LM_GGML_OP_CONV_2D: return (op->src[0]->type == LM_GGML_TYPE_F16 && op->src[1]->type == LM_GGML_TYPE_F16 && op->type == LM_GGML_TYPE_F16) || (op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32) || diff --git a/cpp/llama-memory-recurrent.cpp b/cpp/llama-memory-recurrent.cpp index 73dd765d..09d31cec 100644 --- a/cpp/llama-memory-recurrent.cpp +++ b/cpp/llama-memory-recurrent.cpp @@ -151,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1 = std::numeric_limits::max(); } - // models like Mamba or RWKV can't have a state partially erased + // models like Mamba or RWKV can't have a state partially erased at the end + // of the sequence because their state isn't preserved for previous tokens if (seq_id >= (int64_t) size) { // could be fatal return false; @@ -160,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos int32_t & tail_id = cells[seq_id].tail; if (tail_id >= 0) { const auto & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) { + // partial intersection is invalid if it includes the final pos + if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) { //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n"); return false; } diff --git a/cpp/models/ernie4-5.cpp b/cpp/models/ernie4-5.cpp index 28c25e06..15756710 100644 --- a/cpp/models/ernie4-5.cpp +++ b/cpp/models/ernie4-5.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -19,6 +17,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap auto * inp_attn = build_attn_inp_kv(); + lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + for (int il = 0; il < n_layer; ++il) { lm_ggml_tensor * inpSA = inpL; @@ -67,9 +67,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap } if (il == n_layer - 1) { // skip computing output for unused tokens - lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); + cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); } lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); diff --git a/cpp/models/openai-moe-iswa.cpp b/cpp/models/openai-moe-iswa.cpp index b9ffe15a..9e8beb11 100644 --- a/cpp/models/openai-moe-iswa.cpp +++ b/cpp/models/openai-moe-iswa.cpp @@ -11,6 +11,8 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, auto * inp_attn = build_attn_inp_kv_iswa(); + lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); + for (int il = 0; il < n_layer; ++il) { lm_ggml_tensor * inpSA = inpL; @@ -69,7 +71,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, } if (il == n_layer - 1) { // skip computing output for unused tokens - lm_ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/cpp/tools/mtmd/clip.cpp b/cpp/tools/mtmd/clip.cpp index cebb570e..dffa05eb 100644 --- a/cpp/tools/mtmd/clip.cpp +++ b/cpp/tools/mtmd/clip.cpp @@ -160,13 +160,13 @@ enum patch_merge_type { }; struct clip_hparams { - int32_t image_size; - int32_t patch_size; - int32_t n_embd; - int32_t n_ff; - int32_t projection_dim; - int32_t n_head; - int32_t n_layer; + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; // idefics3 int32_t image_longest_edge = 0; int32_t image_min_pixels = -1; @@ -2683,6 +2683,9 @@ struct clip_model_loader { } } else if (is_audio) { get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); + // some hparams are unused, but still need to set to avoid issues + hparams.image_size = 0; + hparams.patch_size = 1; } else { LM_GGML_ASSERT(false && "unknown modality"); diff --git a/src/version.ts b/src/version.ts index 8205c7f9..39e9f440 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ -export const BUILD_NUMBER = '7003'; -export const BUILD_COMMIT = 'b8595b1'; +export const BUILD_NUMBER = '7017'; +export const BUILD_COMMIT = '7bef684'; diff --git a/third_party/llama.cpp b/third_party/llama.cpp index b8595b16..7bef6841 160000 --- a/third_party/llama.cpp +++ b/third_party/llama.cpp @@ -1 +1 @@ -Subproject commit b8595b16e69e3029e06be3b8f6635f9812b2bc3f +Subproject commit 7bef684118cc44f9ab8b82df102d68db94a6d9f4 From beaee80db322ed2145f06080742ffb57b821a1ee Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Mon, 10 Nov 2025 17:34:14 +0800 Subject: [PATCH 4/5] fix: has_tensor check --- cpp/ggml-metal/ggml-metal-device.m | 4 ++-- scripts/patches/ggml-metal-device.m.patch | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/ggml-metal/ggml-metal-device.m b/cpp/ggml-metal/ggml-metal-device.m index 504058d4..e4089aec 100644 --- a/cpp/ggml-metal/ggml-metal-device.m +++ b/cpp/ggml-metal/ggml-metal-device.m @@ -183,11 +183,11 @@ lm_ggml_metal_library_t lm_ggml_metal_library_init(lm_ggml_metal_device_t dev) { #if TARGET_OS_SIMULATOR NSString *lib_name = @"ggml-llama-sim"; - if (dev->props.has_tensor) lib_name = @"ggml-llama-sim-metal4"; + if (lm_ggml_metal_device_get_props(dev)->has_tensor) lib_name = @"ggml-llama-sim-metal4"; NSString * path_lib = [bundle pathForResource:@"ggml-llama-sim" ofType:@"metallib"]; #else NSString *lib_name = @"ggml-llama"; - if (dev->props.has_tensor) lib_name = @"ggml-llama-metal4"; + if (lm_ggml_metal_device_get_props(dev)->has_tensor) lib_name = @"ggml-llama-metal4"; NSString * path_lib = [bundle pathForResource:@"ggml-llama" ofType:@"metallib"]; #endif if (path_lib == nil) { diff --git a/scripts/patches/ggml-metal-device.m.patch b/scripts/patches/ggml-metal-device.m.patch index 6c01ea2e..e9865714 100644 --- a/scripts/patches/ggml-metal-device.m.patch +++ b/scripts/patches/ggml-metal-device.m.patch @@ -7,11 +7,11 @@ - NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; +#if TARGET_OS_SIMULATOR + NSString *lib_name = @"ggml-llama-sim"; -+ if (dev->props.has_tensor) lib_name = @"ggml-llama-sim-metal4"; ++ if (lm_ggml_metal_device_get_props(dev)->has_tensor) lib_name = @"ggml-llama-sim-metal4"; + NSString * path_lib = [bundle pathForResource:@"ggml-llama-sim" ofType:@"metallib"]; +#else + NSString *lib_name = @"ggml-llama"; -+ if (dev->props.has_tensor) lib_name = @"ggml-llama-metal4"; ++ if (lm_ggml_metal_device_get_props(dev)->has_tensor) lib_name = @"ggml-llama-metal4"; + NSString * path_lib = [bundle pathForResource:@"ggml-llama" ofType:@"metallib"]; +#endif if (path_lib == nil) { From 341eba6e23632be84df061d995b86f7a9df2c7ce Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Tue, 11 Nov 2025 10:52:28 +0800 Subject: [PATCH 5/5] ci: use macos-26 --- .github/workflows/ci.yml | 4 ++-- .github/workflows/release.yml | 2 +- .github/workflows/sync-llama-cpp.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3403bc57..9824a681 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,7 +50,7 @@ jobs: ./run_tests.sh build-ios-from-source: - runs-on: macos-latest + runs-on: macos-26 steps: - name: Checkout uses: actions/checkout@v4 @@ -82,7 +82,7 @@ jobs: npm run build:ios build-ios-frameworks: - runs-on: macos-latest + runs-on: macos-26 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dbe58e45..fd8b801f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -10,7 +10,7 @@ permissions: jobs: publish: - runs-on: macos-latest + runs-on: macos-26 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/sync-llama-cpp.yml b/.github/workflows/sync-llama-cpp.yml index 1fd183bb..8a192fc2 100644 --- a/.github/workflows/sync-llama-cpp.yml +++ b/.github/workflows/sync-llama-cpp.yml @@ -8,7 +8,7 @@ on: jobs: sync: - runs-on: macos-15 + runs-on: macos-26 steps: - name: Checkout repository