From 9c572d791a48df29b45303dddb69041cb934a6c7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 May 2026 11:00:19 +0300 Subject: [PATCH 1/5] metal : fix GGML_OP_SET kernel threads --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 8506000b6c0..ac84c92dcbb 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -1786,7 +1786,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nk0 = ne10/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne11, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1797,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } From 8e65c8858d81f7074e0c6c131719306dfe36b35c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 May 2026 13:43:56 +0300 Subject: [PATCH 2/5] tests : extend test_cpy to support different src/dst shapes Extend test_cpy to support different source and destination tensor shapes for CPY operations (reshaping), where the total number of elements must match. - Renamed ne -> ne_src, added ne_dst parameter (default: use src shape) - Added 50 new reshaping test cases covering 1D<->2D<->3D<->4D conversions - Tests exercise 1024 boundary, small shapes, and large dimensionality changes - Fixed dangling reference bug (storing & to temporary std::array) - Updated all existing test calls with permute/transpose args for compatibility Assisted-by: llama.cpp:local pi --- tests/test-backend-ops.cpp | 294 ++++++++++++++++++++++++++++++++----- 1 file changed, 256 insertions(+), 38 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f54ab41c195..b4a0025f49c 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2866,15 +2866,24 @@ struct test_set : public test_case { struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; - const std::array ne; + const std::array ne_src; + const std::array ne_dst; const std::array permute_src; const std::array permute_dst; bool _src_use_permute; bool _dst_use_permute; bool _src_transpose; + bool _use_dst_shape; std::string vars() override { - return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose); + if (_use_dst_shape) { + return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose); + } + return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose); + } + + int64_t total_elements() const { + return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3]; } double max_nmse_err() override { @@ -2899,7 +2908,7 @@ struct test_cpy : public test_case { err_estimate /= 8.0f; } err_estimate *= err_estimate; - err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]); + err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements()); return err_estimate; } return 1e-6; @@ -2910,17 +2919,19 @@ struct test_cpy : public test_case { } test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}, + std::array ne_src = {10, 10, 10, 1}, + std::array ne_dst = {-1, -1, -1, -1}, std::array permute_src = {0, 0, 0, 0}, std::array permute_dst = {0, 0, 0, 0}, bool transpose_src = false) - : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), + : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst), _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), - _src_transpose(transpose_src){} + _src_transpose(transpose_src), + _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data()); ggml_set_param(src); ggml_set_name(src, "src"); @@ -2934,7 +2945,8 @@ struct test_cpy : public test_case { ggml_set_name(src, "src_transposed"); } - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + std::array dst_ne = _use_dst_shape ? ne_dst : std::array{src->ne[0], src->ne[1], src->ne[2], src->ne[3]}; + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); ggml_set_name(dst, "dst"); if (_dst_use_permute) { @@ -8040,42 +8052,173 @@ static std::vector> make_test_cases_eval() { for (int k = 1; k < 4; ++k) { test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3})); } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : all_types) { for (ggml_type type_dst : {GGML_TYPE_F32}) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous } } test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + + // CPY - test the new Metal kernel logic (nth = min(nk0*ne01, 256), row packing nrptg, early return i01>=ne01) + // Small ne0 (nk0=1) with various ne01 values (tests nth = min(nk0*ne01, 256)) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 2, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 4, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 8, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 16, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 128, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 256, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 512, 1, 1})); + // Small ne0 with large ne01 (tests nrptg row packing) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 128, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 256, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 128, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 16, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 8, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 64, 1, 1})); + // ne0*ne01 near 256 boundary (tests nth = min(nk0*ne01, 256)) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 255, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 257, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 128, 1, 1})); // nk0*ne01 = 256 + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 129, 1, 1})); // nk0*ne01 = 258 + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 1, 1})); // nk0*ne01 = 256 + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 65, 1, 1})); // nk0*ne01 = 260 + // Large ne0 with small ne01 (tests nrptg logic) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4096, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 2, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 4, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 8, 1, 1})); + // Multi-dimensional with row packing + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 2, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 4, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 16, 4, 2})); + // Permute src with various ne01 (tests the i01 calculation with tpitg.y) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 256, 3, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 128, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 64, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 128, 4, 3}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + // Permute dst with various ne01 + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 2, 1, 3})); + // Both src and dst permuted + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 2, 3}, {-1,-1,-1,-1}, {0, 2, 1, 3}, {0, 2, 1, 3})); + + // CPY - different src/dst shapes (reshaping via CPY) + // 1D -> 2D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {8, 8, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 1, 1}, {16, 16, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {32, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {1024, 1, 1, 1})); // same shape + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {64, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4096, 1, 1, 1}, {128, 32, 1, 1})); + // 1D -> 3D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {4, 4, 4, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {8, 8, 16, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {16, 8, 16, 1})); + // 1D -> 4D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {4, 2, 4, 2})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {8, 4, 8, 4})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {16, 8, 4, 4})); + // 2D -> 1D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 1, 1}, {64, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 16, 1, 1}, {256, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {1024, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 32, 1, 1}, {2048, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 32, 1, 1}, {4096, 1, 1, 1})); + // 2D -> 2D (different shapes) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {1, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {1, 1024, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {32, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {256, 4, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 2, 1, 1}, {512, 4, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {512, 4, 1, 1}, {1024, 2, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 8, 1, 1}, {256, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 64, 1, 1}, {2048, 8, 1, 1})); + // 2D -> 3D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {4, 4, 4, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {16, 8, 8, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {16, 4, 16, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {8, 8, 16, 1})); + // 3D -> 1D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 4, 4, 1}, {64, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 8, 1}, {1024, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 16, 1}, {1024, 1, 1, 1})); + // 3D -> 2D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 4, 4, 1}, {16, 4, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 16, 1}, {128, 8, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 8, 1}, {128, 8, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 16, 1}, {128, 16, 1, 1})); + // 3D -> 3D (different shapes) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 4, 4, 1}, {8, 2, 4, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 16, 1}, {16, 8, 8, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 16, 1}, {32, 8, 8, 1})); + // 4D -> 1D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {64, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {1024, 1, 1, 1})); + // 4D -> 2D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {32, 2, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {128, 8, 1, 1})); + // 4D -> 3D + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {8, 4, 2, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {16, 8, 8, 1})); + // 4D -> 4D (different shapes) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {2, 4, 4, 2})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {4, 8, 8, 4})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 4, 4}, {8, 16, 4, 4})); + // Various sizes with 1024 boundary + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {32, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {1024, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1023, 1, 1, 1}, {341, 3, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1025, 1, 1, 1}, {205, 5, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {64, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 32, 1, 1}, {2048, 1, 1, 1})); + // Small shapes + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 1, 1, 1}, {1, 1, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 1, 1, 1}, {1, 2, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3, 1, 1, 1}, {1, 3, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {6, 1, 1, 1}, {2, 3, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 3, 1, 1}, {6, 1, 1, 1})); for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (bool use_view_slice : { true, false }) { @@ -8833,6 +8976,56 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular test_cases.emplace_back(new test_pad_ext()); + + // PAD - test the new Metal kernel logic (1024-block processing, k0/i1 split, nth_max=64) + // ne0 exactly at 1024 boundary + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false)); + // ne0 just below 1024 (partial block) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false)); + // ne0 just above 1024 (tests nk0 > 1) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false)); + // ne0 = 2048 (exactly 2 blocks) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false)); + // ne0 = 2049 (3 blocks needed) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false)); + // ne0 very small (tests early break in loop) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {3, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {63, 1, 1, 1}, 1, 0, false)); // nth_max-1 + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {64, 1, 1, 1}, 1, 0, false)); // nth_max + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {65, 1, 1, 1}, 1, 0, false)); // nth_max+1 + // Various ne1 values (tests k0/i1 split in threadgroup dispatch) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 2, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 3, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 7, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 16, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 32, 1, 1}, 1, 0, false)); + // Large ne1 with ne0 > 1024 (tests nk0*ne1 threadgroup count) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 64, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 128, 1, 1}, 1, 0, false)); + // Multi-dimensional (ne2, ne3 > 1) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 2, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 2, 3}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 4, 2}, 1, 1, false)); + // Large padding + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false)); + // ne0 large, pad_0 = 0 (no padding in dim 0) + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 0, 1, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {4096, 4, 1, 1}, 0, 1, false)); + // ne0 large with large pad + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1024, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 512, 1, false)); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_roll()); @@ -9132,22 +9325,47 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + + // CPY - more tests for the new Metal kernel (tpitg ushort3, nrptg row packing) + // Very large ne0*ne01 (tests nrptg with many rows) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 1024, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 512, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 256, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 128, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 64, 1, 1})); + // Large ne0 with moderate ne01 (tests nrptg logic with larger rows) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 128, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {512, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 32, 1, 1})); + // Quantized types with small rows (tests kernel_cpy_f32_q and kernel_cpy_q_f32) + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {4, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {16, 16, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {4, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {16, 16, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q8_0, {4, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q8_0, {8, 32, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q8_0, GGML_TYPE_F32, {4, 64, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q8_0, GGML_TYPE_F32, {8, 32, 1, 1})); + // Quantized with permuted src + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); From dd2bb4756b6df6e4b105aae595c0f9b78fa44e3c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 May 2026 14:26:04 +0300 Subject: [PATCH 3/5] metal : optimize concat kernel with row batching for small widths When ne0 < 256, batch multiple rows into a single threadgroup to improve occupancy. This avoids underutilizing the GPU when processing narrow tensors. - Dispatch nth = min(256, ne0) threads per group - Calculate nrptg (rows per threadgroup) to fill up to 256 threads - Update kernel index calculation to handle the row batching - Add boundary check for i1 >= ne1 Assisted-by: llama.cpp:local pi --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 15 +++++++++++++-- ggml/src/ggml-metal/ggml-metal.metal | 6 +++++- tests/test-backend-ops.cpp | 11 +++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index ac84c92dcbb..206af227a2c 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); - const int nth = std::min(1024, ne0); + int nth = std::min(256, ne0); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + // when rows are small, we can batch them together in a single threadgroup + int nrptg = 1; + if (nth < 256) { + nrptg = std::min((256 + nth - 1) / nth, ne1); + if (nrptg * nth > 256) { + nrptg = 256 / nth; + } + } + + const int nw0 = (ne1 + nrptg - 1) / nrptg; + + ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1); return 1; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4cf9dbea946..e772664ba91 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -7486,7 +7486,11 @@ kernel void kernel_concat( const int i3 = tgpig.z; const int i2 = tgpig.y; - const int i1 = tgpig.x; + const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y; + + if (i1 >= args.ne1) { + return; + } int o[4] = {0, 0, 0, 0}; o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b4a0025f49c..a76fbfb12d9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8877,6 +8877,17 @@ static std::vector> make_test_cases_eval() { } } + // Test concat with small ne0 values to exercise the Metal kernel row batching logic + // When ne0 < 1024, multiple rows are batched into a single threadgroup + // Key cases: ne0 values that trigger different nrptg calculations + for (int64_t ne0 : {1, 2, 7, 31, 32, 63, 64, 127, 128, 255, 256, 511, 512, 1023, 1024, 1025}) { + // Test with various ne1 values including edge cases for batching + for (int64_t ne1 : {1, 2, 10, 100, 1000}) { + test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {ne0, ne1, 3, 2}, ne1 + 1, 0, 0)); + test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {ne0, ne1, 3, 2}, ne1 + 1, 1, 0)); + } + } + for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { for (uint32_t i = 4; i <= 1024*1024; i *= 2) { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1})); From 654df983077eb6acd5181def2f1e5d1a3b9b05d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 10:00:21 +0300 Subject: [PATCH 4/5] tests : clean-up --- tests/test-backend-ops.cpp | 129 +------------------------------------ 1 file changed, 1 insertion(+), 128 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index a76fbfb12d9..99bbab32923 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8089,61 +8089,6 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); - // CPY - test the new Metal kernel logic (nth = min(nk0*ne01, 256), row packing nrptg, early return i01>=ne01) - // Small ne0 (nk0=1) with various ne01 values (tests nth = min(nk0*ne01, 256)) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 2, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 4, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 8, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 16, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 128, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 256, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 512, 1, 1})); - // Small ne0 with large ne01 (tests nrptg row packing) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 128, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 256, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 128, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 16, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 8, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 64, 1, 1})); - // ne0*ne01 near 256 boundary (tests nth = min(nk0*ne01, 256)) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 255, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 257, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 128, 1, 1})); // nk0*ne01 = 256 - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 129, 1, 1})); // nk0*ne01 = 258 - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 1, 1})); // nk0*ne01 = 256 - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 65, 1, 1})); // nk0*ne01 = 260 - // Large ne0 with small ne01 (tests nrptg logic) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4096, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 2, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 4, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 8, 1, 1})); - // Multi-dimensional with row packing - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 2, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 4, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 2, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 16, 4, 2})); - // Permute src with various ne01 (tests the i01 calculation with tpitg.y) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 256, 3, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 128, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 64, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 128, 4, 3}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - // Permute dst with various ne01 - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 2, 1, 3})); - // Both src and dst permuted - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 64, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 32, 2, 3}, {-1,-1,-1,-1}, {0, 2, 1, 3}, {0, 2, 1, 3})); - // CPY - different src/dst shapes (reshaping via CPY) // 1D -> 2D test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {8, 8, 1, 1})); @@ -8877,17 +8822,6 @@ static std::vector> make_test_cases_eval() { } } - // Test concat with small ne0 values to exercise the Metal kernel row batching logic - // When ne0 < 1024, multiple rows are batched into a single threadgroup - // Key cases: ne0 values that trigger different nrptg calculations - for (int64_t ne0 : {1, 2, 7, 31, 32, 63, 64, 127, 128, 255, 256, 511, 512, 1023, 1024, 1025}) { - // Test with various ne1 values including edge cases for batching - for (int64_t ne1 : {1, 2, 10, 100, 1000}) { - test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {ne0, ne1, 3, 2}, ne1 + 1, 0, 0)); - test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {ne0, ne1, 3, 2}, ne1 + 1, 1, 0)); - } - } - for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { for (uint32_t i = 4; i <= 1024*1024; i *= 2) { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1})); @@ -8984,58 +8918,23 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3)); + test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular test_cases.emplace_back(new test_pad_ext()); - - // PAD - test the new Metal kernel logic (1024-block processing, k0/i1 split, nth_max=64) - // ne0 exactly at 1024 boundary test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false)); - // ne0 just below 1024 (partial block) test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false)); - // ne0 just above 1024 (tests nk0 > 1) test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false)); - // ne0 = 2048 (exactly 2 blocks) test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false)); - // ne0 = 2049 (3 blocks needed) test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false)); - // ne0 very small (tests early break in loop) - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1, 1, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1, 8, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2, 1, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {3, 1, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {63, 1, 1, 1}, 1, 0, false)); // nth_max-1 - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {64, 1, 1, 1}, 1, 0, false)); // nth_max - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {65, 1, 1, 1}, 1, 0, false)); // nth_max+1 - // Various ne1 values (tests k0/i1 split in threadgroup dispatch) - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 2, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 3, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 7, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 16, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 32, 1, 1}, 1, 0, false)); - // Large ne1 with ne0 > 1024 (tests nk0*ne1 threadgroup count) - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 64, 1, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 128, 1, 1}, 1, 0, false)); - // Multi-dimensional (ne2, ne3 > 1) - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 2, 1}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 2, 3}, 1, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 4, 2}, 1, 1, false)); - // Large padding test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false)); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false)); - // ne0 large, pad_0 = 0 (no padding in dim 0) - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 0, 1, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {4096, 4, 1, 1}, 0, 1, false)); - // ne0 large with large pad - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1024, 0, false)); - test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 512, 1, false)); test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); @@ -9352,32 +9251,6 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - // CPY - more tests for the new Metal kernel (tpitg ushort3, nrptg row packing) - // Very large ne0*ne01 (tests nrptg with many rows) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 1024, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 512, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 256, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 128, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 64, 1, 1})); - // Large ne0 with moderate ne01 (tests nrptg logic with larger rows) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 128, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {512, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 32, 1, 1})); - // Quantized types with small rows (tests kernel_cpy_f32_q and kernel_cpy_q_f32) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {4, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {16, 16, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {4, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {16, 16, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q8_0, {4, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q8_0, {8, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_Q8_0, GGML_TYPE_F32, {4, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_Q8_0, GGML_TYPE_F32, {8, 32, 1, 1})); - // Quantized with permuted src - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); From 277665141103dd5a735d045d6e53bf676aea5984 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 10:20:37 +0300 Subject: [PATCH 5/5] tests : refactor CPY shape tests to use dimension permutations Replace 75 hardcoded test cases with a loop over permutations of {3, 5, 7, 32} (total elements: 3360). Each src permutation is tested against canonical sorted and reverse dst, skipping identical shapes. Covers F32, F16, and Q4_0 (when both src and dst ne0 == 32). Assisted-by: llama.cpp:local pi --- tests/test-backend-ops.cpp | 102 ++++++++++--------------------------- 1 file changed, 28 insertions(+), 74 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 99bbab32923..116c08f3a41 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8090,80 +8090,34 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); // CPY - different src/dst shapes (reshaping via CPY) - // 1D -> 2D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {8, 8, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 1, 1}, {16, 16, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {32, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {1024, 1, 1, 1})); // same shape - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {64, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4096, 1, 1, 1}, {128, 32, 1, 1})); - // 1D -> 3D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {4, 4, 4, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {8, 8, 16, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {16, 8, 16, 1})); - // 1D -> 4D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {4, 2, 4, 2})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {8, 4, 8, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {16, 8, 4, 4})); - // 2D -> 1D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 1, 1}, {64, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 16, 1, 1}, {256, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {1024, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 32, 1, 1}, {2048, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 32, 1, 1}, {4096, 1, 1, 1})); - // 2D -> 2D (different shapes) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {1, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {1, 1024, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {32, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {256, 4, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 2, 1, 1}, {512, 4, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {512, 4, 1, 1}, {1024, 2, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 8, 1, 1}, {256, 64, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 64, 1, 1}, {2048, 8, 1, 1})); - // 2D -> 3D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 1, 1, 1}, {4, 4, 4, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {16, 8, 8, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {16, 4, 16, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {8, 8, 16, 1})); - // 3D -> 1D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 4, 4, 1}, {64, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 8, 1}, {1024, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 16, 1}, {1024, 1, 1, 1})); - // 3D -> 2D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 4, 4, 1}, {16, 4, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 16, 1}, {128, 8, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 8, 1}, {128, 8, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 16, 1}, {128, 16, 1, 1})); - // 3D -> 3D (different shapes) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 4, 4, 1}, {8, 2, 4, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 8, 16, 1}, {16, 8, 8, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 16, 1}, {32, 8, 8, 1})); - // 4D -> 1D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {64, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {1024, 1, 1, 1})); - // 4D -> 2D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {32, 2, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {128, 8, 1, 1})); - // 4D -> 3D - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {8, 4, 2, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {16, 8, 8, 1})); - // 4D -> 4D (different shapes) - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 2, 4, 2}, {2, 4, 4, 2})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8, 4, 8, 4}, {4, 8, 8, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {16, 8, 4, 4}, {8, 16, 4, 4})); - // Various sizes with 1024 boundary - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1024, 1, 1, 1}, {32, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {32, 32, 1, 1}, {1024, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1023, 1, 1, 1}, {341, 3, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1025, 1, 1, 1}, {205, 5, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2048, 1, 1, 1}, {64, 32, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {64, 32, 1, 1}, {2048, 1, 1, 1})); - // Small shapes - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {1, 1, 1, 1}, {1, 1, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 1, 1, 1}, {1, 2, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3, 1, 1, 1}, {1, 3, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {6, 1, 1, 1}, {2, 3, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 3, 1, 1}, {6, 1, 1, 1})); + // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360. + // Each src permutation is tested against canonical sorted and reverse dst (skip self). + { + std::array dims = {3, 5, 7, 32}; + std::sort(dims.begin(), dims.end()); + std::array canonical = dims; + std::array reversed = {32, 7, 5, 3}; + for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + std::array cur = dims; + do { + if (cur != canonical) { + test_cases.emplace_back(new test_cpy(type, type, cur, canonical)); + } + if (cur != reversed) { + test_cases.emplace_back(new test_cpy(type, type, cur, reversed)); + } + if (cur[0] == 32 && type == GGML_TYPE_F32) { + if (canonical[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical)); + } + if (reversed[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed)); + } + } + std::next_permutation(cur.begin(), cur.end()); + } while (cur != canonical); + } + } for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (bool use_view_slice : { true, false }) {