Skip to content
14 changes: 8 additions & 6 deletions ggml/src/ggml-vulkan/vulkan-shaders/acc.comp
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,19 @@ void main() {

const uint offset = p.param3;
const uint src1_i = idx - offset;
const uint oz = src1_i / p.nb02;
const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
const uint ox = src1_i % p.nb01;
const uint oz = src1_i / p.nb03;
Comment thread
0cc4m marked this conversation as resolved.
Outdated
const uint remy = src1_i - oz * p.nb03;
const uint oy = remy / p.nb02;
const uint remx = remy - oy * p.nb02;
const uint ox = remx / p.nb01;
const uint ow = remx % p.nb01;
Comment thread
0cc4m marked this conversation as resolved.
Outdated

uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the CPU reference, it seems there should be only one set of indices; you can then apply offset/sizeof(float) to the final index for src0 and dst.


if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
if (ow < p.ne10 && ox < p.ne11 && oy < p.ne12 && oz < p.ne13) {
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ow + ox * p.ne10 + oy * p.ne10 * p.ne11 + oz * p.ne10 * p.ne11 * p.ne12]));
} else {
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
}
}

58 changes: 58 additions & 0 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5852,6 +5852,62 @@ struct test_acc : public test_case {
}
};

// GGML_OP_ACC - block accumulation test
struct test_acc_block: public test_case {
Comment thread
0cc4m marked this conversation as resolved.
Outdated
const ggml_type type;
const int64_t block_size;
const int64_t n_blocks;
const int64_t ne2;
const int64_t ne3;

std::string vars() override {
return VARS_TO_STR5(type, block_size, n_blocks, ne2, ne3);
}

test_acc_block(ggml_type type = GGML_TYPE_F32,
int64_t block_size = 16,
int64_t n_blocks = 4,
int64_t ne2 = 1,
int64_t ne3 = 1)
: type(type), block_size(block_size), n_blocks(n_blocks), ne2(ne2), ne3(ne3) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
const int64_t chunk_size = block_size * n_blocks;

// Base tensor initialized to zero using ggml_clamp
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, chunk_size, chunk_size, ne2, ne3);
ggml_set_param(a);
ggml_set_name(a, "a");

// Source blocks that will be accumulated at different offsets
// Mimics the lower-triangular block pattern from the original code
for (int64_t j = 0; j < n_blocks; ++j) {
for (int64_t i = 0; i <= j; ++i) {
ggml_tensor * block = ggml_new_tensor_4d(ctx, type,
block_size, block_size, ne2, ne3);
ggml_set_param(block);

char name[64];
snprintf(name, sizeof(name), "block_%ld_%ld", (long)j, (long)i);
ggml_set_name(block, name);

// Accumulate block at position [i*block_size, j*block_size]
// This is the same pattern as the original code:
// offset = i_start * nb[0] + j_start * nb[1]
size_t offset = (i * block_size) * ggml_type_size(type)
+ (j * block_size) * (chunk_size * ggml_type_size(type));

a = ggml_acc(ctx, a, block,
a->nb[1], a->nb[2], a->nb[3],
offset);
}
}

ggml_set_name(a, "out");
return a;
}
};

// GGML_OP_PAD
struct test_pad : public test_case {
const ggml_type type;
Expand Down Expand Up @@ -8130,6 +8186,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {64, 64, 320, 1}));
test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1}));
test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_acc_block(GGML_TYPE_F32, 16, 4, 3, 2));
test_cases.emplace_back(new test_acc_block(GGML_TYPE_F32, 32, 4, 2, 2));
test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular
test_cases.emplace_back(new test_pad_ext());
Expand Down
Loading