Skip to content

Commit 689c68e

Browse files
hexagon: remove the need for vdelta in reduce-multiply-x8
1 parent 4c44340 commit 689c68e

File tree

1 file changed

+11
-37
lines changed

1 file changed

+11
-37
lines changed

ggml/src/ggml-hexagon/htp/matmul-ops.c

Lines changed: 11 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -253,48 +253,22 @@ static inline HVX_Vector hvx_vec_rmpy_x8(HVX_Vector_x8 x, HVX_Vector_x8 y) {
253253
HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]);
254254
HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]);
255255

256-
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r4, r0, 16);
257-
HVX_VectorPair p1 = Q6_W_vshuff_VVR(r5, r1, 16);
258-
HVX_VectorPair p2 = Q6_W_vshuff_VVR(r6, r2, 16);
259-
HVX_VectorPair p3 = Q6_W_vshuff_VVR(r7, r3, 16);
260-
256+
HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4);
257+
HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4);
258+
HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4);
259+
HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4);
261260
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
262261
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
263262
r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2));
264263
r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3));
265264

266-
p0 = Q6_W_vshuff_VVR(r2, r0, 8);
267-
p1 = Q6_W_vshuff_VVR(r3, r1, 8);
268-
r0 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p0), Q6_V_lo_W(p0));
269-
r1 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p1), Q6_V_lo_W(p1));
270-
271-
p0 = Q6_W_vshuff_VVR(r1, r0, 4);
272-
r0 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p0), Q6_V_lo_W(p0));
273-
274-
static const uint8_t vrd[128] __attribute__((aligned(128))) = {
275-
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
276-
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
277-
0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54,
278-
0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
279-
0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
280-
0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68,
281-
0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C,
282-
0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C,
283-
};
284-
285-
static const uint8_t vd[128] __attribute__((aligned(128))) = {
286-
0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x10, 0x10, 0x20, 0x20, 0x20, 0x20, 0x30, 0x30, 0x30, 0x30,
287-
0x24, 0x24, 0x24, 0x24, 0x34, 0x34, 0x34, 0x34, 0x04, 0x04, 0x04, 0x04, 0x14, 0x14, 0x14, 0x14,
288-
0x08, 0x08, 0x08, 0x08, 0x18, 0x18, 0x18, 0x18, 0x28, 0x28, 0x28, 0x28, 0x38, 0x38, 0x38, 0x38,
289-
0x2C, 0x2C, 0x2C, 0x2C, 0x3C, 0x3C, 0x3C, 0x3C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1C, 0x1C, 0x1C, 0x1C,
290-
0x30, 0x30, 0x30, 0x30, 0x20, 0x20, 0x20, 0x20, 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00,
291-
0x14, 0x14, 0x14, 0x14, 0x04, 0x04, 0x04, 0x04, 0x34, 0x34, 0x34, 0x34, 0x24, 0x24, 0x24, 0x24,
292-
0x38, 0x38, 0x38, 0x38, 0x28, 0x28, 0x28, 0x28, 0x18, 0x18, 0x18, 0x18, 0x08, 0x08, 0x08, 0x08,
293-
0x1C, 0x1C, 0x1C, 0x1C, 0x0C, 0x0C, 0x0C, 0x0C, 0x3C, 0x3C, 0x3C, 0x3C, 0x2C, 0x2C, 0x2C, 0x2C,
294-
};
295-
296-
r0 = Q6_V_vrdelta_VV(r0, *(HVX_Vector *) vrd);
297-
r0 = Q6_V_vdelta_VV(r0, *(HVX_Vector *) vd);
265+
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
266+
p1 = Q6_W_vdeal_VVR(r3, r2, -4);
267+
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
268+
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
269+
270+
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
271+
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
298272

299273
return r0;
300274
}

0 commit comments

Comments (0)