-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMatMulPack24.S
122 lines (100 loc) · 2.82 KB
/
MatMulPack24.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#if defined(__aarch64__)
#include "ArmAsmGlobal.h"
.text
.align 5
asm_function MatMulPack24
// void MatMulPack24(float *C, float *A, float *B, size_t eP, size_t l, size_t hP)
//
// Packed matrix-multiply kernel producing 24 x 8 output tiles.
//
// Element size: the prototype spells the pointers as float*, but every load,
// store and arithmetic instruction below works on 8 x 16-bit lanes (.8h) and
// all strides are computed for 2-byte elements.
// NOTE(review): the .8h fmul/fmla forms need the Armv8.2-A FP16 extension;
// confirm the build enables it.
//
// Memory access pattern, as read/written by this code:
//   A : for every eP step, l consecutive groups of 24 halves (48 bytes each).
//       The A cursor restarts from x1 at every hP step.
//   B : for every hP step, l consecutive groups of 8 halves (16 bytes each).
//       The B cursor restarts from the current hP base at every eP step.
//   C : written strictly sequentially, 24 vectors of 8 halves (384 bytes) per
//       (hP step, eP step) pair, eP being the inner dimension.
//   Output vector j (0..23) of a tile = sum over k < l of B[k][0..7] * A[k][j].
//
// Arguments (AAPCS64):
//   x0: C, x1: A, x2: B, x3: eP, x4: l, x5: hP
// Returns immediately, touching nothing, if eP, l or hP is zero.
//
// Register roles:
//   x0  C write cursor            x9  B stride per hP step in bytes (l * 16)
//   x2  B base of current hP step x10 A read cursor
//   x5  remaining hP steps        x12 remaining l steps
//   x13 remaining eP steps        x14 B read cursor
//   x15 scratch address (prologue only)
//   v0-v2 24 A values, v3 8 B values, v8-v31 accumulators
// Clobbers: x0, x2, x5, x9, x10, x12-x15, v0-v3, v16-v31, flags.
// v8-v15 are callee-saved and are preserved via a 128-byte stack frame.

    cbz x3, End
    cbz x4, End
    cbz x5, End

    // Save v8-v15. sp stays lowered for the whole kernel so the save area is
    // always at or above sp; data below sp may be overwritten asynchronously
    // (e.g. by a signal frame) and must not be relied upon.
    sub sp, sp, #128
    add x15, sp, #64
    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x15]

    // x9: bytes of B consumed per hP step = l * 8 halves * 2 bytes
    lsl x9, x4, #4

LoopHP:
    // x10: A cursor, rewound to the start of A for every hP step
    mov x10, x1
    // x13 = eP
    mov x13, x3

LoopEP:
    // x14: B cursor, rewound to the current hP base for every eP step
    mov x14, x2

    // First l step is peeled: fmul initialises the 24 accumulators, so no
    // separate zeroing pass is needed.
    ld1 {v3.8h}, [x14], #16
    ld1 {v0.8h, v1.8h, v2.8h}, [x10], #48
    fmul v8.8h, v3.8h, v0.h[0]
    fmul v9.8h, v3.8h, v0.h[1]
    fmul v10.8h, v3.8h, v0.h[2]
    fmul v11.8h, v3.8h, v0.h[3]
    fmul v12.8h, v3.8h, v0.h[4]
    fmul v13.8h, v3.8h, v0.h[5]
    fmul v14.8h, v3.8h, v0.h[6]
    fmul v15.8h, v3.8h, v0.h[7]
    fmul v16.8h, v3.8h, v1.h[0]
    fmul v17.8h, v3.8h, v1.h[1]
    fmul v18.8h, v3.8h, v1.h[2]
    fmul v19.8h, v3.8h, v1.h[3]
    fmul v20.8h, v3.8h, v1.h[4]
    fmul v21.8h, v3.8h, v1.h[5]
    fmul v22.8h, v3.8h, v1.h[6]
    fmul v23.8h, v3.8h, v1.h[7]
    fmul v24.8h, v3.8h, v2.h[0]
    fmul v25.8h, v3.8h, v2.h[1]
    fmul v26.8h, v3.8h, v2.h[2]
    fmul v27.8h, v3.8h, v2.h[3]
    fmul v28.8h, v3.8h, v2.h[4]
    fmul v29.8h, v3.8h, v2.h[5]
    fmul v30.8h, v3.8h, v2.h[6]
    fmul v31.8h, v3.8h, v2.h[7]

    // x12 = l - 1 remaining steps; skip the accumulate loop when l == 1
    subs x12, x4, #1
    beq LoopEPRemain

LoopL:
    // One l step: accumulate B[k][0..7] * A[k][j] into accumulator j
    ld1 {v3.8h}, [x14], #16
    ld1 {v0.8h, v1.8h, v2.8h}, [x10], #48
    fmla v8.8h, v3.8h, v0.h[0]
    fmla v9.8h, v3.8h, v0.h[1]
    fmla v10.8h, v3.8h, v0.h[2]
    fmla v11.8h, v3.8h, v0.h[3]
    fmla v12.8h, v3.8h, v0.h[4]
    fmla v13.8h, v3.8h, v0.h[5]
    fmla v14.8h, v3.8h, v0.h[6]
    fmla v15.8h, v3.8h, v0.h[7]
    fmla v16.8h, v3.8h, v1.h[0]
    fmla v17.8h, v3.8h, v1.h[1]
    fmla v18.8h, v3.8h, v1.h[2]
    fmla v19.8h, v3.8h, v1.h[3]
    fmla v20.8h, v3.8h, v1.h[4]
    fmla v21.8h, v3.8h, v1.h[5]
    fmla v22.8h, v3.8h, v1.h[6]
    fmla v23.8h, v3.8h, v1.h[7]
    fmla v24.8h, v3.8h, v2.h[0]
    fmla v25.8h, v3.8h, v2.h[1]
    fmla v26.8h, v3.8h, v2.h[2]
    fmla v27.8h, v3.8h, v2.h[3]
    fmla v28.8h, v3.8h, v2.h[4]
    fmla v29.8h, v3.8h, v2.h[5]
    fmla v30.8h, v3.8h, v2.h[6]
    fmla v31.8h, v3.8h, v2.h[7]
    subs x12, x12, #1
    bne LoopL

LoopEPRemain:
    // Write the finished 24 x 8 tile (384 bytes) and advance the C cursor
    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #64
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
    subs x13, x13, #1
    bne LoopEP

    // Next hP step: move the B base past the l groups just consumed
    add x2, x2, x9
    subs x5, x5, #1
    bne LoopHP

    // Restore v8-v15. sp is still at the bottom of the save area, so the two
    // post-indexed loads both read at/above sp and leave sp at its entry value.
    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64

End:
    ret
#endif