-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathdecode_amd64.s
143 lines (130 loc) · 4.06 KB
/
decode_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
// Code generated by command: go run decode_asm.go -pkg base64 -out ../base64/decode_amd64.s -stubs ../base64/decode_amd64.go. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
// b64_dec_lut_hi: VPSHUFB table indexed by the high nibble of each input byte.
// Byte values by index: 0,1 -> 0x10; 2 -> 0x01; 3 -> 0x02; 4 -> 0x04; 5 -> 0x08;
// 6 -> 0x04; 7 -> 0x08; 8..15 -> 0x10. The decode loops AND the looked-up value
// with a second lookup (by low nibble) taken from the caller-supplied lut and
// stop as soon as any byte yields a non-zero result. The 16-byte pattern is
// stored twice because VPSHUFB shuffles each 128-bit lane independently.
DATA b64_dec_lut_hi<>+0(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+8(SB)/8, $0x1010101010101010
DATA b64_dec_lut_hi<>+16(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+24(SB)/8, $0x1010101010101010
GLOBL b64_dec_lut_hi<>(SB), RODATA|NOPTR, $32
// b64_dec_madd1: VPMADDUBSW multipliers. In memory the bytes alternate
// 0x40, 0x01, so each 16-bit result is (even byte * 64) + (odd byte * 1),
// i.e. two adjacent 6-bit values merged into one 12-bit value.
DATA b64_dec_madd1<>+0(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+8(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+16(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+24(SB)/8, $0x0140014001400140
GLOBL b64_dec_madd1<>(SB), RODATA|NOPTR, $32
// b64_dec_madd2: VPMADDWD multipliers. In memory the 16-bit words alternate
// 0x1000, 0x0001, so each 32-bit result is (even word * 4096) + (odd word * 1),
// i.e. two adjacent 12-bit values merged into one 24-bit value.
DATA b64_dec_madd2<>+0(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+8(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+16(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+24(SB)/8, $0x0001100000011000
GLOBL b64_dec_madd2<>(SB), RODATA|NOPTR, $32
// b64_dec_shuf_lo: 16-byte VPSHUFB control applied to the extracted upper
// 128-bit lane. Positions 12..15 select source bytes 2, 1, 0, 6; positions
// 0..11 select byte 0 and are never used, because the following VPBLENDD takes
// only dword 3 from this result.
DATA b64_dec_shuf_lo<>+0(SB)/8, $0x0000000000000000
DATA b64_dec_shuf_lo<>+8(SB)/8, $0x0600010200000000
GLOBL b64_dec_shuf_lo<>(SB), RODATA|NOPTR, $16
// b64_dec_shuf: 32-byte VPSHUFB control that compacts the 24-bit groups.
// Low lane selects bytes 2,1,0, 6,5,4, 10,9,8, 14,13,12 (the three low bytes of
// each dword, byte-reversed); its last four positions select byte 0 and are
// replaced by the b64_dec_shuf_lo result.
// High lane selects bytes 5,4, 10,9,8, 14,13,12; its last eight positions
// select byte 0 and are replaced with zeros by the final blend.
DATA b64_dec_shuf<>+0(SB)/8, $0x090a040506000102
DATA b64_dec_shuf<>+8(SB)/8, $0x000000000c0d0e08
DATA b64_dec_shuf<>+16(SB)/8, $0x0c0d0e08090a0405
DATA b64_dec_shuf<>+24(SB)/8, $0x0000000000000000
GLOBL b64_dec_shuf<>(SB), RODATA|NOPTR, $32
// func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
//
// Decodes base64 text 32 input bytes at a time, producing 24 output bytes per
// iteration, and stops at the first block that fails validation or when fewer
// than 45 source bytes remain.
//
// Returns, in order: the dst offset reached (24 per completed iteration) and
// the src offset reached (32 per completed iteration).
//
// Preconditions that this code does NOT check:
//   - the first iteration always runs, so src must hold at least 32 bytes;
//   - dst_len is never read, yet every iteration stores a full 32 bytes
//     (24 decoded bytes followed by 8 zero bytes) at dst+offset;
//   - 48 bytes are read starting at lut (two 32-byte loads at lut and lut+16).
// NOTE(review): presumably the Go caller guarantees all three; confirm there.
//
// Register roles:
//   AX = dst base      CX = dst offset (bytes written)
//   DX = src base      BX = src offset (bytes consumed)
//   SI = lut           DI = source bytes remaining
//   Y8 = 0x2f in every byte       Y7 = all zero
//   Y6 = lut[0:16] in both lanes  Y4 = lut[16:32] in both lanes
//   Y5 = b64_dec_lut_hi
TEXT ·decodeAVX2(SB), NOSPLIT, $0-72
MOVQ dst_base+0(FP), AX
MOVQ src_base+24(FP), DX
MOVQ lut+48(FP), SI
MOVQ src_len+32(FP), DI
// Build Y8 = 0x2f replicated into all 32 bytes.
MOVB $0x2f, CL
PINSRB $0x00, CX, X8
VPBROADCASTB X8, Y8
XORQ CX, CX
XORQ BX, BX
VPXOR Y7, Y7, Y7
// Shuffle control 0x44 selects qwords 0,1,0,1: the low 16 bytes of each 32-byte
// load are duplicated into both 128-bit lanes.
VPERMQ $0x44, (SI), Y6
VPERMQ $0x44, 16(SI), Y4
VMOVDQA b64_dec_lut_hi<>+0(SB), Y5
loop:
VMOVDQU (DX)(BX*1), Y0
// Validation. Y3 = Y4[low nibble], Y9 = Y5[high nibble]. The 0x2f mask clears
// bit 7 of every index so VPSHUFB never zeroes a byte; VPSHUFB only uses the
// low four index bits, so the surviving bit 5 is ignored.
VPSRLD $0x04, Y0, Y2
VPAND Y8, Y0, Y3
VPSHUFB Y3, Y4, Y3
VPAND Y8, Y2, Y2
VPSHUFB Y2, Y5, Y9
// VPTEST sets ZF only when (Y9 AND Y3) is all zero; any overlapping bit means
// the block is rejected and the loop exits with the counts reached so far.
VPTEST Y9, Y3
JNE done
// Translation. Bytes equal to 0x2f compare to 0xff (-1), which lowers their
// table index by one so they get their own entry in Y6. The looked-up value is
// then added to the input byte (presumably giving its 6-bit value; this depends
// on the contents of lut).
VPCMPEQB Y8, Y0, Y3
VPADDB Y3, Y2, Y2
VPSHUFB Y2, Y6, Y2
VPADDB Y0, Y2, Y0
// Pack: byte pairs -> 12-bit words -> 24-bit dwords.
VPMADDUBSW b64_dec_madd1<>+0(SB), Y0, Y0
VPMADDWD b64_dec_madd2<>+0(SB), Y0, Y0
// Compact the three useful bytes of every dword into 24 contiguous bytes.
// VPSHUFB cannot move bytes across lanes, so the first three output bytes of
// the upper lane are shuffled separately (X1) and blended into dword 3 of the
// lower lane.
VEXTRACTI128 $0x01, Y0, X1
VPSHUFB b64_dec_shuf_lo<>+0(SB), X1, X1
VPSHUFB b64_dec_shuf<>+0(SB), Y0, Y0
VPBLENDD $0x08, Y1, Y0, Y1
// Zero dwords 6 and 7 so the 8 bytes stored past the 24 decoded ones are zero.
VPBLENDD $0xc0, Y7, Y1, Y1
VMOVDQU Y1, (AX)(CX*1)
ADDQ $0x18, CX
ADDQ $0x20, BX
SUBQ $0x20, DI
// Continue only while at least 45 (0x2d) source bytes remain (unsigned compare).
// NOTE(review): the margin above 32 presumably keeps the 32-byte store inside
// dst; the rationale is not visible in this file.
CMPQ DI, $0x2d
JB done
JMP loop
done:
MOVQ CX, ret+56(FP)
MOVQ BX, ret1+64(FP)
// Clear the upper YMM halves before returning to non-AVX code.
VZEROUPPER
RET
// func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2
//
// Decodes URL-flavoured base64 text 32 input bytes at a time, producing 24
// output bytes per iteration. Every '_' (0x5f) input byte is first rewritten to
// '/' (0x2f); validation and translation then run on the rewritten block. No
// other byte is rewritten here, so any further alphabet difference has to be
// encoded in lut (assumption - confirm against the Go caller).
//
// Returns, in order: the dst offset reached (24 per completed iteration) and
// the src offset reached (32 per completed iteration).
//
// Preconditions that this code does NOT check:
//   - the first iteration always runs, so src must hold at least 32 bytes;
//   - dst_len is never read, yet every iteration stores a full 32 bytes
//     (24 decoded bytes followed by 8 zero bytes) at dst+offset;
//   - 48 bytes are read starting at lut (two 32-byte loads at lut and lut+16).
// NOTE(review): presumably the Go caller guarantees all three; confirm there.
//
// Changes relative to the generated code (mirror them in the generator named in
// the file header, otherwise regeneration will undo them):
//   - the byte constants are inserted with VEX-encoded VPINSRB instead of
//     legacy-SSE PINSRB, which used to execute after VPBROADCASTB had already
//     written the upper YMM halves (SSE/AVX mixing);
//   - the 0x2f constant is built once and shared between the '_' -> '/'
//     replacement and the nibble mask instead of being built twice.
//
// Register roles:
//   AX = dst base      CX = dst offset (bytes written)
//   DX = src base      BX = src offset (bytes consumed)
//   SI = lut           DI = source bytes remaining
//   Y0 = 0x2f in every byte       Y1 = 0x5f in every byte
//   Y9 = all zero                 Y7 = b64_dec_lut_hi
//   Y8 = lut[0:16] in both lanes  Y6 = lut[16:32] in both lanes
TEXT ·decodeAVX2URI(SB), NOSPLIT, $0-72
	// Y0 = 0x2f ('/') replicated into all 32 bytes.
	MOVB $0x2f, AL
	VPINSRB $0x00, AX, X0, X0
	VPBROADCASTB X0, Y0

	// Y1 = 0x5f ('_') replicated into all 32 bytes.
	MOVB $0x5f, AL
	VPINSRB $0x00, AX, X1, X1
	VPBROADCASTB X1, Y1

	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), DX
	MOVQ lut+48(FP), SI
	MOVQ src_len+32(FP), DI
	XORQ CX, CX
	XORQ BX, BX
	VPXOR Y9, Y9, Y9

	// Shuffle control 0x44 selects qwords 0,1,0,1: the low 16 bytes of each
	// 32-byte load are duplicated into both 128-bit lanes.
	VPERMQ $0x44, (SI), Y8
	VPERMQ $0x44, 16(SI), Y6
	VMOVDQA b64_dec_lut_hi<>+0(SB), Y7

loop:
	VMOVDQU (DX)(BX*1), Y2

	// Replace every '_' byte with '/'.
	VPCMPEQB Y2, Y1, Y4
	VPBLENDVB Y4, Y0, Y2, Y2

	// Validation. Y5 = Y6[low nibble], Y11 = Y7[high nibble]. The 0x2f mask
	// clears bit 7 of every index so VPSHUFB never zeroes a byte; VPSHUFB only
	// uses the low four index bits, so the surviving bit 5 is ignored.
	VPSRLD $0x04, Y2, Y4
	VPAND Y0, Y2, Y5
	VPSHUFB Y5, Y6, Y5
	VPAND Y0, Y4, Y4
	VPSHUFB Y4, Y7, Y11

	// VPTEST sets ZF only when (Y11 AND Y5) is all zero; any overlapping bit
	// means the block is rejected and the loop exits with the counts so far.
	VPTEST Y11, Y5
	JNE done

	// Translation. Bytes equal to 0x2f compare to 0xff (-1), which lowers their
	// table index by one so they get their own entry in Y8. The looked-up value
	// is then added to the input byte (presumably giving its 6-bit value; this
	// depends on the contents of lut).
	VPCMPEQB Y0, Y2, Y5
	VPADDB Y5, Y4, Y4
	VPSHUFB Y4, Y8, Y4
	VPADDB Y2, Y4, Y2

	// Pack: byte pairs -> 12-bit words -> 24-bit dwords.
	VPMADDUBSW b64_dec_madd1<>+0(SB), Y2, Y2
	VPMADDWD b64_dec_madd2<>+0(SB), Y2, Y2

	// Compact the three useful bytes of every dword into 24 contiguous bytes.
	// VPSHUFB cannot move bytes across lanes, so the first three output bytes
	// of the upper lane are shuffled separately (X3) and blended into dword 3
	// of the lower lane.
	VEXTRACTI128 $0x01, Y2, X3
	VPSHUFB b64_dec_shuf_lo<>+0(SB), X3, X3
	VPSHUFB b64_dec_shuf<>+0(SB), Y2, Y2
	VPBLENDD $0x08, Y3, Y2, Y3

	// Zero dwords 6 and 7 so the 8 bytes stored past the 24 decoded ones are
	// zero.
	VPBLENDD $0xc0, Y9, Y3, Y3
	VMOVDQU Y3, (AX)(CX*1)

	ADDQ $0x18, CX
	ADDQ $0x20, BX
	SUBQ $0x20, DI

	// Continue only while at least 45 (0x2d) source bytes remain (unsigned
	// compare).
	// NOTE(review): the margin above 32 presumably keeps the 32-byte store
	// inside dst; the rationale is not visible in this file.
	CMPQ DI, $0x2d
	JB done
	JMP loop

done:
	MOVQ CX, ret+56(FP)
	MOVQ BX, ret1+64(FP)

	// Clear the upper YMM halves before returning to non-AVX code.
	VZEROUPPER
	RET