Skip to content

Commit

Permalink
Add AVX2 xor (#223)
Browse files Browse the repository at this point in the history
Slight speedup on XOR-heavy workloads. Before/after:

```
BenchmarkReconstructLeopard50x20x1M-32    	      38	  33032958 ns/op	2222.03 MB/s	   29820 B/op	       3 allocs/op
BenchmarkReconstructLeopard50x20x1M-32    	      44	  30021986 ns/op	2444.89 MB/s	   26045 B/op	       3 allocs/op
```
  • Loading branch information
klauspost authored Oct 5, 2022
1 parent c82a6f7 commit 4d2013d
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 4 deletions.
18 changes: 14 additions & 4 deletions galois_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ func galMulAVX2_64(low, high, in, out []byte)
// sSE2XorSlice_64 is implemented in assembly and XORs in into out.
// NOTE(review): after calling it, sliceXor advances both slices by len(in)
// rounded down to a multiple of 64, so this routine is expected to process
// every whole 64-byte block of in and leave the remainder to the caller —
// confirm against the implementation in galois_amd64.s.
//
//go:noescape
func sSE2XorSlice_64(in, out []byte)

// avx2XorSlice_64 is implemented in assembly and XORs in into out
// (out[i] ^= in[i]) using 256-bit AVX2 operations on whole 64-byte blocks.
// It processes len(in)/64 blocks and leaves the trailing len(in)%64 bytes
// untouched, so the caller must handle the tail itself. Only len(in) is
// read: out must be at least as long as in, and the caller must have
// verified that the CPU supports AVX2 before calling.
//
//go:noescape
func avx2XorSlice_64(in, out []byte)

// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSSE3(low, high, in, out []byte) {
Expand Down Expand Up @@ -121,10 +124,17 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
func sliceXor(in, out []byte, o *options) {
if o.useSSE2 {
if len(in) >= bigSwitchover {
sSE2XorSlice_64(in, out)
done := (len(in) >> 6) << 6
in = in[done:]
out = out[done:]
if o.useAVX2 {
avx2XorSlice_64(in, out)
done := (len(in) >> 6) << 6
in = in[done:]
out = out[done:]
} else {
sSE2XorSlice_64(in, out)
done := (len(in) >> 6) << 6
in = in[done:]
out = out[done:]
}
}
if len(in) >= 16 {
sSE2XorSlice(in, out)
Expand Down
29 changes: 29 additions & 0 deletions galois_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,32 @@ loopback_xor_sse2_64:

done_xor_sse2_64:
RET

// func avx2XorSlice_64(in, out []byte)
//
// XORs in into out (out[i] ^= in[i]) in blocks of 64 bytes using two
// 256-bit YMM lanes per iteration. Exactly len(in)/64 blocks are processed;
// the trailing len(in)%64 bytes are left untouched for the caller.
// Only in's length is loaded: len(out) is never checked, so out must be at
// least as long as in. All loads/stores are unaligned (VMOVDQU), so neither
// slice needs any particular alignment.
// Flags 7 = NOPROF|DUPOK|NOSPLIT (see textflag.h); $0 = no local stack frame.
TEXT ·avx2XorSlice_64(SB), 7, $0
MOVQ in+0(FP), SI // SI: pointer to the first byte of in
MOVQ in_len+8(FP), R9 // R9: len(in)
MOVQ out+24(FP), DX // DX: pointer to the first byte of out
SHRQ $6, R9 // R9 = len(in) / 64 = number of whole 64-byte blocks
CMPQ R9, $0 // fewer than 64 bytes: nothing to do
JEQ done_xor_avx2_64 // early exit skips VZEROUPPER; no YMM register was touched

// One iteration handles one 64-byte block as two independent 32-byte halves.
loopback_xor_avx2_64:
VMOVDQU (SI), Y0 // Y0 = in[0:32]
VMOVDQU 32(SI), Y2 // Y2 = in[32:64]
VMOVDQU (DX), Y1 // Y1 = out[0:32]
VMOVDQU 32(DX), Y3 // Y3 = out[32:64]
VPXOR Y0, Y1, Y1 // Y1 = out[0:32] ^ in[0:32]
VPXOR Y2, Y3, Y3 // Y3 = out[32:64] ^ in[32:64]
VMOVDQU Y1, (DX) // store result back to out[0:32]
VMOVDQU Y3, 32(DX) // store result back to out[32:64]

ADDQ $64, SI // in+=64
ADDQ $64, DX // out+=64
SUBQ $1, R9 // one block done; sets ZF when none remain
JNZ loopback_xor_avx2_64 // loop while blocks remain
VZEROUPPER // clear upper YMM halves to avoid AVX->SSE transition penalties in later code

// Shared exit for both the early-out and the post-loop path.
done_xor_avx2_64:

RET

0 comments on commit 4d2013d

Please sign in to comment.