Add AVX2 xor (#223)

Slight speedup on XOR-heavy loads. Before/after:

```
BenchmarkReconstructLeopard50x20x1M-32    38    33032958 ns/op    2222.03 MB/s    29820 B/op    3 allocs/op
BenchmarkReconstructLeopard50x20x1M-32    44    30021986 ns/op    2444.89 MB/s    26045 B/op    3 allocs/op
```
klauspost · Oct 5, 2022 · 4d2013d · 4d2013d
1 parent c82a6f7
commit 4d2013d
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 4 deletions.
diff --git a/galois_amd64.go b/galois_amd64.go
@@ -29,6 +29,9 @@ func galMulAVX2_64(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice_64(in, out []byte)
 
+//go:noescape
+func avx2XorSlice_64(in, out []byte) // assembly stub (galois_amd64.s): out[i] ^= in[i] over the full 64-byte blocks of in; any len(in)%64 tail is left to the caller
+
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
@@ -121,10 +124,17 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 func sliceXor(in, out []byte, o *options) {
 	if o.useSSE2 {
 		if len(in) >= bigSwitchover {
-			sSE2XorSlice_64(in, out)
-			done := (len(in) >> 6) << 6
-			in = in[done:]
-			out = out[done:]
+			if o.useAVX2 {
+				avx2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			} else {
+				sSE2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			}
 		}
 		if len(in) >= 16 {
 			sSE2XorSlice(in, out)

diff --git a/galois_amd64.s b/galois_amd64.s
@@ -363,3 +363,32 @@ loopback_xor_sse2_64:
 
 done_xor_sse2_64:
 	RET
+
+// func avx2XorSlice_64(in, out []byte) -- XORs in into out (out ^= in), 64 bytes per iteration, using two 32-byte YMM lanes. Only len(in)/64 full blocks are processed; remaining tail bytes are not touched here.
+TEXT ·avx2XorSlice_64(SB), 7, $0 // $0: no local stack frame
+	MOVQ in+0(FP), SI     // SI: pointer to the first byte of in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: pointer to the first byte of out
+	SHRQ $6, R9           // R9 = len(in) / 64 = number of full 64-byte blocks
+	CMPQ R9, $0 // no full block to process?
+	JEQ  done_xor_avx2_64 // then return without touching any YMM register
+
+loopback_xor_avx2_64: // one iteration handles one 64-byte block
+	VMOVDQU (SI), Y0 // Y0 = in[0:32]
+	VMOVDQU 32(SI), Y2 // Y2 = in[32:64]
+	VMOVDQU (DX), Y1 // Y1 = out[0:32]
+	VMOVDQU 32(DX), Y3 // Y3 = out[32:64]
+	VPXOR   Y0, Y1, Y1 // Y1 = Y0 ^ Y1 (destination is the last operand)
+	VPXOR   Y2, Y3, Y3 // Y3 = Y2 ^ Y3
+	VMOVDQU Y1, (DX) // out[0:32] = Y1
+	VMOVDQU Y3, 32(DX) // out[32:64] = Y3
+
+	ADDQ $64, SI              // in+=64
+	ADDQ $64, DX              // out+=64
+	SUBQ $1, R9 // one block done; sets ZF when the counter reaches zero
+	JNZ  loopback_xor_avx2_64 // loop while blocks remain
+	VZEROUPPER // clear the upper 128 bits of all YMM registers before returning
+
+done_xor_avx2_64: // also reached directly when len(in) < 64, skipping VZEROUPPER
+
+	RET