From 4d2013d3345ef25b3c4fcbe677fe931129ff0db2 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Wed, 5 Oct 2022 15:16:59 +0200
Subject: [PATCH] Add AVX2 xor (#223)

Slight speedup on xor-heavy loads. Before/after:

```
BenchmarkReconstructLeopard50x20x1M-32    38    33032958 ns/op    2222.03 MB/s    29820 B/op    3 allocs/op
BenchmarkReconstructLeopard50x20x1M-32    44    30021986 ns/op    2444.89 MB/s    26045 B/op    3 allocs/op
```
---
 galois_amd64.go | 18 ++++++++++++++----
 galois_amd64.s  | 29 +++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/galois_amd64.go b/galois_amd64.go
index fd3eea7f..79933558 100644
--- a/galois_amd64.go
+++ b/galois_amd64.go
@@ -29,6 +29,9 @@ func galMulAVX2_64(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice_64(in, out []byte)
 
+//go:noescape
+func avx2XorSlice_64(in, out []byte)
+
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
@@ -121,10 +124,17 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 func sliceXor(in, out []byte, o *options) {
 	if o.useSSE2 {
 		if len(in) >= bigSwitchover {
-			sSE2XorSlice_64(in, out)
-			done := (len(in) >> 6) << 6
-			in = in[done:]
-			out = out[done:]
+			if o.useAVX2 {
+				avx2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			} else {
+				sSE2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			}
 		}
 		if len(in) >= 16 {
 			sSE2XorSlice(in, out)
diff --git a/galois_amd64.s b/galois_amd64.s
index 26bc2d6c..3e97c7c1 100644
--- a/galois_amd64.s
+++ b/galois_amd64.s
@@ -363,3 +363,32 @@ loopback_xor_sse2_64:
 
 done_xor_sse2_64:
 	RET
+
+// func avx2XorSlice_64(in, out []byte)
+TEXT ·avx2XorSlice_64(SB), 7, $0
+	MOVQ in+0(FP), SI     // SI: &in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: &out
+	SHRQ $6, R9           // len(in) / 64
+	CMPQ R9, $0
+	JEQ  done_xor_avx2_64
+
+loopback_xor_avx2_64:
+	VMOVDQU (SI), Y0
+	VMOVDQU 32(SI), Y2
+	VMOVDQU (DX), Y1
+	VMOVDQU 32(DX), Y3
+	VPXOR   Y0, Y1, Y1
+	VPXOR   Y2, Y3, Y3
+	VMOVDQU Y1, (DX)
+	VMOVDQU Y3, 32(DX)
+
+	ADDQ $64, SI // in+=64
+	ADDQ $64, DX // out+=64
+	SUBQ $1, R9
+	JNZ  loopback_xor_avx2_64
+	VZEROUPPER
+
+done_xor_avx2_64:
+
+	RET
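
The repository documents its 16-byte routines with an equivalent pure-Go listing ("This is what the assembler routines do in blocks of 16 bytes"), and the new routine can be read the same way. Below is a sketch of what avx2XorSlice_64 computes per call; the name avx2XorSlice_64Generic is illustrative and not part of the patch. It XORs full 64-byte blocks of in into out and leaves any shorter tail untouched, which is why sliceXor re-slices both buffers by done before falling through to the 16-byte and scalar paths.

```
// Illustrative pure-Go equivalent of avx2XorSlice_64 (sketch only;
// avx2XorSlice_64Generic is a hypothetical name, not in the patch).
// Like the assembly, it processes len(in)/64 full 64-byte blocks and
// leaves the remaining tail to the caller.
func avx2XorSlice_64Generic(in, out []byte) {
	done := (len(in) >> 6) << 6 // round len(in) down to a multiple of 64
	for i := 0; i < done; i++ {
		out[i] ^= in[i]
	}
}
```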
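Two details of the assembly listing are worth noting when reading it. VZEROUPPER sits after the loop and only executes on the path that actually ran AVX2 instructions, clearing the upper YMM halves before control returns to the SSE2 tail code in sliceXor and so avoiding AVX-SSE transition penalties on CPUs that charge for a dirty upper state. The early JEQ for a zero block count jumps past it, which is safe because no YMM register has been written at that point. Both observations come from reading the listing itself, not from the commit message.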