From 4d2013d3345ef25b3c4fcbe677fe931129ff0db2 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Wed, 5 Oct 2022 15:16:59 +0200
Subject: [PATCH] Add AVX2 xor (#223)

Slight speedup on xor-heavy loads. Before/after:

```
BenchmarkReconstructLeopard50x20x1M-32    38    33032958 ns/op    2222.03 MB/s    29820 B/op    3 allocs/op
BenchmarkReconstructLeopard50x20x1M-32    44    30021986 ns/op    2444.89 MB/s    26045 B/op    3 allocs/op
```
---
 galois_amd64.go | 18 ++++++++++++++----
 galois_amd64.s  | 29 +++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/galois_amd64.go b/galois_amd64.go
index fd3eea7f..79933558 100644
--- a/galois_amd64.go
+++ b/galois_amd64.go
@@ -29,6 +29,9 @@ func galMulAVX2_64(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice_64(in, out []byte)
 
+//go:noescape
+func avx2XorSlice_64(in, out []byte)
+
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
@@ -121,10 +124,17 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 func sliceXor(in, out []byte, o *options) {
 	if o.useSSE2 {
 		if len(in) >= bigSwitchover {
-			sSE2XorSlice_64(in, out)
-			done := (len(in) >> 6) << 6
-			in = in[done:]
-			out = out[done:]
+			if o.useAVX2 {
+				avx2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			} else {
+				sSE2XorSlice_64(in, out)
+				done := (len(in) >> 6) << 6
+				in = in[done:]
+				out = out[done:]
+			}
 		}
 		if len(in) >= 16 {
 			sSE2XorSlice(in, out)
diff --git a/galois_amd64.s b/galois_amd64.s
index 26bc2d6c..3e97c7c1 100644
--- a/galois_amd64.s
+++ b/galois_amd64.s
@@ -363,3 +363,32 @@ loopback_xor_sse2_64:
 
 done_xor_sse2_64:
 	RET
+
+// func avx2XorSlice_64(in, out []byte)
+TEXT ·avx2XorSlice_64(SB), 7, $0
+	MOVQ in+0(FP), SI     // SI: &in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: &out
+	SHRQ $6, R9           // len(in) / 64
+	CMPQ R9, $0
+	JEQ  done_xor_avx2_64
+
+loopback_xor_avx2_64:
+	VMOVDQU (SI), Y0
+	VMOVDQU 32(SI), Y2
+	VMOVDQU (DX), Y1
+	VMOVDQU 32(DX), Y3
+	VPXOR   Y0, Y1, Y1
+	VPXOR   Y2, Y3, Y3
+	VMOVDQU Y1, (DX)
+	VMOVDQU Y3, 32(DX)
+
+	ADDQ $64, SI // in+=64
+	ADDQ $64, DX // out+=64
+	SUBQ $1, R9
+	JNZ  loopback_xor_avx2_64
+	VZEROUPPER
+
+done_xor_avx2_64:
+
+	RET
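
The repository documents its 16-byte routines with an equivalent pure-Go listing ("This is what the assembler routines do in blocks of 16 bytes"), and the new routine can be read the same way. Below is a sketch of what avx2XorSlice_64 computes per call; the name avx2XorSlice_64Generic is illustrative and not part of the patch. It XORs full 64-byte blocks of in into out and leaves any shorter tail untouched, which is why sliceXor re-slices both buffers by done before falling through to the 16-byte and scalar paths.

```
// Illustrative pure-Go equivalent of avx2XorSlice_64 (sketch only;
// avx2XorSlice_64Generic is a hypothetical name, not in the patch).
// Like the assembly, it processes len(in)/64 full 64-byte blocks and
// leaves the remaining tail to the caller.
func avx2XorSlice_64Generic(in, out []byte) {
	done := (len(in) >> 6) << 6 // round len(in) down to a multiple of 64
	for i := 0; i < done; i++ {
		out[i] ^= in[i]
	}
}
```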
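Two details of the assembly listing are worth noting when reading it. VZEROUPPER sits after the loop and only executes on the path that actually ran AVX2 instructions, clearing the upper YMM halves before control returns to the SSE2 tail code in sliceXor and so avoiding AVX-SSE transition penalties on CPUs that charge for a dirty upper state. The early JEQ for a zero block count jumps past it, which is safe because no YMM register has been written at that point. Both observations come from reading the listing itself, not from the commit message.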