Generate correct matrix for code-gen based on actual vector length (for 256 bits and below)
klauspost · Aug 21, 2024 · 7949410 · 7949410
1 parent 5b12fc2
commit 7949410
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 21 deletions.
diff --git a/galois.go b/galois.go
@@ -910,30 +910,31 @@ func galExp(a byte, n int) byte {
 	return expTable[uint8(logResult)]
 }
 
-func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
+func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs, vectorLength int, dst []byte) []byte {
 	if !codeGen {
 		panic("codegen not enabled")
 	}
 	total := inputs * outputs
 
 	// Duplicated in+out
-	wantBytes := total * 32 * 2
+	wantBytes := total * vectorLength * 2
 	if cap(dst) < wantBytes {
 		dst = AllocAligned(1, wantBytes)[0]
 	} else {
 		dst = dst[:wantBytes]
 	}
 	for i, row := range matrixRows[:outputs] {
 		for j, idx := range row[inIdx : inIdx+inputs] {
-			dstIdx := (j*outputs + i) * 64
+			dstIdx := (j*outputs + i) * vectorLength * 2
 			dstPart := dst[dstIdx:]
-			dstPart = dstPart[:64]
+			dstPart = dstPart[:vectorLength*2]
 			lo := mulTableLow[idx][:]
 			hi := mulTableHigh[idx][:]
-			copy(dstPart[:16], lo)
-			copy(dstPart[16:32], lo)
-			copy(dstPart[32:48], hi)
-			copy(dstPart[48:64], hi)
+
+			for k := 0; k < vectorLength; k += 16 {
+				copy(dstPart[k:k+16], lo)
+				copy(dstPart[vectorLength*2-(k+16):vectorLength*2-k], hi)
+			}
 		}
 	}
 	return dst

diff --git a/galois_arm64.go b/galois_arm64.go
@@ -17,8 +17,12 @@ func getVectorLength() (vl, pl uint64)
 
 func init() {
 	if defaultOptions.useSVE {
-		if vl, _ := getVectorLength(); vl != 256 {
-			defaultOptions.useSVE = false // Temp fix: disable SVE for non-256 vector widths (ie Graviton4)
+		if vl, _ := getVectorLength(); vl <= 256 {
+			// set vector length in bytes
+			defaultOptions.vectorLength = int(vl) >> 3
+		} else {
+			// disable SVE for hardware implementations over 256 bits (only known to be Fujitsu A64FX atm)
+			defaultOptions.useSVE = false
 		}
 	}
 }

diff --git a/galois_arm64_test.go b/galois_arm64_test.go
@@ -11,9 +11,9 @@ import (
 
 func TestGenGalois(t *testing.T) {
 	if defaultOptions.useSVE {
-		testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor)
+		testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor, defaultOptions.vectorLength)
 	}
 	if defaultOptions.useNEON {
-		testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor)
+		testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor, 32)
 	}
 }
diff --git a/galois_test.go b/galois_test.go
@@ -235,7 +235,7 @@ func TestSliceGalAdd(t *testing.T) {
 	}
 }
 
-func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) {
+func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) {
 
 	// reference versions
 	galMulSliceRef := func(c byte, in, out []byte) {
@@ -270,7 +270,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f
 		}
 	}
 
-	m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil)
+	m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), vectorLength, nil)
 
 	end := start + f(m, inputs, outputs, start, stop)
 	if end != stop {
@@ -297,7 +297,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f
 	}
 }
 
-func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) {
+func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) {
 
 	// reference version
 	galMulSliceXorRef := func(c byte, in, out []byte) {
@@ -327,7 +327,7 @@ func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int,
 		}
 	}
 
-	m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil)
+	m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), vectorLength, nil)
 
 	end := start + f(m, inputs, outputs, start, stop)
 	if end != stop {
@@ -363,7 +363,7 @@ func testGenGaloisEarlyAbort(t *testing.T, matrixRows [][]byte, size int, f func
 	}
 }
 
-func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int) {
+func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) {
 
 	for output := 1; output <= codeGenMaxOutputs; output++ {
 		for input := 1; input <= codeGenMaxInputs; input++ {
@@ -386,15 +386,15 @@ func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out []
 			const limit = 1024
 			for ; size < limit; size += stepsize {
 				// test full range
-				testGenGalois(t, matrixRows, size, 0, size, f)
-				testGenGaloisXor(t, matrixRows, size, 0, size, fXor)
+				testGenGalois(t, matrixRows, size, 0, size, f, vectorLength)
+				testGenGaloisXor(t, matrixRows, size, 0, size, fXor, vectorLength)
 
 				if size >= stepsize*2 && size < limit-stepsize*2 {
 					start := stepsize
 					stop := size - start
 					// test partial range
-					testGenGalois(t, matrixRows, size, start, stop, f)
-					testGenGaloisXor(t, matrixRows, size, start, stop, fXor)
+					testGenGalois(t, matrixRows, size, start, stop, f, vectorLength)
+					testGenGaloisXor(t, matrixRows, size, start, stop, fXor, vectorLength)
 				}
 			}
 		}

diff --git a/options.go b/options.go
@@ -24,6 +24,7 @@ type options struct {
 	useSSE2,
 	useNEON,
 	useSVE bool
+	vectorLength int
 
 	useJerasureMatrix    bool
 	usePAR1Matrix        bool
@@ -55,6 +56,7 @@ var defaultOptions = options{
 	useAvxGNFI:    cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI),
 	useNEON:       cpuid.CPU.Supports(cpuid.ASIMD),
 	useSVE:        cpuid.CPU.Supports(cpuid.SVE),
+	vectorLength:  32, // default vector length is 32 bytes (256 bits) for AVX2 code gen
 }
 
 // leopardMode controls the use of leopard GF in encoding and decoding.