diff --git a/galois.go b/galois.go index 9b36395..bbc521f 100644 --- a/galois.go +++ b/galois.go @@ -910,14 +910,14 @@ func galExp(a byte, n int) byte { return expTable[uint8(logResult)] } -func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { +func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs, vectorLength int, dst []byte) []byte { if !codeGen { panic("codegen not enabled") } total := inputs * outputs // Duplicated in+out - wantBytes := total * 32 * 2 + wantBytes := total * vectorLength * 2 if cap(dst) < wantBytes { dst = AllocAligned(1, wantBytes)[0] } else { @@ -925,15 +925,16 @@ func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byt } for i, row := range matrixRows[:outputs] { for j, idx := range row[inIdx : inIdx+inputs] { - dstIdx := (j*outputs + i) * 64 + dstIdx := (j*outputs + i) * vectorLength * 2 dstPart := dst[dstIdx:] - dstPart = dstPart[:64] + dstPart = dstPart[:vectorLength*2] lo := mulTableLow[idx][:] hi := mulTableHigh[idx][:] - copy(dstPart[:16], lo) - copy(dstPart[16:32], lo) - copy(dstPart[32:48], hi) - copy(dstPart[48:64], hi) + + for k := 0; k < vectorLength; k += 16 { + copy(dstPart[k:k+16], lo) + copy(dstPart[vectorLength*2-(k+16):vectorLength*2-k], hi) + } } } return dst diff --git a/galois_arm64.go b/galois_arm64.go index 08f1ae8..e34f39a 100644 --- a/galois_arm64.go +++ b/galois_arm64.go @@ -17,8 +17,12 @@ func getVectorLength() (vl, pl uint64) func init() { if defaultOptions.useSVE { - if vl, _ := getVectorLength(); vl != 256 { - defaultOptions.useSVE = false // Temp fix: disable SVE for non-256 vector widths (ie Graviton4) + if vl, _ := getVectorLength(); vl <= 256 { + // set vector length in bytes + defaultOptions.vectorLength = int(vl) >> 3 + } else { + // disable SVE for hardware implementations over 256 bits (only known to be Fujitsu A64FX atm) + defaultOptions.useSVE = false } } } diff --git a/galois_arm64_test.go b/galois_arm64_test.go index 
736d46b..a096e8a 100644 --- a/galois_arm64_test.go +++ b/galois_arm64_test.go @@ -11,9 +11,9 @@ import ( func TestGenGalois(t *testing.T) { if defaultOptions.useSVE { - testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor) + testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor, defaultOptions.vectorLength) } if defaultOptions.useNEON { - testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor) + testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor, 32) } } diff --git a/galois_test.go b/galois_test.go index 580b216..4b151b2 100644 --- a/galois_test.go +++ b/galois_test.go @@ -235,7 +235,7 @@ func TestSliceGalAdd(t *testing.T) { } } -func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { +func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) { // reference versions galMulSliceRef := func(c byte, in, out []byte) { @@ -270,7 +270,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f } } - m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil) + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), vectorLength, nil) end := start + f(m, inputs, outputs, start, stop) if end != stop { @@ -297,7 +297,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f } } -func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { +func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) { // reference version galMulSliceXorRef := func(c byte, in, out []byte) { @@ -327,7 +327,7 @@ func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, } } - m := 
genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil) + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), vectorLength, nil) end := start + f(m, inputs, outputs, start, stop) if end != stop { @@ -363,7 +363,7 @@ func testGenGaloisEarlyAbort(t *testing.T, matrixRows [][]byte, size int, f func } } -func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int) { +func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int, vectorLength int) { for output := 1; output <= codeGenMaxOutputs; output++ { for input := 1; input <= codeGenMaxInputs; input++ { @@ -386,15 +386,15 @@ func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [] const limit = 1024 for ; size < limit; size += stepsize { // test full range - testGenGalois(t, matrixRows, size, 0, size, f) - testGenGaloisXor(t, matrixRows, size, 0, size, fXor) + testGenGalois(t, matrixRows, size, 0, size, f, vectorLength) + testGenGaloisXor(t, matrixRows, size, 0, size, fXor, vectorLength) if size >= stepsize*2 && size < limit-stepsize*2 { start := stepsize stop := size - start // test partial range - testGenGalois(t, matrixRows, size, start, stop, f) - testGenGaloisXor(t, matrixRows, size, start, stop, fXor) + testGenGalois(t, matrixRows, size, start, stop, f, vectorLength) + testGenGaloisXor(t, matrixRows, size, start, stop, fXor, vectorLength) } } } diff --git a/options.go b/options.go index 377137e..cde2555 100644 --- a/options.go +++ b/options.go @@ -24,6 +24,7 @@ type options struct { useSSE2, useNEON, useSVE bool + vectorLength int useJerasureMatrix bool usePAR1Matrix bool @@ -55,6 +56,7 @@ var defaultOptions = options{ useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), useNEON: cpuid.CPU.Supports(cpuid.ASIMD), useSVE: cpuid.CPU.Supports(cpuid.SVE), + vectorLength: 32, // default vector length is 32 bytes (256 bits) for AVX2 code gen } // leopardMode controls the use of 
leopard GF in encoding and decoding.