diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 1be38816..b0204ba7 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -22,6 +22,9 @@ jobs:
       with:
         go-version: ${{ matrix.go-version }}
 
+    - name: CPU support
+      run: cat /proc/cpuinfo || true
+
     - name: Checkout code
       uses: actions/checkout@v2
 
@@ -71,6 +74,11 @@ jobs:
         CGO_ENABLED: 1
       run: go test -tags=noasm -cpu=4 -short -race -timeout 20m .
 
+    - name: Test Races, no gfni
+      env:
+        CGO_ENABLED: 1
+      run: go test -no-gfni -short -race
+
     - name: Test Races, no avx512
       env:
         CGO_ENABLED: 1
diff --git a/reedsolomon.go b/reedsolomon.go
index 05ad882e..451dc352 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -870,11 +870,22 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	gor := r.o.maxGoroutines
 
 	var avx2Matrix []byte
+	var gfniMatrix []uint64
 	useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
-	if useAvx2 {
+	useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
+	if useGFNI {
+		var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64
+		gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
+	} else if useAvx2 {
 		avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
 		defer r.mPool.Put(avx2Matrix)
-	} else if byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
+	} else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
+		r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+		// It appears there is a switchover point at around 10MB where regular
+		// processing is faster; below it, use the GFNI-specific grouped path.
+		r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount)
+		return
+	} else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
 		r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
@@ -888,8 +899,12 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	}
 
 	exec := func(start, stop int) {
-		if useAvx2 && stop-start >= 64 {
-			start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
+		if stop-start >= 64 {
+			if useGFNI {
+				start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
+			} else if useAvx2 {
+				start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
+			}
 		}
 
 		lstart, lstop := start, start+r.o.perRound
@@ -1091,6 +1106,163 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 	wg.Wait()
 }
 
+// codeSomeShardsGFNI computes outputs from inputs and matrixRows with the
+// GFNI kernels when the shards have to be processed in several groups.
+//
+// Inputs and outputs are cut into groups of at most maxAvx2Inputs and
+// maxAvx2Outputs shards. Every (input group, output group) pair gets its own
+// matrix from genGFNIMatrix, and the resulting plan is executed over
+// byteCount bytes, split across goroutines based on r.o.maxGoroutines and
+// r.o.minSplitSize.
+func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int) {
+	var wg sync.WaitGroup
+	gor := r.o.maxGoroutines
+
+	type state struct {
+		input  [][]byte
+		output [][]byte
+		m      []uint64
+		first  bool // set when the input group starts at input index 0
+	}
+	// Build the plan: one entry per (input group, output group) pair.
+	plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+
+	// Flips between input first to output first.
+	// We put the smallest data load in the inner loop.
+	if len(inputs) > len(outputs) {
+		inIdx := 0
+		ins := inputs
+		for len(ins) > 0 {
+			inPer := ins
+			if len(inPer) > maxAvx2Inputs {
+				inPer = inPer[:maxAvx2Inputs]
+			}
+			outs := outputs
+			outIdx := 0
+			for len(outs) > 0 {
+				outPer := outs
+				if len(outPer) > maxAvx2Outputs {
+					outPer = outPer[:maxAvx2Outputs]
+				}
+				// Generate local matrix
+				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
+				plan = append(plan, state{
+					input:  inPer,
+					output: outPer,
+					m:      m,
+					first:  inIdx == 0,
+				})
+				outIdx += len(outPer)
+				outs = outs[len(outPer):]
+			}
+			inIdx += len(inPer)
+			ins = ins[len(inPer):]
+		}
+	} else {
+		outs := outputs
+		outIdx := 0
+		for len(outs) > 0 {
+			outPer := outs
+			if len(outPer) > maxAvx2Outputs {
+				outPer = outPer[:maxAvx2Outputs]
+			}
+
+			inIdx := 0
+			ins := inputs
+			for len(ins) > 0 {
+				inPer := ins
+				if len(inPer) > maxAvx2Inputs {
+					inPer = inPer[:maxAvx2Inputs]
+				}
+				// Generate local matrix
+				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
+				plan = append(plan, state{
+					input:  inPer,
+					output: outPer,
+					m:      m,
+					first:  inIdx == 0,
+				})
+				inIdx += len(inPer)
+				ins = ins[len(inPer):]
+			}
+			outIdx += len(outPer)
+			outs = outs[len(outPer):]
+		}
+	}
+
+	do := byteCount / gor
+	if do < r.o.minSplitSize {
+		do = r.o.minSplitSize
+	}
+
+	exec := func(start, stop int) {
+		lstart, lstop := start, start+r.o.perRound
+		if lstop > stop {
+			lstop = stop
+		}
+		for lstart < stop {
+			if lstop-lstart >= minAvx2Size {
+				// Execute plan: entries flagged first call galMulSlicesGFNI,
+				// all others call the Xor variant.
+				for _, p := range plan {
+					if p.first {
+						galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
+					} else {
+						galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
+					}
+				}
+				// Advance by the masked length; a remainder is left to the loop below.
+				lstart += (lstop - lstart) & avxSizeMask
+				if lstart == lstop {
+					lstop += r.o.perRound
+					if lstop > stop {
+						lstop = stop
+					}
+					continue
+				}
+			}
+
+			// Generic per-coefficient path for the rest of this round.
+			for c := range inputs {
+				in := inputs[c][lstart:lstop]
+				for iRow := 0; iRow < len(outputs); iRow++ {
+					if c == 0 {
+						galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+					} else {
+						galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
+					}
+				}
+			}
+			lstart = lstop
+			lstop += r.o.perRound
+			if lstop > stop {
+				lstop = stop
+			}
+		}
+		wg.Done()
+	}
+	// Single goroutine: run inline; wg.Add balances the wg.Done in exec.
+	if gor == 1 {
+		wg.Add(1)
+		exec(0, byteCount)
+		return
+	}
+
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
+	start := 0
+	for start < byteCount {
+		if start+do > byteCount {
+			do = byteCount - start
+		}
+
+		wg.Add(1)
+		go exec(start, start+do)
+		start += do
+	}
+	wg.Wait()
+}
+
 // checkSomeShards is mostly the same as codeSomeShards,
 // except this will check values and return
 // as soon as a difference is found.
diff --git a/reedsolomon_test.go b/reedsolomon_test.go
index c77fa54a..e4303934 100644
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@@ -24,6 +24,7 @@ var noSSE2 = flag.Bool("no-sse2", !defaultOptions.useSSE2, "Disable SSE2")
 var noSSSE3 = flag.Bool("no-ssse3", !defaultOptions.useSSSE3, "Disable SSSE3")
 var noAVX2 = flag.Bool("no-avx2", !defaultOptions.useAVX2, "Disable AVX2")
 var noAVX512 = flag.Bool("no-avx512", !defaultOptions.useAVX512, "Disable AVX512")
+var noGFNI = flag.Bool("no-gfni", !defaultOptions.useGFNI, "Disable AVX512+GFNI")
 
 func TestMain(m *testing.M) {
 	flag.Parse()
@@ -44,6 +45,9 @@ func testOptions(o ...Option) []Option {
 	if *noAVX512 {
 		o = append(o, WithAVX512(false))
 	}
+	if *noGFNI {
+		o = append(o, WithGFNI(false))
+	}
 	return o
 }
 