Skip to content

Commit

Permalink
Add to more
Browse files Browse the repository at this point in the history
  • Loading branch information
klauspost committed Nov 15, 2022
1 parent f374174 commit e54b3ff
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 4 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ jobs:
with:
go-version: ${{ matrix.go-version }}

- name: CPU support
run: cat /proc/cpuinfo || true

- name: Checkout code
uses: actions/checkout@v2

Expand Down Expand Up @@ -71,6 +74,11 @@ jobs:
CGO_ENABLED: 1
run: go test -tags=noasm -cpu=4 -short -race -timeout 20m .

- name: Test Races, no gfni
env:
CGO_ENABLED: 1
run: go test -no-gfni -short -race

- name: Test Races, no avx512
env:
CGO_ENABLED: 1
Expand Down
171 changes: 167 additions & 4 deletions reedsolomon.go
Original file line number Diff line number Diff line change
Expand Up @@ -870,11 +870,22 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
gor := r.o.maxGoroutines

var avx2Matrix []byte
var gfniMatrix []uint64
useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
if useAvx2 {
useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
if useGFNI {
var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64
gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
} else if useAvx2 {
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
defer r.mPool.Put(avx2Matrix)
} else if byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
} else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
// It appears there is a switchover point at around 10MB where
// Regular processing is faster...
r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
return
} else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
// It appears there is a switchover point at around 10MB where
// Regular processing is faster...
Expand All @@ -888,8 +899,12 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
}

exec := func(start, stop int) {
if useAvx2 && stop-start >= 64 {
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
if stop-start >= 64 {
if useGFNI {
start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
} else if useAvx2 {
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
}
}

lstart, lstop := start, start+r.o.perRound
Expand Down Expand Up @@ -1091,6 +1106,154 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
wg.Wait()
}

// codeSomeShardsGFNI performs the same work as codeSomeShards, but splits the
// byte range [0, byteCount) over several goroutines and runs the GFNI
// kernels on tiles of at most maxAvx2Inputs x maxAvx2Outputs shards.
//
// matrixRows holds one coefficient row per output shard; inputs and outputs
// are the shards that are read and written.
func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int) {
	var wg sync.WaitGroup
	gor := r.o.maxGoroutines

	// state is a single step of the plan: one tile of input shards, one tile
	// of output shards and the matrix generated for that pair.
	type state struct {
		input  [][]byte
		output [][]byte
		m      []uint64
		// first is set for tiles that start at input 0. Those are run with
		// galMulSlicesGFNI, every other tile with galMulSlicesGFNIXor.
		first bool
	}

	// Make a plan. Its capacity is the number of input tiles times the
	// number of output tiles.
	inTiles := (len(inputs) + maxAvx2Inputs - 1) / maxAvx2Inputs
	outTiles := (len(outputs) + maxAvx2Outputs - 1) / maxAvx2Outputs
	plan := make([]state, 0, inTiles*outTiles)

	// chunk returns the shards starting at from, limited to at most limit.
	chunk := func(shards [][]byte, from, limit int) [][]byte {
		rest := shards[from:]
		if len(rest) > limit {
			rest = rest[:limit]
		}
		return rest
	}

	// addStep generates the local matrix for one tile pair and appends the
	// resulting step to the plan.
	addStep := func(inOff, outOff int, in, out [][]byte) {
		m := genGFNIMatrix(matrixRows[outOff:], len(in), inOff, len(out), make([]uint64, len(in)*len(out)))
		plan = append(plan, state{
			input:  in,
			output: out,
			m:      m,
			first:  inOff == 0,
		})
	}

	// The nesting flips between input first and output first so that the
	// smallest data load ends up in the inner loop.
	if len(inputs) > len(outputs) {
		for inOff := 0; inOff < len(inputs); {
			in := chunk(inputs, inOff, maxAvx2Inputs)
			for outOff := 0; outOff < len(outputs); {
				out := chunk(outputs, outOff, maxAvx2Outputs)
				addStep(inOff, outOff, in, out)
				outOff += len(out)
			}
			inOff += len(in)
		}
	} else {
		for outOff := 0; outOff < len(outputs); {
			out := chunk(outputs, outOff, maxAvx2Outputs)
			for inOff := 0; inOff < len(inputs); {
				in := chunk(inputs, inOff, maxAvx2Inputs)
				addStep(inOff, outOff, in, out)
				inOff += len(in)
			}
			outOff += len(out)
		}
	}

	// Bytes handed to each goroutine, never less than the minimum split size.
	do := byteCount / gor
	if do < r.o.minSplitSize {
		do = r.o.minSplitSize
	}

	// exec encodes the bytes in [start, stop), working in rounds of at most
	// r.o.perRound bytes, and signals the wait group when it is finished.
	exec := func(start, stop int) {
		// clamp keeps the end of a round inside the range.
		clamp := func(v int) int {
			if v > stop {
				return stop
			}
			return v
		}

		lstart, lstop := start, clamp(start+r.o.perRound)
		for lstart < stop {
			// roundDone reports that the plan consumed the entire round.
			roundDone := false
			if lstop-lstart >= minAvx2Size {
				// Execute plan...
				for i := range plan {
					step := &plan[i]
					if step.first {
						galMulSlicesGFNI(step.m, step.input, step.output, lstart, lstop)
					} else {
						galMulSlicesGFNIXor(step.m, step.input, step.output, lstart, lstop)
					}
				}
				// Skip the part of the round selected by avxSizeMask;
				// whatever is left is handled below.
				lstart += (lstop - lstart) & avxSizeMask
				roundDone = lstart == lstop
			}

			if !roundDone {
				// Plain per-coefficient path for the rest of the round:
				// input 0 goes through galMulSlice, later inputs through
				// galMulSliceXor.
				for c, shard := range inputs {
					in := shard[lstart:lstop]
					for iRow, out := range outputs {
						if c == 0 {
							galMulSlice(matrixRows[iRow][c], in, out[lstart:lstop], &r.o)
						} else {
							galMulSliceXor(matrixRows[iRow][c], in, out[lstart:lstop], &r.o)
						}
					}
				}
				lstart = lstop
			}
			lstop = clamp(lstop + r.o.perRound)
		}
		wg.Done()
	}

	// With a single goroutine allowed, do everything on the calling one.
	if gor == 1 {
		wg.Add(1)
		exec(0, byteCount)
		return
	}

	// Make sizes divisible by 64
	do = (do + 63) & (^63)
	for start := 0; start < byteCount; start += do {
		// The last piece only covers what remains.
		if remaining := byteCount - start; do > remaining {
			do = remaining
		}

		wg.Add(1)
		go exec(start, start+do)
	}
	wg.Wait()
}

// checkSomeShards is mostly the same as codeSomeShards,
// except this will check values and return
// as soon as a difference is found.
Expand Down
4 changes: 4 additions & 0 deletions reedsolomon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ var noSSE2 = flag.Bool("no-sse2", !defaultOptions.useSSE2, "Disable SSE2")
var noSSSE3 = flag.Bool("no-ssse3", !defaultOptions.useSSSE3, "Disable SSSE3")
var noAVX2 = flag.Bool("no-avx2", !defaultOptions.useAVX2, "Disable AVX2")
var noAVX512 = flag.Bool("no-avx512", !defaultOptions.useAVX512, "Disable AVX512")
var noGNFI = flag.Bool("no-gfni", !defaultOptions.useGFNI, "Disable AVX512+GFNI")

func TestMain(m *testing.M) {
flag.Parse()
Expand All @@ -44,6 +45,9 @@ func testOptions(o ...Option) []Option {
if *noAVX512 {
o = append(o, WithAVX512(false))
}
if *noGNFI {
o = append(o, WithGFNI(false))
}
return o
}

Expand Down

0 comments on commit e54b3ff

Please sign in to comment.