From bf0ce8a3e12d2a0337dd7f1158657774947e7151 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 15 Nov 2022 16:33:55 +0100 Subject: [PATCH] Add AVX512 GFNI processing WIP --- _gen/cleanup.go | 1 + _gen/gen.go | 271 + galois.go | 18 + galois_gen_amd64.go | 1106 +- galois_gen_amd64.s | 34886 +++++++++++++++++++++++++++++++++++ galois_gen_switch_amd64.go | 676 + leopard8.go | 4 - options.go | 27 +- reedsolomon.go | 31 +- 9 files changed, 36999 insertions(+), 21 deletions(-) diff --git a/_gen/cleanup.go b/_gen/cleanup.go index afdde013..36efe4be 100644 --- a/_gen/cleanup.go +++ b/_gen/cleanup.go @@ -25,6 +25,7 @@ func main() { } data = bytes.ReplaceAll(data, []byte("\t// #"), []byte("#")) data = bytes.ReplaceAll(data, []byte("\t// @"), []byte("")) + data = bytes.ReplaceAll(data, []byte("VALIGNQ"), []byte("VGF2P8AFFINEQB")) data = bytes.ReplaceAll(data, []byte("VPTERNLOGQ"), []byte("XOR3WAY(")) split := bytes.Split(data, []byte("\n")) // Add closing ')' diff --git a/_gen/gen.go b/_gen/gen.go index 2e225b07..ff65add6 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -29,6 +29,9 @@ const outputMax = 10 var switchDefs [inputMax][outputMax]string var switchDefsX [inputMax][outputMax]string +var switchDefs512 [inputMax][outputMax]string +var switchDefsX512 [inputMax][outputMax]string + // Prefetch offsets, set to 0 to disable. // Disabled since they appear to be consistently slower. const prefetchSrc = 0 @@ -58,6 +61,8 @@ func main() { for j := 1; j <= outputMax; j++ { genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) + genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64", i, j), i, j, false) + genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64Xor", i, j), i, j, true) genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) } @@ -131,6 +136,48 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } `) + + w.WriteString(` + +func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop-start) & avxSizeMask + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefs512[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop-start) & avxSizeMask + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsX512[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} +`) + genGF16() genGF8() Generate() @@ -657,3 +704,227 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { Label(name + "_end") RET() } + +func genMulAvx512GFNI(name string, inputs int, outputs int, xor bool) { + const perLoopBits = 6 + const perLoop = 1 << perLoopBits + + total := inputs * outputs + + doc := []string{ + fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs), + } + if !xor { + doc = 
append(doc, "The output is initialized to 0.") + } + + // Load shuffle masks on every use. + var loadNone bool + // Use registers for destination registers. + var regDst = true + var reloadLength = false + + est := total + outputs + 2 + + if est > 32 { + loadNone = true + // We run out of GP registers first, now. + if inputs+outputs > 13 { + regDst = false + } + // Save one register by reloading length. + if inputs+outputs > 12 && regDst { + reloadLength = true + } + } + + TEXT(name, 0, fmt.Sprintf("func(matrix []uint64, in [][]byte, out [][]byte, start, n int)")) + x := "" + if xor { + x = "Xor" + } + // SWITCH DEFINITION: + //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) + s := fmt.Sprintf(" mulGFNI_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x) + s += fmt.Sprintf("\t\t\t\treturn n\n") + if xor { + switchDefsX512[inputs-1][outputs-1] = s + } else { + switchDefs512[inputs-1][outputs-1] = s + } + + if loadNone { + Comment("Loading no tables to registers") + } else { + // loadNone == false + Comment("Loading all tables to registers") + } + if regDst { + Comment("Destination kept in GP registers") + } else { + Comment("Destination kept on stack") + } + + Doc(doc...) + Pragma("noescape") + Commentf("Full registers estimated %d YMM used", est) + + length := Load(Param("n"), GP64()) + matrixBase := GP64() + addr, err := Param("matrix").Base().Resolve() + if err != nil { + panic(err) + } + MOVQ(addr.Addr, matrixBase) + SHRQ(U8(perLoopBits), length) + TESTQ(length, length) + JZ(LabelRef(name + "_end")) + + matrix := make([]reg.VecVirtual, total) + + for i := range matrix { + if loadNone { + break + } + table := ZMM() + VBROADCASTF32X2(Mem{Base: matrixBase, Disp: i * 8}, table) + matrix[i] = table + } + + inPtrs := make([]reg.GPVirtual, inputs) + inSlicePtr := GP64() + addr, err = Param("in").Base().Resolve() + if err != nil { + panic(err) + } + MOVQ(addr.Addr, inSlicePtr) + for i := range inPtrs { + ptr := GP64() + MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr) + inPtrs[i] = ptr + } + // Destination + dst := make([]reg.VecVirtual, outputs) + dstPtr := make([]reg.GPVirtual, outputs) + addr, err = Param("out").Base().Resolve() + if err != nil { + panic(err) + } + outBase := addr.Addr + outSlicePtr := GP64() + MOVQ(addr.Addr, outSlicePtr) + MOVQ(outBase, outSlicePtr) + for i := range dst { + dst[i] = ZMM() + if !regDst { + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + dstPtr[i] = ptr + } + + offset := GP64() + addr, err = Param("start").Resolve() + if err != nil { + panic(err) + } + + MOVQ(addr.Addr, offset) + if regDst { + Comment("Add start offset to output") + for _, ptr := range dstPtr { + ADDQ(offset, ptr) + } + } + + Comment("Add start offset to input") + for _, ptr := range inPtrs { + ADDQ(offset, ptr) + } + // Offset no longer needed unless not regdst + + if reloadLength { + Commentf("Reload length to save a register") + length = Load(Param("n"), GP64()) + SHRQ(U8(perLoopBits), length) + } + Label(name + "_loop") + + if xor { + Commentf("Load %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU64(Mem{Base: dstPtr[i]}, dst[i]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) + } + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU64(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + } + + in := ZMM() + look := ZMM() + for i := range inPtrs { 
+ Commentf("Load and process 64 bytes from input %d to %d outputs", i, outputs) + VMOVDQU64(Mem{Base: inPtrs[i]}, in) + if prefetchSrc > 0 { + PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc}) + } + ADDQ(U8(perLoop), inPtrs[i]) + + for j := range dst { + if loadNone { + VBROADCASTF32X2(Mem{Base: matrixBase, Disp: 8 * (i*outputs + j)}, look) + if i == 0 && !xor { + VALIGNQ(U8(0), in, look, dst[j]) + } else { + VALIGNQ(U8(0), in, look, look) + VXORPD(dst[j], look, dst[j]) + } + } else { + if i == 0 && !xor { + VALIGNQ(U8(0), in, matrix[i*outputs+j], dst[j]) + } else { + VALIGNQ(U8(0), in, matrix[i*outputs+j], look) + VXORPD(dst[j], look, dst[j]) + } + } + } + } + Commentf("Store %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU64(dst[i], Mem{Base: dstPtr[i]}) + if prefetchDst > 0 && !xor { + PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) + } + ADDQ(U8(perLoop), dstPtr[i]) + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU64(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) + if prefetchDst > 0 && !xor { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + Comment("Prepare for next loop") + if !regDst { + ADDQ(U8(perLoop), offset) + } + DECQ(length) + JNZ(LabelRef(name + "_loop")) + VZEROUPPER() + + Label(name + "_end") + RET() +} diff --git a/galois.go b/galois.go index 703f2091..a940e803 100644 --- a/galois.go +++ b/galois.go @@ -932,6 +932,24 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) return dst } +var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 
0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f} + +func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 { + if !avx2CodeGen { + panic("codegen not enabled") + } + total := inputs * outputs + + // Duplicated in+out + dst = dst[:total] + for i, row := range matrixRows[:outputs] { + for j, idx := range row[inIdx : inIdx+inputs] { + dst[j*outputs+i] = gf2p811dMulMatrices[idx] + } + } + return dst +} + // xor slices writing to out. 
func sliceXorGo(in, out []byte, _ *options) { for len(out) >= 32 { diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 664a34c3..596ecb04 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -18,6 +18,17 @@ func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. // //go:noescape @@ -40,6 +51,17 @@ func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. // //go:noescape @@ -62,6 +84,17 @@ func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. // //go:noescape @@ -78,6 +111,17 @@ func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. // //go:noescape @@ -89,6 +133,17 @@ func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. 
// //go:noescape @@ -100,6 +155,17 @@ func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. // //go:noescape @@ -111,6 +177,17 @@ func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. // //go:noescape @@ -122,6 +199,17 @@ func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. // //go:noescape @@ -133,6 +221,17 @@ func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. // //go:noescape @@ -144,6 +243,17 @@ func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. // //go:noescape @@ -161,6 +271,17 @@ func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. // //go:noescape @@ -183,6 +304,17 @@ func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. // //go:noescape @@ -205,6 +337,17 @@ func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. // //go:noescape @@ -221,6 +364,17 @@ func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. // //go:noescape @@ -232,6 +386,17 @@ func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. // //go:noescape @@ -243,6 +408,17 @@ func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. 
// //go:noescape @@ -254,6 +430,17 @@ func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. // //go:noescape @@ -265,6 +452,17 @@ func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. // //go:noescape @@ -276,6 +474,17 @@ func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. // //go:noescape @@ -287,6 +496,17 @@ func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. // //go:noescape @@ -304,6 +524,17 @@ func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. // //go:noescape @@ -326,6 +557,17 @@ func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. // //go:noescape @@ -348,6 +590,17 @@ func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. // //go:noescape @@ -364,6 +617,17 @@ func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. // //go:noescape @@ -375,6 +639,17 @@ func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. // //go:noescape @@ -386,6 +661,17 @@ func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. // //go:noescape @@ -397,6 +683,17 @@ func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. 
// //go:noescape @@ -408,6 +705,17 @@ func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. // //go:noescape @@ -419,6 +727,17 @@ func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. // //go:noescape @@ -430,6 +749,17 @@ func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. // //go:noescape @@ -447,6 +777,17 @@ func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. // //go:noescape @@ -469,6 +810,17 @@ func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. // //go:noescape @@ -491,6 +843,17 @@ func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. // //go:noescape @@ -507,6 +870,17 @@ func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. // //go:noescape @@ -518,6 +892,17 @@ func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. // //go:noescape @@ -529,6 +914,17 @@ func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. // //go:noescape @@ -540,17 +936,39 @@ func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. +// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. // //go:noescape -func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. +// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. // //go:noescape @@ -562,6 +980,17 @@ func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. // //go:noescape @@ -573,6 +1002,17 @@ func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. // //go:noescape @@ -590,6 +1030,17 @@ func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. // //go:noescape @@ -612,6 +1063,17 @@ func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. // //go:noescape @@ -634,6 +1096,17 @@ func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. 
// //go:noescape @@ -650,6 +1123,17 @@ func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. // //go:noescape @@ -661,6 +1145,17 @@ func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. // //go:noescape @@ -672,6 +1167,17 @@ func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. // //go:noescape @@ -683,6 +1189,17 @@ func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. // //go:noescape @@ -694,6 +1211,17 @@ func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. // //go:noescape @@ -705,6 +1233,17 @@ func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. // //go:noescape @@ -716,6 +1255,17 @@ func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. // //go:noescape @@ -733,6 +1283,17 @@ func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. // //go:noescape @@ -755,6 +1316,17 @@ func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. // //go:noescape @@ -777,6 +1349,17 @@ func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. // //go:noescape @@ -793,6 +1376,17 @@ func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. 
// //go:noescape @@ -804,6 +1398,17 @@ func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. // //go:noescape @@ -815,6 +1420,17 @@ func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. // //go:noescape @@ -826,6 +1442,17 @@ func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. // //go:noescape @@ -837,6 +1464,17 @@ func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. // //go:noescape @@ -848,6 +1486,17 @@ func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. // //go:noescape @@ -859,6 +1508,17 @@ func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. // //go:noescape @@ -876,6 +1536,17 @@ func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. // //go:noescape @@ -898,6 +1569,17 @@ func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. // //go:noescape @@ -920,6 +1602,17 @@ func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. // //go:noescape @@ -936,6 +1629,17 @@ func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. // //go:noescape @@ -947,6 +1651,17 @@ func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. 
// //go:noescape @@ -958,6 +1673,17 @@ func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. // //go:noescape @@ -969,6 +1695,17 @@ func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. // //go:noescape @@ -980,6 +1717,17 @@ func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. // //go:noescape @@ -991,6 +1739,17 @@ func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. // //go:noescape @@ -1002,6 +1761,17 @@ func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. // //go:noescape @@ -1019,6 +1789,17 @@ func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. // //go:noescape @@ -1041,6 +1822,17 @@ func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. // //go:noescape @@ -1063,6 +1855,17 @@ func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. // //go:noescape @@ -1079,6 +1882,17 @@ func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. // //go:noescape @@ -1090,6 +1904,17 @@ func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. // //go:noescape @@ -1101,6 +1926,17 @@ func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. 
// //go:noescape @@ -1112,6 +1948,17 @@ func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. // //go:noescape @@ -1123,6 +1970,17 @@ func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. // //go:noescape @@ -1134,6 +1992,17 @@ func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. // //go:noescape @@ -1145,6 +2014,17 @@ func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. // //go:noescape @@ -1162,6 +2042,17 @@ func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. // //go:noescape @@ -1184,6 +2075,17 @@ func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. 
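+//
+// Call-site sketch (an editor's illustration; real callers go through
+// the generated switch in galois_gen_switch_amd64.go): start and n are
+// byte offsets into every shard, and the kernel only processes whole
+// 64-byte blocks (one ZMM load/store each), so n is rounded down first:
+//
+//	n := (stop - start) &^ 63 // floor to the 64-byte ZMM block size
+//	mulGFNI_9x2_64(matrix, in, out, start, n)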
+// +//go:noescape +func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. // //go:noescape @@ -1206,6 +2108,17 @@ func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. // //go:noescape @@ -1222,6 +2135,17 @@ func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. // //go:noescape @@ -1233,6 +2157,17 @@ func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. // //go:noescape @@ -1244,6 +2179,17 @@ func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. // //go:noescape @@ -1255,6 +2201,17 @@ func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. 
// //go:noescape @@ -1266,6 +2223,17 @@ func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. // //go:noescape @@ -1277,6 +2245,17 @@ func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. // //go:noescape @@ -1288,6 +2267,17 @@ func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. // //go:noescape @@ -1305,6 +2295,17 @@ func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. // //go:noescape @@ -1327,6 +2328,17 @@ func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. // //go:noescape @@ -1349,6 +2361,17 @@ func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. 
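+//
+// Editor's note: multiplication by a fixed constant is linear over
+// GF(2), which is what lets each coefficient be packed into a single
+// 64-bit bit matrix and applied to 64 input bytes at once by
+// VGF2P8AFFINEQB. Worked example, assuming the field polynomial 0x11D
+// used for GF(2^8) here: doubling 0x80 shifts the top bit out and
+// reduces, (0x80 << 1) ^ 0x11D = 0x1D, so 2 * 0x80 = 0x1D.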
+// +//go:noescape +func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. // //go:noescape @@ -1365,6 +2388,17 @@ func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n //go:noescape func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. // //go:noescape @@ -1376,6 +2410,17 @@ func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. // //go:noescape @@ -1387,6 +2432,17 @@ func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. // //go:noescape @@ -1398,6 +2454,17 @@ func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. // //go:noescape @@ -1409,6 +2476,17 @@ func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. 
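+//
+// Unlike mulGFNI_10x8_64, the outputs are not initialized: the existing
+// output blocks are loaded and the new products are XORed into them
+// (out[j][x] ^= s rather than out[j][x] = s in scalar terms), so
+// partial parity can be accumulated across calls.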
+// +//go:noescape +func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. // //go:noescape @@ -1420,6 +2498,17 @@ func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. // //go:noescape @@ -1431,6 +2520,17 @@ func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. // //go:noescape diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index e9736998..e55d869a 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -129,6 +129,96 @@ mulAvxTwo_1x1_64_loop: mulAvxTwo_1x1_64_end: RET +// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z1 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z0, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64_loop + VZEROUPPER + +mulGFNI_1x1_64_end: + RET + +// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DX), Z1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z2 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z0, Z2 + VXORPD Z1, Z2, Z1 + + // Store 1 outputs + VMOVDQU64 
Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64Xor_loop + VZEROUPPER + +mulGFNI_1x1_64Xor_end: + RET + // func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 @@ -378,6 +468,110 @@ mulAvxTwo_1x2_64_loop: mulAvxTwo_1x2_64_end: RET +// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z0, Z2 + VGF2P8AFFINEQB $0x00, Z3, Z1, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64_loop + VZEROUPPER + +mulGFNI_1x2_64_end: + RET + +// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (BX), Z2 + VMOVDQU64 (DX), Z3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z0, Z5 + VXORPD Z2, Z5, Z2 + VGF2P8AFFINEQB $0x00, Z4, Z1, Z5 + VXORPD Z3, Z5, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64Xor_loop + VZEROUPPER + +mulGFNI_1x2_64Xor_end: + RET + // func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 @@ -674,6 +868,124 @@ mulAvxTwo_1x3_64_loop: mulAvxTwo_1x3_64_end: RET +// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + 
MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z0, Z3 + VGF2P8AFFINEQB $0x00, Z5, Z1, Z4 + VGF2P8AFFINEQB $0x00, Z5, Z2, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64_loop + VZEROUPPER + +mulGFNI_1x3_64_end: + RET + +// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (BX), Z3 + VMOVDQU64 (SI), Z4 + VMOVDQU64 (DX), Z5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z0, Z7 + VXORPD Z3, Z7, Z3 + VGF2P8AFFINEQB $0x00, Z6, Z1, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z6, Z2, Z7 + VXORPD Z5, Z7, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64Xor_loop + VZEROUPPER + +mulGFNI_1x3_64Xor_end: + RET + // func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 @@ -918,6 +1230,138 @@ mulAvxTwo_1x4_loop: mulAvxTwo_1x4_end: RET +// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z0, Z4 + VGF2P8AFFINEQB $0x00, Z7, Z1, Z5 + VGF2P8AFFINEQB $0x00, Z7, Z2, Z6 + VGF2P8AFFINEQB $0x00, Z7, Z3, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ 
mulGFNI_1x4_64_loop + VZEROUPPER + +mulGFNI_1x4_64_end: + RET + +// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (BX), Z4 + VMOVDQU64 (SI), Z5 + VMOVDQU64 (DI), Z6 + VMOVDQU64 (DX), Z7 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z0, Z9 + VXORPD Z4, Z9, Z4 + VGF2P8AFFINEQB $0x00, Z8, Z1, Z9 + VXORPD Z5, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z8, Z2, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z3, Z9 + VXORPD Z7, Z9, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64Xor_loop + VZEROUPPER + +mulGFNI_1x4_64Xor_end: + RET + // func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 @@ -1087,6 +1531,152 @@ mulAvxTwo_1x5_loop: mulAvxTwo_1x5_end: RET +// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z0, Z5 + VGF2P8AFFINEQB $0x00, Z9, Z1, Z6 + VGF2P8AFFINEQB $0x00, Z9, Z2, Z7 + VGF2P8AFFINEQB $0x00, Z9, Z3, Z8 + VGF2P8AFFINEQB $0x00, Z9, Z4, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64_loop + VZEROUPPER + +mulGFNI_1x5_64_end: + RET + +// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 + // Loading all tables 
to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (BX), Z5 + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DX), Z9 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z0, Z11 + VXORPD Z5, Z11, Z5 + VGF2P8AFFINEQB $0x00, Z10, Z1, Z11 + VXORPD Z6, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z10, Z2, Z11 + VXORPD Z7, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z10, Z3, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z4, Z11 + VXORPD Z9, Z11, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64Xor_loop + VZEROUPPER + +mulGFNI_1x5_64Xor_end: + RET + // func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 @@ -1275,6 +1865,166 @@ mulAvxTwo_1x6_loop: mulAvxTwo_1x6_end: RET +// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z11, Z0, Z6 + VGF2P8AFFINEQB $0x00, Z11, Z1, Z7 + VGF2P8AFFINEQB $0x00, Z11, Z2, Z8 + VGF2P8AFFINEQB $0x00, Z11, Z3, Z9 + VGF2P8AFFINEQB $0x00, Z11, Z4, Z10 + VGF2P8AFFINEQB $0x00, Z11, Z5, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64_loop + VZEROUPPER + +mulGFNI_1x6_64_end: + RET + +// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// 
Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (BX), Z6 + VMOVDQU64 (SI), Z7 + VMOVDQU64 (DI), Z8 + VMOVDQU64 (R8), Z9 + VMOVDQU64 (R9), Z10 + VMOVDQU64 (DX), Z11 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z13 + VXORPD Z6, Z13, Z6 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z13 + VXORPD Z7, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z12, Z2, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z12, Z3, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z11, Z13, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64Xor_loop + VZEROUPPER + +mulGFNI_1x6_64Xor_end: + RET + // func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 @@ -1482,6 +2232,180 @@ mulAvxTwo_1x7_loop: mulAvxTwo_1x7_end: RET +// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z13 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z13, Z0, Z7 + VGF2P8AFFINEQB $0x00, Z13, Z1, Z8 + VGF2P8AFFINEQB $0x00, Z13, Z2, Z9 + VGF2P8AFFINEQB $0x00, Z13, Z3, Z10 + VGF2P8AFFINEQB $0x00, Z13, Z4, Z11 + VGF2P8AFFINEQB $0x00, Z13, Z5, Z12 + VGF2P8AFFINEQB $0x00, Z13, Z6, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + 
VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64_loop + VZEROUPPER + +mulGFNI_1x7_64_end: + RET + +// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (BX), Z7 + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (R9), Z11 + VMOVDQU64 (R10), Z12 + VMOVDQU64 (DX), Z13 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z0, Z15 + VXORPD Z7, Z15, Z7 + VGF2P8AFFINEQB $0x00, Z14, Z1, Z15 + VXORPD Z8, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z14, Z2, Z15 + VXORPD Z9, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z14, Z3, Z15 + VXORPD Z10, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z14, Z4, Z15 + VXORPD Z11, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z14, Z5, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z6, Z15 + VXORPD Z13, Z15, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64Xor_loop + VZEROUPPER + +mulGFNI_1x7_64Xor_end: + RET + // func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 @@ -1708,6 +2632,194 @@ mulAvxTwo_1x8_loop: mulAvxTwo_1x8_end: RET +// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), 
R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z0, Z8 + VGF2P8AFFINEQB $0x00, Z15, Z1, Z9 + VGF2P8AFFINEQB $0x00, Z15, Z2, Z10 + VGF2P8AFFINEQB $0x00, Z15, Z3, Z11 + VGF2P8AFFINEQB $0x00, Z15, Z4, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z5, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z6, Z14 + VGF2P8AFFINEQB $0x00, Z15, Z7, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64_loop + VZEROUPPER + +mulGFNI_1x8_64_end: + RET + +// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (BX), Z8 + VMOVDQU64 (SI), Z9 + VMOVDQU64 (DI), Z10 + VMOVDQU64 (R8), Z11 + VMOVDQU64 (R9), Z12 + VMOVDQU64 (R10), Z13 + VMOVDQU64 (R11), Z14 + VMOVDQU64 (DX), Z15 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z0, Z17 + VXORPD Z8, Z17, Z8 + VGF2P8AFFINEQB $0x00, Z16, Z1, Z17 + VXORPD Z9, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z16, Z2, Z17 + VXORPD Z10, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z16, Z3, Z17 + VXORPD Z11, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z16, Z4, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z5, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z6, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z7, Z17 + VXORPD Z15, Z17, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64Xor_loop + VZEROUPPER + +mulGFNI_1x8_64Xor_end: + RET + // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, 
AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 @@ -1953,6 +3065,208 @@ mulAvxTwo_1x9_loop: mulAvxTwo_1x9_end: RET +// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z17 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z17, Z0, Z9 + VGF2P8AFFINEQB $0x00, Z17, Z1, Z10 + VGF2P8AFFINEQB $0x00, Z17, Z2, Z11 + VGF2P8AFFINEQB $0x00, Z17, Z3, Z12 + VGF2P8AFFINEQB $0x00, Z17, Z4, Z13 + VGF2P8AFFINEQB $0x00, Z17, Z5, Z14 + VGF2P8AFFINEQB $0x00, Z17, Z6, Z15 + VGF2P8AFFINEQB $0x00, Z17, Z7, Z16 + VGF2P8AFFINEQB $0x00, Z17, Z8, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64_loop + VZEROUPPER + +mulGFNI_1x9_64_end: + RET + +// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (BX), Z9 + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (R10), 
Z14 + VMOVDQU64 (R11), Z15 + VMOVDQU64 (R12), Z16 + VMOVDQU64 (DX), Z17 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z19 + VXORPD Z9, Z19, Z9 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z19 + VXORPD Z10, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z18, Z2, Z19 + VXORPD Z11, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z18, Z3, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z18, Z4, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z17, Z19, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64Xor_loop + VZEROUPPER + +mulGFNI_1x9_64Xor_end: + RET + // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 @@ -2217,6 +3531,222 @@ mulAvxTwo_1x10_loop: mulAvxTwo_1x10_end: RET +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z19 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z19, Z0, Z10 + VGF2P8AFFINEQB $0x00, Z19, Z1, Z11 + VGF2P8AFFINEQB $0x00, Z19, Z2, Z12 + VGF2P8AFFINEQB $0x00, Z19, Z3, Z13 + VGF2P8AFFINEQB $0x00, Z19, Z4, Z14 + VGF2P8AFFINEQB $0x00, Z19, Z5, Z15 + VGF2P8AFFINEQB $0x00, Z19, Z6, Z16 + VGF2P8AFFINEQB $0x00, Z19, Z7, Z17 + VGF2P8AFFINEQB $0x00, Z19, Z8, Z18 + VGF2P8AFFINEQB $0x00, Z19, Z9, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, 
(DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z20, Z2, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z20, Z4, Z21 + VXORPD Z14, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z19, Z21, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64Xor_loop + VZEROUPPER + +mulGFNI_1x10_64Xor_end: + RET + // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 @@ -2505,6 +4035,114 @@ mulAvxTwo_2x1_64_loop: mulAvxTwo_2x1_64_end: RET +// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ 
in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z3, Z0, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z1, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64_loop + VZEROUPPER + +mulGFNI_2x1_64_end: + RET + +// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (BX), Z2 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z3, Z0, Z3 + VXORPD Z2, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z1, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64Xor_loop + VZEROUPPER + +mulGFNI_2x1_64Xor_end: + RET + // func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 @@ -2837,6 +4475,134 @@ mulAvxTwo_2x2_64_loop: mulAvxTwo_2x2_64_end: RET +// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z6, Z0, Z4 + VGF2P8AFFINEQB $0x00, Z6, Z1, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z2, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z6, Z3, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + 
JNZ mulGFNI_2x2_64_loop + VZEROUPPER + +mulGFNI_2x2_64_end: + RET + +// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (SI), Z4 + VMOVDQU64 (BX), Z5 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z6, Z0, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z6, Z1, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z2, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z6, Z3, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64Xor_loop + VZEROUPPER + +mulGFNI_2x2_64Xor_end: + RET + // func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 @@ -3242,6 +5008,154 @@ mulAvxTwo_2x3_64_loop: mulAvxTwo_2x3_64_end: RET +// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z9, Z0, Z6 + VGF2P8AFFINEQB $0x00, Z9, Z1, Z7 + VGF2P8AFFINEQB $0x00, Z9, Z2, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z3, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z9, Z4, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z9, Z5, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64_loop + VZEROUPPER + +mulGFNI_2x3_64_end: + RET + +// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, 
AVX512DQ, AVX512F +TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (BX), Z8 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z9, Z0, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z9, Z1, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z9, Z2, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z3, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z9, Z4, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z9, Z5, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64Xor_loop + VZEROUPPER + +mulGFNI_2x3_64Xor_end: + RET + // func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 @@ -3576,6 +5490,174 @@ mulAvxTwo_2x4_loop: mulAvxTwo_2x4_end: RET +// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z8 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z2, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z3, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z6, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z7, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + 
VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64_loop + VZEROUPPER + +mulGFNI_2x4_64_end: + RET + +// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (BX), Z11 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z2, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z3, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z6, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z7, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64Xor_loop + VZEROUPPER + +mulGFNI_2x4_64Xor_end: + RET + // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 @@ -3808,6 +5890,194 @@ mulAvxTwo_2x5_loop: mulAvxTwo_2x5_end: RET +// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start 
offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z15, Z0, Z10 + VGF2P8AFFINEQB $0x00, Z15, Z1, Z11 + VGF2P8AFFINEQB $0x00, Z15, Z2, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z3, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z4, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z5, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z15, Z6, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z15, Z7, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z8, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z9, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64_loop + VZEROUPPER + +mulGFNI_2x5_64_end: + RET + +// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (BX), Z14 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z15, Z0, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z15, Z1, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z15, Z2, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z3, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z4, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z5, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z15, Z6, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z15, Z7, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z8, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z9, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64Xor_loop + VZEROUPPER + +mulGFNI_2x5_64Xor_end: + RET + // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, 
AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 @@ -4069,6 +6339,214 @@ mulAvxTwo_2x6_loop: mulAvxTwo_2x6_end: RET +// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z12 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z13 + VGF2P8AFFINEQB $0x00, Z18, Z2, Z14 + VGF2P8AFFINEQB $0x00, Z18, Z3, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z4, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z18, Z9, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z10, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z11, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64_loop + VZEROUPPER + +mulGFNI_2x6_64_end: + RET + +// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ 
R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (SI), Z12 + VMOVDQU64 (DI), Z13 + VMOVDQU64 (R8), Z14 + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (BX), Z17 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z18, Z2, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z18, Z3, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z4, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z18, Z9, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z10, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z11, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64Xor_loop + VZEROUPPER + +mulGFNI_2x6_64Xor_end: + RET + // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 @@ -4359,6 +6837,234 @@ mulAvxTwo_2x7_loop: mulAvxTwo_2x7_end: RET +// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z0, Z14 + VGF2P8AFFINEQB $0x00, Z21, Z1, Z15 + VGF2P8AFFINEQB $0x00, Z21, Z2, Z16 + VGF2P8AFFINEQB $0x00, Z21, Z3, Z17 + VGF2P8AFFINEQB $0x00, Z21, Z4, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z5, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z6, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z7, 
Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z21, Z8, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z21, Z9, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z21, Z10, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z21, Z11, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z12, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z13, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64_loop + VZEROUPPER + +mulGFNI_2x7_64_end: + RET + +// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (SI), Z14 + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (BX), Z20 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z0, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z21, Z1, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z21, Z2, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z21, Z3, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z21, Z4, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z5, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z6, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z7, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z21, Z8, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z21, Z9, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z21, Z10, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z21, Z11, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z12, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z13, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + 
ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64Xor_loop + VZEROUPPER + +mulGFNI_2x7_64Xor_end: + RET + // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 @@ -4678,6 +7384,254 @@ mulAvxTwo_2x8_loop: mulAvxTwo_2x8_end: RET +// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z16 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z17 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z3, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z4, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z6, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64_loop + VZEROUPPER + +mulGFNI_2x8_64_end: + RET + +// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ 
$0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (SI), Z16 + VMOVDQU64 (DI), Z17 + VMOVDQU64 (R8), Z18 + VMOVDQU64 (R9), Z19 + VMOVDQU64 (R10), Z20 + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (BX), Z23 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z3, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z4, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64Xor_loop + VZEROUPPER + +mulGFNI_2x8_64Xor_end: + RET + // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 @@ -5026,6 +7980,274 @@ mulAvxTwo_2x9_loop: mulAvxTwo_2x9_end: RET +// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + 
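+	// Each 64-bit matrix entry is broadcast to all eight qword lanes, giving
+	// one table register per (input, output) pair: 2 inputs x 9 outputs = 18.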
VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z27, Z0, Z18 + VGF2P8AFFINEQB $0x00, Z27, Z1, Z19 + VGF2P8AFFINEQB $0x00, Z27, Z2, Z20 + VGF2P8AFFINEQB $0x00, Z27, Z3, Z21 + VGF2P8AFFINEQB $0x00, Z27, Z4, Z22 + VGF2P8AFFINEQB $0x00, Z27, Z5, Z23 + VGF2P8AFFINEQB $0x00, Z27, Z6, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z7, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z8, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z27, Z9, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z27, Z10, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z27, Z11, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z27, Z12, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z27, Z13, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z27, Z14, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z27, Z15, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z16, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z17, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64_loop + VZEROUPPER + +mulGFNI_2x9_64_end: + RET + +// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + 
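+	// Resolve the data pointer of each input slice; a Go slice header is
+	// 24 bytes (ptr, len, cap), so input i sits at offset i*24.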
MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (SI), Z18 + VMOVDQU64 (DI), Z19 + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (BX), Z26 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z27, Z0, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z27, Z1, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z27, Z2, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z27, Z3, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z27, Z4, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z27, Z5, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z27, Z6, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z7, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z8, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z27, Z9, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z27, Z10, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z27, Z11, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z27, Z12, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z27, Z13, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z27, Z14, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z27, Z15, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z16, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z17, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64Xor_loop + VZEROUPPER + +mulGFNI_2x9_64Xor_end: + RET + // func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 @@ -5403,6 +8625,294 @@ mulAvxTwo_2x10_loop: mulAvxTwo_2x10_end: RET +// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + 
VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z20 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z21 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z22 + VGF2P8AFFINEQB $0x00, Z30, Z3, Z23 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z5, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z6, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z9, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64_loop + VZEROUPPER + +mulGFNI_2x10_64_end: + RET + +// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), 
DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (SI), Z20 + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (BX), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z30, Z3, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z5, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64Xor_loop + VZEROUPPER + +mulGFNI_2x10_64Xor_end: + RET + // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 @@ -5785,6 +9295,132 @@ mulAvxTwo_3x1_64_loop: mulAvxTwo_3x1_64_end: RET +// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + 
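+	// Resolve the data pointer of the single output slice the same way.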
MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z4, Z0, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z1, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z2, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64_loop + VZEROUPPER + +mulGFNI_3x1_64_end: + RET + +// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (SI), Z3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z4, Z0, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z1, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z2, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64Xor_loop + VZEROUPPER + +mulGFNI_3x1_64Xor_end: + RET + // func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 @@ -6200,6 +9836,158 @@ mulAvxTwo_3x2_64_loop: mulAvxTwo_3x2_64_end: RET +// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + 
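+	// Each VGF2P8AFFINEQB multiplies every byte of the input by the 8x8
+	// GF(2) bit-matrix held in its table register; with imm8 = 0 this is a
+	// plain GF(2^8) multiplication by that table's constant.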
VGF2P8AFFINEQB $0x00, Z8, Z0, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z1, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z2, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z3, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z4, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z5, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64_loop + VZEROUPPER + +mulGFNI_3x2_64_end: + RET + +// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (DI), Z6 + VMOVDQU64 (SI), Z7 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z8, Z0, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z1, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z2, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z3, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z4, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z8, Z5, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64Xor_loop + VZEROUPPER + +mulGFNI_3x2_64Xor_end: + RET + // func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 @@ -6714,6 +10502,184 @@ mulAvxTwo_3x3_64_loop: mulAvxTwo_3x3_64_end: RET +// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + 
MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z2, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z12, Z3, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z6, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z7, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z8, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64_loop + VZEROUPPER + +mulGFNI_3x3_64_end: + RET + +// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (SI), Z11 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z2, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z12, Z3, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z6, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z12, Z7, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z8, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64Xor_loop + VZEROUPPER + +mulGFNI_3x3_64Xor_end: + RET + // func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 
TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 @@ -7138,6 +11104,210 @@ mulAvxTwo_3x4_loop: mulAvxTwo_3x4_end: RET +// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z0, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z1, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z2, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z3, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z16, Z4, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z5, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z6, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z7, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z8, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z9, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z10, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z11, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64_loop + VZEROUPPER + +mulGFNI_3x4_64_end: + RET + +// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to 
input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (SI), Z15 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z0, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z1, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z2, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z3, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z16, Z4, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z5, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z6, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z7, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z8, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z16, Z9, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z16, Z10, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z11, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64Xor_loop + VZEROUPPER + +mulGFNI_3x4_64Xor_end: + RET + // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 @@ -7433,6 +11603,236 @@ mulAvxTwo_3x5_loop: mulAvxTwo_3x5_end: RET +// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z2, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z4, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + 
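+	// The VGF2P8AFFINEQB/VXORPD pairs in this block form a 64-byte-wide
+	// multiply-accumulate over GF(2^8): the affine instruction applies one
+	// pre-broadcast 8x8 bit-matrix coefficient to all 64 input bytes, and
+	// the XOR folds the product into the running output (GF addition is
+	// XOR). A rough scalar sketch of one such pair, with a hypothetical
+	// mulGF8 helper standing in for the affine step:
+	//
+	//	func mulAcc(dst, src []byte, mulGF8 func(byte) byte) {
+	//		for i := range src {
+	//			dst[i] ^= mulGF8(src[i]) // GF(2^8) addition is XOR
+	//		}
+	//	}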
VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z10, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z11, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z12, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z13, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z14, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64_loop + VZEROUPPER + +mulGFNI_3x5_64_end: + RET + +// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (SI), Z19 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z2, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z4, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z10, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z20, Z11, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z12, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z13, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z14, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + 
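+	// Each store advances its destination pointer by 64 bytes, so the
+	// loop needs no separate index register; the block count in AX alone
+	// controls termination.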
VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64Xor_loop + VZEROUPPER + +mulGFNI_3x5_64Xor_end: + RET + // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 @@ -7767,6 +12167,262 @@ mulAvxTwo_3x6_loop: mulAvxTwo_3x6_end: RET +// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z3, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z4, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z16, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z17, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64_loop + VZEROUPPER + +mulGFNI_3x6_64_end: + RET + +// func 
mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (DI), Z18 + VMOVDQU64 (R8), Z19 + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (SI), Z23 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z3, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z4, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z16, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z17, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64Xor_loop + VZEROUPPER + +mulGFNI_3x6_64Xor_end: + RET + // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 @@ -8140,6 +12796,288 @@ mulAvxTwo_3x7_loop: 
mulAvxTwo_3x7_end: RET +// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z28, Z0, Z21 + VGF2P8AFFINEQB $0x00, Z28, Z1, Z22 + VGF2P8AFFINEQB $0x00, Z28, Z2, Z23 + VGF2P8AFFINEQB $0x00, Z28, Z3, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z4, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z5, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z6, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z28, Z7, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z28, Z8, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z28, Z9, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z28, Z10, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z11, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z12, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z13, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z28, Z14, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z28, Z15, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z28, Z16, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z28, Z17, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z18, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z19, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z20, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64_loop + VZEROUPPER + +mulGFNI_3x7_64_end: + RET + +// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 + // Loading all tables 
to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (SI), Z27 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z28, Z0, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z28, Z1, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z28, Z2, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z28, Z3, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z4, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z5, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z6, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z28, Z7, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z28, Z8, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z28, Z9, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z28, Z10, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z11, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z12, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z13, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z28, Z14, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z28, Z15, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z28, Z16, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z28, Z17, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z18, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z19, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z20, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64Xor_loop + VZEROUPPER + +mulGFNI_3x7_64Xor_end: + RET + // func mulAvxTwo_3x7Xor(matrix 
[]byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 @@ -8552,6 +13490,314 @@ mulAvxTwo_3x8_loop: mulAvxTwo_3x8_end: RET +// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x8_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z1, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z5, (R13) + 
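+	// From 3x8 upward the 24+ coefficient tables no longer fit in the 32
+	// Z registers alongside the 8 accumulators and scratch registers, so
+	// this kernel re-broadcasts each table entry from the matrix slice
+	// inside the loop rather than caching it (hence "Loading no tables to
+	// registers" above).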
ADDQ $0x40, R13 + VMOVDQU64 Z6, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64_loop + VZEROUPPER + +mulGFNI_3x8_64_end: + RET + +// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R8), Z0 + VMOVDQU64 (R9), Z1 + VMOVDQU64 (R10), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (R12), Z4 + VMOVDQU64 (R13), Z5 + VMOVDQU64 (R14), Z6 + VMOVDQU64 (DI), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, 
Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z1, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z5, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z6, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64Xor_loop + VZEROUPPER + +mulGFNI_3x8_64Xor_end: + RET + // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 @@ -9003,6 +14249,340 @@ mulAvxTwo_3x9_loop: mulAvxTwo_3x9_end: RET +// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x9_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + 
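+	// Broadcast-per-use: each 8-byte matrix entry is re-broadcast with
+	// VBROADCASTF32X2 immediately before its affine multiply, trading one
+	// extra load per term for lower register pressure.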
VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z1, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z5, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z6, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z7, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64_loop + VZEROUPPER + +mulGFNI_3x9_64_end: + RET + +// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z0 + VMOVDQU64 (R9), Z1 + VMOVDQU64 (R10), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (R12), Z4 + VMOVDQU64 (R13), Z5 + VMOVDQU64 (R14), Z6 + VMOVDQU64 (R15), Z7 + VMOVDQU64 (DI), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + 
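+	// Xor variant: the 9 outputs were preloaded at the top of the loop,
+	// so every product accumulates onto the existing destination bytes;
+	// this is the path used when folding new shards into parity that has
+	// already been partially computed.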
VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z1, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z5, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z6, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z7, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64Xor_loop + VZEROUPPER + +mulGFNI_3x9_64Xor_end: + RET + // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 @@ -9495,6 +15075,374 @@ mulAvxTwo_3x10_loop: mulAvxTwo_3x10_end: RET +// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x10_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, 
Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z10 + ADDQ $0x40, AX + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + VMOVDQU64 Z0, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z2, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z4, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z5, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z6, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z7, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z8, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64_loop + VZEROUPPER + +mulGFNI_3x10_64_end: + RET + +// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ 
out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (DI), Z0 + VMOVDQU64 (R8), Z1 + VMOVDQU64 (R9), Z2 + VMOVDQU64 (R10), Z3 + VMOVDQU64 (R11), Z4 + VMOVDQU64 (R12), Z5 + VMOVDQU64 (R13), Z6 + VMOVDQU64 (R14), Z7 + VMOVDQU64 (R15), Z8 + VMOVDQU64 (SI), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z10 + ADDQ $0x40, AX + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + 
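+	// Note the register reuse here: the product overwrites the broadcast
+	// table register (Z11), so each broadcast/multiply/XOR chain needs
+	// only one scratch Z register per term.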
VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + VMOVDQU64 Z0, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z1, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z2, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z3, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z4, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z5, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z6, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z7, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z8, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64Xor_loop + VZEROUPPER + +mulGFNI_3x10_64Xor_end: + RET + // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 @@ -9973,6 +15921,150 @@ mulAvxTwo_4x1_64_loop: mulAvxTwo_4x1_64_end: RET +// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z5, Z0, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z1, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z2, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z3, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64_loop + VZEROUPPER + +mulGFNI_4x1_64_end: + RET + +// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64Xor_loop: + // Load 1 outputs + 
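+	// (The existing destination block seeds the accumulator, which is
+	// what distinguishes this Xor variant from mulGFNI_4x1_64.)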
VMOVDQU64 (DI), Z4 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z5, Z0, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z1, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z2, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z3, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64Xor_loop + VZEROUPPER + +mulGFNI_4x1_64Xor_end: + RET + // func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 @@ -10471,6 +16563,182 @@ mulAvxTwo_4x2_64_loop: mulAvxTwo_4x2_64_end: RET +// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z10, Z0, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z1, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z2, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z3, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z4, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z5, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z6, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z7, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64_loop + VZEROUPPER + +mulGFNI_4x2_64_end: + RET + +// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 
24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DI), Z9 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z10, Z0, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z1, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z2, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z3, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z4, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z5, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z6, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z10, Z7, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64Xor_loop + VZEROUPPER + +mulGFNI_4x2_64Xor_end: + RET + // func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 @@ -11094,6 +17362,214 @@ mulAvxTwo_4x3_64_loop: mulAvxTwo_4x3_64_end: RET +// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z15, Z0, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z1, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z2, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z15, Z3, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z4, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z5, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 
64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z15, Z6, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z7, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z8, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z9, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z10, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z11, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64_loop + VZEROUPPER + +mulGFNI_4x3_64_end: + RET + +// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (DI), Z14 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z15, Z0, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z1, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z2, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z15, Z3, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z4, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z5, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z15, Z6, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z7, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z8, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z9, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z15, Z10, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z15, Z11, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64Xor_loop + VZEROUPPER + +mulGFNI_4x3_64Xor_end: + RET + // func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT 
·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 @@ -11608,6 +18084,246 @@ mulAvxTwo_4x4_loop: mulAvxTwo_4x4_end: RET +// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z2, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z20, Z4, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z10, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z11, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z12, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z13, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z14, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z15, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64_loop + VZEROUPPER + +mulGFNI_4x4_64_end: + RET + +// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 
+ VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (DI), Z19 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z2, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z20, Z4, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z10, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z11, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z12, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z20, Z13, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z20, Z14, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z15, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64Xor_loop + VZEROUPPER + +mulGFNI_4x4_64Xor_end: + RET + // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 @@ -11966,6 +18682,278 @@ mulAvxTwo_4x5_loop: mulAvxTwo_4x5_end: RET +// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), 
Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z25, Z0, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z1, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z2, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z3, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z4, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z25, Z5, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z6, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z7, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z8, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z9, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z25, Z10, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z11, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z12, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z13, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z14, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z25, Z15, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z16, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z17, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z18, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z19, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64_loop + VZEROUPPER + +mulGFNI_4x5_64_end: + RET + +// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ 
out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (DI), Z24 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z25, Z0, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z1, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z2, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z3, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z4, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z25, Z5, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z6, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z7, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z8, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z9, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z25, Z10, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z11, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z12, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z13, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z14, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z25, Z15, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z25, Z16, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z25, Z17, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z25, Z18, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z25, Z19, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64Xor_loop + VZEROUPPER + +mulGFNI_4x5_64Xor_end: + RET + // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 @@ -12373,6 +19361,310 @@ mulAvxTwo_4x6_loop: mulAvxTwo_4x6_end: RET +// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), 
Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z3, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z5, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z20, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z21, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z22, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z23, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64_loop + VZEROUPPER + +mulGFNI_4x6_64_end: + RET + +// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 
80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R8), Z24 + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z3, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z5, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z20, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z21, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z22, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z23, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64Xor_loop + VZEROUPPER + +mulGFNI_4x6_64Xor_end: + RET + // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 @@ -12829,6 +20121,342 @@ mulAvxTwo_4x7_loop: 
mulAvxTwo_4x7_end: RET +// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x7_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, 
Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + VMOVDQU64 Z0, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z2, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z4, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z5, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z6, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64_loop + VZEROUPPER + +mulGFNI_4x7_64_end: + RET + +// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R9), Z0 + VMOVDQU64 (R10), Z1 + VMOVDQU64 (R11), Z2 + VMOVDQU64 (R12), Z3 + VMOVDQU64 (R13), Z4 + VMOVDQU64 (R14), Z5 + VMOVDQU64 (R8), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 
160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + VMOVDQU64 Z0, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z2, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z4, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z5, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z6, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64Xor_loop + VZEROUPPER + +mulGFNI_4x7_64Xor_end: + RET + // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 @@ -13334,6 +20962,374 @@ mulAvxTwo_4x8_loop: mulAvxTwo_4x8_end: RET +// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x8_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB 
$0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + VMOVDQU64 Z0, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z2, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z4, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z5, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z6, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z7, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64_loop + VZEROUPPER + +mulGFNI_4x8_64_end: + RET + +// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z0 + VMOVDQU64 (R10), Z1 + VMOVDQU64 (R11), Z2 + VMOVDQU64 (R12), Z3 + VMOVDQU64 (R13), Z4 + VMOVDQU64 (R14), Z5 + VMOVDQU64 (R15), Z6 + VMOVDQU64 (R8), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, 
Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + VMOVDQU64 Z0, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z2, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z4, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z5, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z6, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z7, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64Xor_loop + VZEROUPPER + +mulGFNI_4x8_64Xor_end: + RET + // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 @@ 
-13890,6 +21886,414 @@ mulAvxTwo_4x9_loop: mulAvxTwo_4x9_end: RET +// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x9_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 
208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z9 + ADDQ $0x40, AX + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z1, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z5, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z6, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z7, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64_loop + VZEROUPPER + +mulGFNI_4x9_64_end: + RET + +// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z0 + VMOVDQU64 (R9), Z1 + VMOVDQU64 (R10), Z2 + VMOVDQU64 (R11), Z3 + VMOVDQU64 (R12), Z4 + VMOVDQU64 (R13), Z5 + VMOVDQU64 (R14), Z6 + VMOVDQU64 (R15), Z7 + VMOVDQU64 (DI), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, 
Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z9 + ADDQ $0x40, AX + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + VMOVDQU64 Z0, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z1, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z2, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z4, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z5, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z6, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z7, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64Xor_loop + VZEROUPPER + +mulGFNI_4x9_64Xor_end: + RET + // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 @@ -14474,6 +22878,406 @@ mulAvxTwo_4x10_loop: mulAvxTwo_4x10_end: RET +// func mulGFNI_4x10_64(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x10_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, 
Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z0, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z1, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z2, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z3, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z4, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z5, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z6, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z7, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z8, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z9, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + +mulGFNI_4x10_64_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z0 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z1 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z2 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z3 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z4 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z5 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z6 + MOVQ 168(R8), R10 + VMOVDQU64 (R10)(R9*1), Z7 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z8 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 
+ VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z0, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z1, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z2, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z3, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z4, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z5, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z6, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z7, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z8, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z9, (R10)(R9*1) + + 
// Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop + VZEROUPPER + +mulGFNI_4x10_64Xor_end: + RET + // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 @@ -15033,6 +23837,168 @@ mulAvxTwo_5x1_64_loop: mulAvxTwo_5x1_64_end: RET +// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z6, Z0, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z1, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z2, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z3, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z4, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64_loop + VZEROUPPER + +mulGFNI_5x1_64_end: + RET + +// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R8), Z5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z6, Z0, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z1, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + 
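+	// GF2P8AFFINEQB with imm8 = 0 multiplies each byte of the 64-byte block
+	// by the GF(2^8) constant encoded as the 8x8 GF(2) bit matrix held in Z2.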
VGF2P8AFFINEQB $0x00, Z6, Z2, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z3, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z4, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64Xor_loop + VZEROUPPER + +mulGFNI_5x1_64Xor_end: + RET + // func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 @@ -15614,6 +24580,206 @@ mulAvxTwo_5x2_64_loop: mulAvxTwo_5x2_64_end: RET +// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z12, Z2, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z3, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z6, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z7, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z8, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z9, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64_loop + VZEROUPPER + +mulGFNI_5x2_64_end: + RET + +// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R9), Z10 + VMOVDQU64 (R8), Z11 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z12, Z0, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z1, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z12, Z2, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z3, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z4, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z5, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z6, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z7, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z8, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z12, Z9, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64Xor_loop + VZEROUPPER + +mulGFNI_5x2_64Xor_end: + RET + // func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 @@ -16346,6 +25512,244 @@ mulAvxTwo_5x3_64_loop: mulAvxTwo_5x3_64_end: RET +// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, 
CX + +mulGFNI_5x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z2, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z18, Z3, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z4, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z9, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z10, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z11, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z12, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z13, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z14, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64_loop + VZEROUPPER + +mulGFNI_5x3_64_end: + RET + +// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (R8), Z17 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z2, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z18, Z3, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z4, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z19 + VXORPD Z17, Z19, Z17 + + // Load and 
process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z9, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z10, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z11, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z12, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z18, Z13, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z14, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64Xor_loop + VZEROUPPER + +mulGFNI_5x3_64Xor_end: + RET + // func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 @@ -16950,6 +26354,282 @@ mulAvxTwo_5x4_loop: mulAvxTwo_5x4_end: RET +// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z3, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z24, Z4, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, 
Z24, Z9, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z16, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z17, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z18, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z19, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64_loop + VZEROUPPER + +mulGFNI_5x4_64_end: + RET + +// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (R8), Z23 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z3, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z24, Z4, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z20, Z25, Z20 
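+	// (XOR is addition in GF(2^8), so VXORPD folds each partial product
+	// into its output accumulator.)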
+ VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z16, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z24, Z17, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z18, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z19, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64Xor_loop + VZEROUPPER + +mulGFNI_5x4_64Xor_end: + RET + // func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 @@ -17371,6 +27051,320 @@ mulAvxTwo_5x5_loop: mulAvxTwo_5x5_end: RET +// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z3, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z30, Z5, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z26, 
Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z20, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z21, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z22, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z23, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z24, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64_loop + VZEROUPPER + +mulGFNI_5x5_64_end: + RET + +// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 
64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z3, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z30, Z5, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z20, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z30, Z21, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z30, Z22, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z23, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z24, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64Xor_loop + VZEROUPPER + +mulGFNI_5x5_64Xor_end: + RET + // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 @@ -17851,6 +27845,358 @@ mulAvxTwo_5x6_loop: mulAvxTwo_5x6_end: RET +// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x6_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64_loop: + // Load and process 64 bytes from input 0 to 
6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + VMOVDQU64 Z0, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z1, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z3, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z4, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z5, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64_loop + VZEROUPPER + +mulGFNI_5x6_64_end: + RET + +// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ 
$0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R10), Z0 + VMOVDQU64 (R11), Z1 + VMOVDQU64 (R12), Z2 + VMOVDQU64 (R13), Z3 + VMOVDQU64 (R14), Z4 + VMOVDQU64 (R9), Z5 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 
216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + VMOVDQU64 Z0, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z1, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z3, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z4, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z5, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64Xor_loop + VZEROUPPER + +mulGFNI_5x6_64Xor_end: + RET + // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 @@ -18390,6 +28736,396 @@ mulAvxTwo_5x7_loop: mulAvxTwo_5x7_end: RET +// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x7_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + 
VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + VMOVDQU64 Z0, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z1, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z3, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z4, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z5, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64_loop + VZEROUPPER + +mulGFNI_5x7_64_end: + RET + +// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z0 + VMOVDQU64 (R11), Z1 + VMOVDQU64 (R12), Z2 + VMOVDQU64 (R13), Z3 + VMOVDQU64 (R14), Z4 + VMOVDQU64 (R15), Z5 + VMOVDQU64 (R9), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD 
Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + VMOVDQU64 Z0, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z1, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z3, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z4, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z5, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64Xor_loop + VZEROUPPER + +mulGFNI_5x7_64Xor_end: + RET + // func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, 
AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 @@ -18990,6 +29726,442 @@ mulAvxTwo_5x8_loop: mulAvxTwo_5x8_end: RET +// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x8_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + 
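+	// 5x8 needs 40 coefficient matrices, more than the 32 ZMM registers
+	// available, so each matrix is re-broadcast from memory at every use.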
VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z8 + ADDQ $0x40, AX + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + VMOVDQU64 Z0, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z2, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z4, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z5, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z6, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z7, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64_loop + VZEROUPPER + +mulGFNI_5x8_64_end: + RET + +// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z0 + VMOVDQU64 (R10), Z1 + VMOVDQU64 (R11), Z2 + VMOVDQU64 (R12), Z3 + VMOVDQU64 (R13), Z4 + VMOVDQU64 (R14), Z5 + VMOVDQU64 (R15), Z6 + VMOVDQU64 (R8), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 
32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z8 + ADDQ $0x40, AX + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + VMOVDQU64 Z0, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z1, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z2, (R11) + ADDQ 
$0x40, R11 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z4, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z5, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z6, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z7, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64Xor_loop + VZEROUPPER + +mulGFNI_5x8_64Xor_end: + RET + // func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 @@ -19630,6 +30802,443 @@ mulAvxTwo_5x9_loop: mulAvxTwo_5x9_end: RET +// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x9_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD 
Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z8, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64_loop + VZEROUPPER + +mulGFNI_5x9_64_end: + RET + +// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z0 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z1 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z2 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z3 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z4 + MOVQ 120(R9), R11 
+ VMOVDQU64 (R11)(R10*1), Z5 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z6 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z7 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + 
VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z8, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64Xor_loop + VZEROUPPER + +mulGFNI_5x9_64Xor_end: + RET + // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 @@ -20315,6 +31924,478 @@ mulAvxTwo_5x10_loop: mulAvxTwo_5x10_end: RET +// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x10_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, 
Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD 
Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z8, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z9, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64_loop + VZEROUPPER + +mulGFNI_5x10_64_end: + RET + +// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z0 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z1 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z2 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z3 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z4 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z5 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z6 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z7 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z8 + MOVQ 216(R9), R11 + VMOVDQU64 (R11)(R10*1), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 
+ VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, 
Z9
+
+	// Store 10 outputs
+	MOVQ (R9), R11
+	VMOVDQU64 Z0, (R11)(R10*1)
+	MOVQ 24(R9), R11
+	VMOVDQU64 Z1, (R11)(R10*1)
+	MOVQ 48(R9), R11
+	VMOVDQU64 Z2, (R11)(R10*1)
+	MOVQ 72(R9), R11
+	VMOVDQU64 Z3, (R11)(R10*1)
+	MOVQ 96(R9), R11
+	VMOVDQU64 Z4, (R11)(R10*1)
+	MOVQ 120(R9), R11
+	VMOVDQU64 Z5, (R11)(R10*1)
+	MOVQ 144(R9), R11
+	VMOVDQU64 Z6, (R11)(R10*1)
+	MOVQ 168(R9), R11
+	VMOVDQU64 Z7, (R11)(R10*1)
+	MOVQ 192(R9), R11
+	VMOVDQU64 Z8, (R11)(R10*1)
+	MOVQ 216(R9), R11
+	VMOVDQU64 Z9, (R11)(R10*1)
+
+	// Prepare for next loop
+	ADDQ $0x40, R10
+	DECQ AX
+	JNZ mulGFNI_5x10_64Xor_loop
+	VZEROUPPER
+
+mulGFNI_5x10_64Xor_end:
+	RET
+
 // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
 // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88
@@ -20968,6 +33049,186 @@ mulAvxTwo_6x1_64_loop:
 mulAvxTwo_6x1_64_end:
 	RET
 
+// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_6x1_64(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 9 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_6x1_64_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), DI
+	MOVQ 96(CX), R8
+	MOVQ 120(CX), CX
+	MOVQ out_base+48(FP), R9
+	MOVQ out_base+48(FP), R9
+	MOVQ (R9), R9
+	MOVQ start+72(FP), R10
+
+	// Add start offset to output
+	ADDQ R10, R9
+
+	// Add start offset to input
+	ADDQ R10, DX
+	ADDQ R10, BX
+	ADDQ R10, SI
+	ADDQ R10, DI
+	ADDQ R10, R8
+	ADDQ R10, CX
+
+mulGFNI_6x1_64_loop:
+	// Load and process 64 bytes from input 0 to 1 outputs
+	VMOVDQU64 (DX), Z7
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z7, Z0, Z6
+
+	// Load and process 64 bytes from input 1 to 1 outputs
+	VMOVDQU64 (BX), Z7
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z7, Z1, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 2 to 1 outputs
+	VMOVDQU64 (SI), Z7
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z7, Z2, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 3 to 1 outputs
+	VMOVDQU64 (DI), Z7
+	ADDQ $0x40, DI
+	VGF2P8AFFINEQB $0x00, Z7, Z3, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 4 to 1 outputs
+	VMOVDQU64 (R8), Z7
+	ADDQ $0x40, R8
+	VGF2P8AFFINEQB $0x00, Z7, Z4, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 5 to 1 outputs
+	VMOVDQU64 (CX), Z7
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z7, Z5, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Store 1 outputs
+	VMOVDQU64 Z6, (R9)
+	ADDQ $0x40, R9
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_6x1_64_loop
+	VZEROUPPER
+
+mulGFNI_6x1_64_end:
+	RET
+
+// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_6x1_64Xor(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 9 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_6x1_64Xor_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), DI
+	MOVQ 96(CX), R8
+	MOVQ 120(CX), CX
+	MOVQ out_base+48(FP), R9
+	MOVQ out_base+48(FP), R9
+	MOVQ (R9), R9
+	MOVQ start+72(FP), R10
+
+	// Add start offset to output
+	ADDQ R10, R9
+
+	// Add start offset to input
+	ADDQ R10, DX
+	ADDQ R10, BX
+	ADDQ R10, SI
+	ADDQ R10, DI
+	ADDQ R10, R8
+	ADDQ R10, CX
+
+mulGFNI_6x1_64Xor_loop:
+	// Load 1 outputs
+	VMOVDQU64 (R9), Z6
+
+	// Load and process 64 bytes from input 0 to 1 outputs
+	VMOVDQU64 (DX), Z7
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z7, Z0, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 1 to 1 outputs
+	VMOVDQU64 (BX), Z7
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z7, Z1, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 2 to 1 outputs
+	VMOVDQU64 (SI), Z7
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z7, Z2, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 3 to 1 outputs
+	VMOVDQU64 (DI), Z7
+	ADDQ $0x40, DI
+	VGF2P8AFFINEQB $0x00, Z7, Z3, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 4 to 1 outputs
+	VMOVDQU64 (R8), Z7
+	ADDQ $0x40, R8
+	VGF2P8AFFINEQB $0x00, Z7, Z4, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Load and process 64 bytes from input 5 to 1 outputs
+	VMOVDQU64 (CX), Z7
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z7, Z5, Z7
+	VXORPD Z6, Z7, Z6
+
+	// Store 1 outputs
+	VMOVDQU64 Z6, (R9)
+	ADDQ $0x40, R9
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_6x1_64Xor_loop
+	VZEROUPPER
+
+mulGFNI_6x1_64Xor_end:
+	RET
+
 // func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
 // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
 TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88
@@ -21632,6 +33893,230 @@ mulAvxTwo_6x2_64_loop:
 mulAvxTwo_6x2_64_end:
 	RET
 
+// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_6x2_64(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 16 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_6x2_64_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	VBROADCASTF32X2 48(CX), Z6
+	VBROADCASTF32X2 56(CX), Z7
+	VBROADCASTF32X2 64(CX), Z8
+	VBROADCASTF32X2 72(CX), Z9
+	VBROADCASTF32X2 80(CX), Z10
+	VBROADCASTF32X2 88(CX), Z11
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), DI
+	MOVQ 96(CX), R8
+	MOVQ 120(CX), CX
+	MOVQ out_base+48(FP), R9
+	MOVQ out_base+48(FP), R9
+	MOVQ (R9), R10
+	MOVQ 24(R9), R9
+	MOVQ start+72(FP), R11
+
+	// Add start offset to output
+	ADDQ R11, R10
+	ADDQ R11, R9
+
+	// Add start offset to input
+	ADDQ R11, DX
+	ADDQ R11, BX
+	ADDQ R11, SI
+	ADDQ R11, DI
+	ADDQ R11, R8
+	ADDQ R11, CX
+
+mulGFNI_6x2_64_loop:
+	// Load and process 64 bytes from input 0 to 2 outputs
+	VMOVDQU64 (DX), Z14
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z14, Z0, Z12
+	VGF2P8AFFINEQB $0x00, Z14, Z1, Z13
+
+	// Load and process 64 bytes from input 1 to 2 outputs
+	VMOVDQU64 (BX), Z14
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z14, Z2, Z15
+	VXORPD Z12, Z15, Z12
+	VGF2P8AFFINEQB $0x00, Z14, Z3, Z15
+	VXORPD Z13, Z15, Z13
+
+	// Load and process 64 bytes from input 2 to 2 outputs
+	VMOVDQU64 (SI), Z14
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z14, Z4, Z15
+	VXORPD Z12, Z15, Z12
+	VGF2P8AFFINEQB $0x00, Z14, Z5, Z15
+	VXORPD Z13, Z15, Z13
+
+	// Load and process
64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z6, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z7, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z14, Z8, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z9, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z10, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z11, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64_loop + VZEROUPPER + +mulGFNI_6x2_64_end: + RET + +// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R10), Z12 + VMOVDQU64 (R9), Z13 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z14, Z0, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z1, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z14, Z2, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z3, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z4, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z5, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z6, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z7, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z14, Z8, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z9, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z10, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z14, Z11, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ 
AX + JNZ mulGFNI_6x2_64Xor_loop + VZEROUPPER + +mulGFNI_6x2_64Xor_end: + RET + // func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 @@ -22473,6 +34958,274 @@ mulAvxTwo_6x3_64_loop: mulAvxTwo_6x3_64_end: RET +// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z0, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z1, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z2, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z21, Z3, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z4, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z5, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z21, Z6, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z7, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z8, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z9, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z10, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z11, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z12, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z13, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z14, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z15, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z16, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z17, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64_loop + 
VZEROUPPER + +mulGFNI_6x3_64_end: + RET + +// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (R9), Z20 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z0, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z1, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z2, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z21, Z3, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z4, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z5, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z21, Z6, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z7, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z8, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z9, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z10, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z11, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z12, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z13, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z14, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z15, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z21, Z16, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z21, Z17, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64Xor_loop + VZEROUPPER + +mulGFNI_6x3_64Xor_end: + RET + // func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start 
int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 @@ -23167,6 +35920,318 @@ mulAvxTwo_6x4_loop: mulAvxTwo_6x4_end: RET +// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z28, Z0, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z1, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z2, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z3, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z28, Z4, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z5, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z6, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z7, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z28, Z8, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z9, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z10, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z11, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z28, Z12, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z13, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z14, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z15, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z28, Z16, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z17, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z18, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z19, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z28, Z20, Z29 + VXORPD Z24, 
Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z21, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z22, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z23, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64_loop + VZEROUPPER + +mulGFNI_6x4_64_end: + RET + +// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R9), Z27 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z28, Z0, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z1, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z2, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z3, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z28, Z4, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z5, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z6, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z7, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z28, Z8, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z9, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z10, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z11, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z28, Z12, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z13, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z14, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, 
Z28, Z15, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z28, Z16, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z17, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z18, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z19, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z28, Z20, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z28, Z21, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z28, Z22, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z28, Z23, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64Xor_loop + VZEROUPPER + +mulGFNI_6x4_64Xor_end: + RET + // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 @@ -23651,6 +36716,362 @@ mulAvxTwo_6x5_loop: mulAvxTwo_6x5_end: RET +// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x5_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, 
Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + VMOVDQU64 Z0, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z1, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z3, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z4, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64_loop + VZEROUPPER + +mulGFNI_6x5_64_end: + RET + +// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R11), Z0 + VMOVDQU64 (R12), Z1 + VMOVDQU64 (R13), Z2 + VMOVDQU64 (R14), Z3 + VMOVDQU64 (R10), Z4 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 
32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + VMOVDQU64 Z0, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z1, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z3, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z4, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64Xor_loop + VZEROUPPER + +mulGFNI_6x5_64Xor_end: + RET + // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 @@ -24204,6 +37625,406 @@ mulAvxTwo_6x6_loop: mulAvxTwo_6x6_end: RET +// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x6_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + 
VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + VMOVDQU64 Z0, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z1, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z3, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z4, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z5, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64_loop + VZEROUPPER + +mulGFNI_6x6_64_end: + RET + +// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z0 + VMOVDQU64 (R12), Z1 + VMOVDQU64 (R13), Z2 + VMOVDQU64 (R14), Z3 + VMOVDQU64 (R15), Z4 + VMOVDQU64 (R10), Z5 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + 
VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + VMOVDQU64 Z0, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z1, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z3, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z4, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z5, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64Xor_loop + VZEROUPPER + +mulGFNI_6x6_64Xor_end: + RET + // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 @@ -24828,6 +38649,458 @@ mulAvxTwo_6x7_loop: mulAvxTwo_6x7_end: RET +// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x7_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), 
BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, 
Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z7 + ADDQ $0x40, AX + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + VMOVDQU64 Z0, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z1, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z3, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z4, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z5, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64_loop + VZEROUPPER + +mulGFNI_6x7_64_end: + RET + +// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z0 + VMOVDQU64 (R11), Z1 + VMOVDQU64 (R12), Z2 + VMOVDQU64 (R13), Z3 + VMOVDQU64 (R14), Z4 + VMOVDQU64 (R15), Z5 + VMOVDQU64 (R9), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, 
Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z7 + ADDQ $0x40, AX + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + VMOVDQU64 Z0, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z1, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z2, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z3, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z4, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z5, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64Xor_loop + VZEROUPPER + +mulGFNI_6x7_64Xor_end: + RET + // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) 
// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 @@ -25504,6 +39777,468 @@ mulAvxTwo_6x8_loop: mulAvxTwo_6x8_end: RET +// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x8_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD 
Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z7, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64_loop + VZEROUPPER + +mulGFNI_6x8_64_end: + RET + +// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z0 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z1 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z2 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z3 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z4 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z5 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z6 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z7 + + // Load and process 64 bytes 
from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, 
Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z7, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64Xor_loop + VZEROUPPER + +mulGFNI_6x8_64Xor_end: + RET + // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 @@ -26236,6 +40971,509 @@ mulAvxTwo_6x9_loop: mulAvxTwo_6x9_end: RET +// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x9_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + 
VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, 
Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z8, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64_loop + VZEROUPPER + +mulGFNI_6x9_64_end: + RET + +// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z0 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z1 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z2 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z3 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z4 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z5 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z6 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z7 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), 
Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, 
Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z8, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64Xor_loop + VZEROUPPER + +mulGFNI_6x9_64Xor_end: + RET + // func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 @@ -27034,6 +42272,550 @@ mulAvxTwo_6x10_loop: mulAvxTwo_6x10_end: RET +// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x10_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + 
VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 
10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z9, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64_loop + VZEROUPPER + +mulGFNI_6x10_64_end: + RET + +// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z0 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z1 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z2 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z3 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z4 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z5 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z6 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z7 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z8 + MOVQ 216(R10), R12 + VMOVDQU64 (R12)(R11*1), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 
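+	// The VGF2P8AFFINEQB above multiplies each byte of the 64-byte input block
+	// (Z10) by the broadcast 8x8 GF(2) bit-matrix in Z11 (imm8 = 0, so no
+	// additive constant), i.e. one GF(2^8) constant multiplication per byte;
+	// the VXORPD below folds that product into the output accumulator.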
+ VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, 
Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z9, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64Xor_loop + VZEROUPPER + +mulGFNI_6x10_64Xor_end: + RET + // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 @@ -27781,6 +43563,204 @@ mulAvxTwo_7x1_64_loop: mulAvxTwo_7x1_64_end: RET +// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ 
R11, DX
+	ADDQ R11, BX
+	ADDQ R11, SI
+	ADDQ R11, DI
+	ADDQ R11, R8
+	ADDQ R11, R9
+	ADDQ R11, CX
+
+mulGFNI_7x1_64_loop:
+	// Load and process 64 bytes from input 0 to 1 outputs
+	VMOVDQU64 (DX), Z8
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z8, Z0, Z7
+
+	// Load and process 64 bytes from input 1 to 1 outputs
+	VMOVDQU64 (BX), Z8
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z8, Z1, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 2 to 1 outputs
+	VMOVDQU64 (SI), Z8
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z8, Z2, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 3 to 1 outputs
+	VMOVDQU64 (DI), Z8
+	ADDQ $0x40, DI
+	VGF2P8AFFINEQB $0x00, Z8, Z3, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 4 to 1 outputs
+	VMOVDQU64 (R8), Z8
+	ADDQ $0x40, R8
+	VGF2P8AFFINEQB $0x00, Z8, Z4, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 5 to 1 outputs
+	VMOVDQU64 (R9), Z8
+	ADDQ $0x40, R9
+	VGF2P8AFFINEQB $0x00, Z8, Z5, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 6 to 1 outputs
+	VMOVDQU64 (CX), Z8
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z8, Z6, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Store 1 outputs
+	VMOVDQU64 Z7, (R10)
+	ADDQ $0x40, R10
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_7x1_64_loop
+	VZEROUPPER
+
+mulGFNI_7x1_64_end:
+	RET
+
+// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_7x1_64Xor(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 10 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_7x1_64Xor_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	VBROADCASTF32X2 48(CX), Z6
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), DI
+	MOVQ 96(CX), R8
+	MOVQ 120(CX), R9
+	MOVQ 144(CX), CX
+	MOVQ out_base+48(FP), R10
+	MOVQ out_base+48(FP), R10
+	MOVQ (R10), R10
+	MOVQ start+72(FP), R11
+
+	// Add start offset to output
+	ADDQ R11, R10
+
+	// Add start offset to input
+	ADDQ R11, DX
+	ADDQ R11, BX
+	ADDQ R11, SI
+	ADDQ R11, DI
+	ADDQ R11, R8
+	ADDQ R11, R9
+	ADDQ R11, CX
+
+mulGFNI_7x1_64Xor_loop:
+	// Load 1 outputs
+	VMOVDQU64 (R10), Z7
+
+	// Load and process 64 bytes from input 0 to 1 outputs
+	VMOVDQU64 (DX), Z8
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z8, Z0, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 1 to 1 outputs
+	VMOVDQU64 (BX), Z8
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z8, Z1, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 2 to 1 outputs
+	VMOVDQU64 (SI), Z8
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z8, Z2, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 3 to 1 outputs
+	VMOVDQU64 (DI), Z8
+	ADDQ $0x40, DI
+	VGF2P8AFFINEQB $0x00, Z8, Z3, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 4 to 1 outputs
+	VMOVDQU64 (R8), Z8
+	ADDQ $0x40, R8
+	VGF2P8AFFINEQB $0x00, Z8, Z4, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 5 to 1 outputs
+	VMOVDQU64 (R9), Z8
+	ADDQ $0x40, R9
+	VGF2P8AFFINEQB $0x00, Z8, Z5, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Load and process 64 bytes from input 6 to 1 outputs
+	VMOVDQU64 (CX), Z8
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z8, Z6, Z8
+	VXORPD Z7, Z8, Z7
+
+	// Store 1 outputs
+	VMOVDQU64 Z7, (R10)
+	ADDQ $0x40, R10
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_7x1_64Xor_loop
+	VZEROUPPER
+
+mulGFNI_7x1_64Xor_end:
+	RET
+
 // func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
 // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
 TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88
@@ -28528,6 +44508,254 @@ mulAvxTwo_7x2_64_loop:
 mulAvxTwo_7x2_64_end:
 	RET
 
+// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_7x2_64(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 18 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_7x2_64_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	VBROADCASTF32X2 48(CX), Z6
+	VBROADCASTF32X2 56(CX), Z7
+	VBROADCASTF32X2 64(CX), Z8
+	VBROADCASTF32X2 72(CX), Z9
+	VBROADCASTF32X2 80(CX), Z10
+	VBROADCASTF32X2 88(CX), Z11
+	VBROADCASTF32X2 96(CX), Z12
+	VBROADCASTF32X2 104(CX), Z13
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), DI
+	MOVQ 96(CX), R8
+	MOVQ 120(CX), R9
+	MOVQ 144(CX), CX
+	MOVQ out_base+48(FP), R10
+	MOVQ out_base+48(FP), R10
+	MOVQ (R10), R11
+	MOVQ 24(R10), R10
+	MOVQ start+72(FP), R12
+
+	// Add start offset to output
+	ADDQ R12, R11
+	ADDQ R12, R10
+
+	// Add start offset to input
+	ADDQ R12, DX
+	ADDQ R12, BX
+	ADDQ R12, SI
+	ADDQ R12, DI
+	ADDQ R12, R8
+	ADDQ R12, R9
+	ADDQ R12, CX
+
+mulGFNI_7x2_64_loop:
+	// Load and process 64 bytes from input 0 to 2 outputs
+	VMOVDQU64 (DX), Z16
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z16, Z0, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z1, Z15
+
+	// Load and process 64 bytes from input 1 to 2 outputs
+	VMOVDQU64 (BX), Z16
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z16, Z2, Z17
+	VXORPD Z14, Z17, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z3, Z17
+	VXORPD Z15, Z17, Z15
+
+	// Load and process 64 bytes from input 2 to 2 outputs
+	VMOVDQU64 (SI), Z16
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z16, Z4, Z17
+	VXORPD Z14, Z17, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z5, Z17
+	VXORPD Z15, Z17, Z15
+
+	// Load and process 64 bytes from input 3 to 2 outputs
+	VMOVDQU64 (DI), Z16
+	ADDQ $0x40, DI
+	VGF2P8AFFINEQB $0x00, Z16, Z6, Z17
+	VXORPD Z14, Z17, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z7, Z17
+	VXORPD Z15, Z17, Z15
+
+	// Load and process 64 bytes from input 4 to 2 outputs
+	VMOVDQU64 (R8), Z16
+	ADDQ $0x40, R8
+	VGF2P8AFFINEQB $0x00, Z16, Z8, Z17
+	VXORPD Z14, Z17, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z9, Z17
+	VXORPD Z15, Z17, Z15
+
+	// Load and process 64 bytes from input 5 to 2 outputs
+	VMOVDQU64 (R9), Z16
+	ADDQ $0x40, R9
+	VGF2P8AFFINEQB $0x00, Z16, Z10, Z17
+	VXORPD Z14, Z17, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z11, Z17
+	VXORPD Z15, Z17, Z15
+
+	// Load and process 64 bytes from input 6 to 2 outputs
+	VMOVDQU64 (CX), Z16
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z16, Z12, Z17
+	VXORPD Z14, Z17, Z14
+	VGF2P8AFFINEQB $0x00, Z16, Z13, Z17
+	VXORPD Z15, Z17, Z15
+
+	// Store 2 outputs
+	VMOVDQU64 Z14, (R11)
+	ADDQ $0x40, R11
+	VMOVDQU64 Z15, (R10)
+	ADDQ $0x40, R10
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_7x2_64_loop
+	VZEROUPPER
+
+mulGFNI_7x2_64_end:
+	RET
+
+// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_7x2_64Xor(SB), $0-88
+	// Loading all tables to registers
+	//
Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R11), Z14 + VMOVDQU64 (R10), Z15 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z0, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z1, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z16, Z2, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z3, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z4, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z5, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z6, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z7, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z8, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z9, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z10, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z11, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z12, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z16, Z13, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64Xor_loop + VZEROUPPER + +mulGFNI_7x2_64Xor_end: + RET + // func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 @@ -29478,6 +45706,304 @@ mulAvxTwo_7x3_64_loop: mulAvxTwo_7x3_64_end: RET +// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64_end + 
VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z24, Z3, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z4, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z16, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z17, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z18, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z19, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z20, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64_loop + VZEROUPPER + +mulGFNI_7x3_64_end: + RET + +// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full 
registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (R10), Z23 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z0, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z1, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z2, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z24, Z3, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z4, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z5, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z24, Z6, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z7, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z8, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z24, Z9, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z10, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z11, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z24, Z12, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z13, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z14, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z24, Z15, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z16, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z17, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z18, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z24, Z19, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z24, Z20, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ 
mulGFNI_7x3_64Xor_loop + VZEROUPPER + +mulGFNI_7x3_64Xor_end: + RET + // func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 @@ -30262,6 +46788,354 @@ mulAvxTwo_7x4_loop: mulAvxTwo_7x4_end: RET +// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x4_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 
168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + VMOVDQU64 Z0, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z1, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z2, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64_loop + VZEROUPPER + +mulGFNI_7x4_64_end: + RET + +// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R12), Z0 + VMOVDQU64 (R13), Z1 + VMOVDQU64 (R14), Z2 + VMOVDQU64 (R11), Z3 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + 
VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + VMOVDQU64 Z0, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z1, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z2, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z3, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64Xor_loop + VZEROUPPER + +mulGFNI_7x4_64Xor_end: + RET + // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 @@ -30809,6 +47683,404 @@ mulAvxTwo_7x5_loop: mulAvxTwo_7x5_end: RET +// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x5_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ 
$0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + VMOVDQU64 Z0, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z1, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z2, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z3, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z4, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64_loop + VZEROUPPER + +mulGFNI_7x5_64_end: + RET + +// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 + // Loading no 
tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z0 + VMOVDQU64 (R13), Z1 + VMOVDQU64 (R14), Z2 + VMOVDQU64 (R15), Z3 + VMOVDQU64 (R11), Z4 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 
64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + VMOVDQU64 Z0, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z1, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z2, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z3, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z4, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64Xor_loop + VZEROUPPER + +mulGFNI_7x5_64Xor_end: + RET + // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 @@ -31437,6 +48709,462 @@ mulAvxTwo_7x6_loop: mulAvxTwo_7x6_end: RET +// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x6_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + 
VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z6 + ADDQ $0x40, AX + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + VMOVDQU64 Z0, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z1, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z3, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z4, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z5, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ 
BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z0 + VMOVDQU64 (R12), Z1 + VMOVDQU64 (R13), Z2 + VMOVDQU64 (R14), Z3 + VMOVDQU64 (R15), Z4 + VMOVDQU64 (R10), Z5 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD 
Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z6 + ADDQ $0x40, AX + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + VMOVDQU64 Z0, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z1, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z3, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z4, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z5, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64Xor_loop + VZEROUPPER + +mulGFNI_7x6_64Xor_end: + RET + // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 @@ -32129,6 +49857,481 @@ mulAvxTwo_7x7_loop: mulAvxTwo_7x7_end: RET +// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x7_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + 
VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, 
Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64_loop + VZEROUPPER + +mulGFNI_7x7_64_end: + RET + +// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z0 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z1 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z2 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z3 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z4 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z5 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, 
Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + 
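+	// Per output register: VBROADCASTF32X2 repeats the next 8-byte GF(2) bit matrix from the
+	// flattened table, VGF2P8AFFINEQB multiplies every input byte by that matrix, and VXORPD
+	// folds the product into the running output.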
VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64Xor_loop + VZEROUPPER + +mulGFNI_7x7_64Xor_end: + RET + // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 @@ -32888,6 +51091,528 @@ mulAvxTwo_7x8_loop: mulAvxTwo_7x8_end: RET +// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x8_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, 
Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, 
Z7 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64_loop + VZEROUPPER + +mulGFNI_7x8_64_end: + RET + +// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z0 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z1 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z2 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z3 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z4 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z5 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z6 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + 
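+	// The table pointer CX is indexed flat: the matrix row for input i, output j sits at
+	// byte offset (i*outputs+j)*8, so input 2 of this 7x8 kernel starts at 128(CX).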
VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 
440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64Xor_loop + VZEROUPPER + +mulGFNI_7x8_64Xor_end: + RET + // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 @@ -33723,6 +52448,575 @@ mulAvxTwo_7x9_loop: mulAvxTwo_7x9_end: RET +// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x9_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + 
VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 
+ VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64_loop + VZEROUPPER + +mulGFNI_7x9_64_end: + RET + +// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z0 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z1 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z2 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z3 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z4 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z5 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z6 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z7 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, 
Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 
368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64Xor_loop + VZEROUPPER + +mulGFNI_7x9_64Xor_end: + RET + // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 @@ -34634,6 +53928,622 @@ mulAvxTwo_7x10_loop: mulAvxTwo_7x10_end: RET +// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x10_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB 
$0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + 
VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 480(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 488(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 496(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 504(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 512(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 520(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 528(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 536(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 544(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 552(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64_loop + VZEROUPPER + +mulGFNI_7x10_64_end: + RET + +// func mulGFNI_7x10_64Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z0 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z1 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z2 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z3 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z4 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z5 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z6 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z7 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z8 + MOVQ 216(R11), R13 + VMOVDQU64 (R13)(R12*1), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 
+ VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + 
VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 480(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 488(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 496(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 504(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 512(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 520(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 528(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 536(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 544(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 552(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64Xor_loop + VZEROUPPER + +mulGFNI_7x10_64Xor_end: + RET + // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 @@ -35475,6 +55385,222 @@ mulAvxTwo_8x1_64_loop: mulAvxTwo_8x1_64_end: RET +// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z9, Z0, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z1, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB 
$0x00, Z9, Z2, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z3, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z9, Z4, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z9, Z5, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z9, Z6, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z7, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z9, Z0, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z1, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z2, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z3, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z9, Z4, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z9, Z5, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z9, Z6, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z7, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64Xor_loop + VZEROUPPER + +mulGFNI_8x1_64Xor_end: + RET + // func 
mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 @@ -36305,6 +56431,278 @@ mulAvxTwo_8x2_64_loop: mulAvxTwo_8x2_64_end: RET +// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z18, Z2, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z3, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z4, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z9, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z18, Z10, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z11, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z12, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z13, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z14, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z15, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64_loop + VZEROUPPER + +mulGFNI_8x2_64_end: + RET + +// func 
mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R12), Z16 + VMOVDQU64 (R11), Z17 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z0, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z1, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z18, Z2, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z3, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z4, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z5, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z6, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z7, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z8, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z9, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z18, Z10, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z11, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z12, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z13, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z14, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z18, Z15, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64Xor_loop + VZEROUPPER + +mulGFNI_8x2_64Xor_end: + RET + // func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, 
SSE2 TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 @@ -37364,6 +57762,334 @@ mulAvxTwo_8x3_64_loop: mulAvxTwo_8x3_64_end: RET +// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z27, Z0, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z1, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z2, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z27, Z3, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z4, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z5, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z27, Z6, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z7, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z8, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z27, Z9, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z10, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z11, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z27, Z12, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z13, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z14, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z27, Z15, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z16, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z17, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z27, Z18, Z28 + VXORPD Z24, Z28, 
Z24 + VGF2P8AFFINEQB $0x00, Z27, Z19, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z20, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z27, Z21, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z22, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z23, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64_loop + VZEROUPPER + +mulGFNI_8x3_64_end: + RET + +// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (R11), Z26 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z27, Z0, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z1, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z2, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z27, Z3, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z4, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z5, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z27, Z6, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z7, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z8, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z27, Z9, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z10, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z11, Z28 + VXORPD Z26, Z28, Z26 + + // 
Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z27, Z12, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z13, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z14, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z27, Z15, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z16, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z17, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z27, Z18, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z19, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z20, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z27, Z21, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z27, Z22, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z27, Z23, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64Xor_loop + VZEROUPPER + +mulGFNI_8x3_64Xor_end: + RET + // func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 @@ -38238,6 +58964,390 @@ mulAvxTwo_8x4_loop: mulAvxTwo_8x4_end: RET +// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x4_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, 
Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z4 + ADDQ $0x40, R11 + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 224(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 232(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 240(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 248(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + VMOVDQU64 Z0, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z1, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z2, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64_loop + VZEROUPPER + +mulGFNI_8x4_64_end: + RET + +// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + 
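+	// Note: the same start offset is added to every input and output
+	// pointer, so this kernel only touches the [start, start+n) byte
+	// window of each shard and callers may split work by byte range.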
// Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z0 + VMOVDQU64 (R14), Z1 + VMOVDQU64 (R15), Z2 + VMOVDQU64 (R12), Z3 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z4 + ADDQ $0x40, R11 + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 224(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 232(CX), Z5 + VGF2P8AFFINEQB 
$0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 240(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 248(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + VMOVDQU64 Z0, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z1, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z2, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64Xor_loop + VZEROUPPER + +mulGFNI_8x4_64Xor_end: + RET + // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 @@ -38850,6 +59960,454 @@ mulAvxTwo_8x5_loop: mulAvxTwo_8x5_end: RET +// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x5_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + 
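+	// Each output repeats the same triple: VBROADCASTF32X2 loads the next
+	// 8x8 GF(2) bit matrix from the table, VGF2P8AFFINEQB applies it to
+	// all 64 input bytes, and VXORPD folds the product into the
+	// accumulator for that output.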
VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z5 + ADDQ $0x40, AX + VBROADCASTF32X2 280(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 288(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 296(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 304(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 312(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + VMOVDQU64 Z0, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z1, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z2, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z3, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z4, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64_loop + VZEROUPPER + +mulGFNI_8x5_64_end: + RET + +// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + 
ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z0 + VMOVDQU64 (R13), Z1 + VMOVDQU64 (R14), Z2 + VMOVDQU64 (R15), Z3 + VMOVDQU64 (R11), Z4 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ 
$0x40, R10 + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z5 + ADDQ $0x40, AX + VBROADCASTF32X2 280(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 288(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 296(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 304(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 312(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + VMOVDQU64 Z0, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z1, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z2, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z3, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z4, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64Xor_loop + VZEROUPPER + +mulGFNI_8x5_64Xor_end: + RET + // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 @@ -39538,6 +61096,482 @@ mulAvxTwo_8x6_loop: mulAvxTwo_8x6_end: RET +// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x6_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + 
VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z6 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z6 + ADDQ $0x40, R11 + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 336(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 344(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 352(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 360(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 368(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 376(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + MOVQ (R12), 
R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64_loop + VZEROUPPER + +mulGFNI_8x6_64_end: + RET + +// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z0 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z1 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z2 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z3 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z4 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z5 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + 
VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z6 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z6 + ADDQ $0x40, R11 + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 336(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 344(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 352(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 360(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 368(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 376(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64Xor_loop + VZEROUPPER + +mulGFNI_8x6_64Xor_end: + RET + // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 @@ -40304,6 +62338,535 @@ mulAvxTwo_8x7_loop: mulAvxTwo_8x7_end: RET +// func mulGFNI_8x7_64(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x7_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD 
Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z7 + ADDQ $0x40, R11 + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 392(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 400(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 408(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 416(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 424(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 432(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 440(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64_loop + VZEROUPPER + +mulGFNI_8x7_64_end: + RET + +// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + 
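+	// Slice headers in the in [][]byte array are 24 bytes apart
+	// (pointer, len, cap); only the data pointers are loaded here.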
MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z0 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z1 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z2 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z3 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z4 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z5 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + 
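+	// Advance the input pointer one full ZMM width (64 bytes) per block.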
ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z7 + ADDQ $0x40, R11 + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 392(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 400(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 408(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 416(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 424(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 432(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 440(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64Xor_loop + VZEROUPPER + +mulGFNI_8x7_64Xor_end: + RET + // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 @@ -41156,6 +63719,588 @@ mulAvxTwo_8x8_loop: mulAvxTwo_8x8_end: RET +// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, 
AVX512F +TEXT ·mulGFNI_8x8_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 
248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z8 + ADDQ $0x40, R11 + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 448(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 456(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 464(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 472(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 480(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 488(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 496(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 504(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + MOVQ 
168(R12), R14 + VMOVDQU64 Z7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64_loop + VZEROUPPER + +mulGFNI_8x8_64_end: + RET + +// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z0 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z1 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z2 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z3 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z4 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z5 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z6 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + 
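+	// Z9 serves as the only table register in this function: 8x8 = 64
+	// matrix qwords exceed the register budget, so each constant is
+	// re-broadcast from memory immediately before use.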
VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z8 + ADDQ $0x40, R11 + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 448(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 456(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + 
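+	// The matrix is stored input-major: the qword for (input i, output j)
+	// sits at byte offset (i*8+j)*8, so input 7 spans 448(CX)-504(CX).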
VBROADCASTF32X2 464(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 472(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 480(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 488(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 496(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 504(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64Xor_loop + VZEROUPPER + +mulGFNI_8x8_64Xor_end: + RET + // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 @@ -42094,6 +65239,641 @@ mulAvxTwo_8x9_loop: mulAvxTwo_8x9_end: RET +// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x9_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB 
$0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + 
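+	// Only input 0 wrote Z0-Z8 directly; every later input, including
+	// this one, XOR-accumulates its affine results into the outputs.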
VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z9 + ADDQ $0x40, R11 + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 504(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 512(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 520(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 528(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 536(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 544(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 552(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 560(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 568(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64_loop + VZEROUPPER + +mulGFNI_8x9_64_end: + RET + +// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z0 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z1 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z2 + MOVQ 
72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z3 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z4 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z5 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z6 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z7 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + 
VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z9 + ADDQ $0x40, R11 + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 504(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 512(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 520(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 528(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 536(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, 
Z10, Z4 + VBROADCASTF32X2 544(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 552(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 560(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 568(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64Xor_loop + VZEROUPPER + +mulGFNI_8x9_64Xor_end: + RET + // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 @@ -43118,6 +66898,694 @@ mulAvxTwo_8x10_loop: mulAvxTwo_8x10_end: RET +// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x10_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB 
$0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, 
Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VBROADCASTF32X2 480(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 488(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 496(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 504(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 512(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 520(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 528(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 536(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 544(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 552(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 560(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 568(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 576(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 584(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 592(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 600(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 608(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 616(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 624(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 632(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64_loop + VZEROUPPER + +mulGFNI_8x10_64_end: + RET + +// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full 
registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z0 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z1 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z2 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z3 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z4 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z5 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z6 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z7 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z8 + MOVQ 216(R12), R14 + VMOVDQU64 (R14)(R13*1), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 
192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes 
from input 6 to 10 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VBROADCASTF32X2 480(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 488(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 496(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 504(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 512(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 520(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 528(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 536(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 544(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 552(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 560(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 568(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 576(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 584(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 592(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 600(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 608(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 616(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 624(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 632(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64Xor_loop + VZEROUPPER + +mulGFNI_8x10_64Xor_end: + RET + // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 @@ -44053,6 +68521,240 @@ mulAvxTwo_9x1_64_loop: mulAvxTwo_9x1_64_end: RET +// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + 
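+	// in is a [][]byte; each slice header is 24 bytes, so the data
+	// pointers of inputs 1-8 load from 24(CX), 48(CX), ... below.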
MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z10, Z0, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z1, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z2, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z3, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z10, Z4, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z5, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z10, Z6, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z10, Z7, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z8, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64_loop + VZEROUPPER + +mulGFNI_9x1_64_end: + RET + +// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R12), Z9 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z10, Z0, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, 
Z1, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z2, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z3, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z10, Z4, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z5, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z10, Z6, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z10, Z7, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z8, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64Xor_loop + VZEROUPPER + +mulGFNI_9x1_64Xor_end: + RET + // func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 @@ -44966,6 +69668,302 @@ mulAvxTwo_9x2_64_loop: mulAvxTwo_9x2_64_end: RET +// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z20, Z2, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z20, Z4, 
Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z10, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z11, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z12, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z13, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z20, Z14, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z15, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z16, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z17, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64_loop + VZEROUPPER + +mulGFNI_9x2_64_end: + RET + +// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R13), Z18 + VMOVDQU64 (R12), Z19 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z20, Z0, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z1, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z20, Z2, Z21 + VXORPD Z18, Z21, 
Z18 + VGF2P8AFFINEQB $0x00, Z20, Z3, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z20, Z4, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z5, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z20, Z6, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z7, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z8, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z9, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z10, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z11, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z12, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z13, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z20, Z14, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z15, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z16, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z20, Z17, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64Xor_loop + VZEROUPPER + +mulGFNI_9x2_64Xor_end: + RET + // func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 @@ -46134,6 +71132,364 @@ mulAvxTwo_9x3_64_loop: mulAvxTwo_9x3_64_end: RET +// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 
48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z30, Z3, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z5, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z20, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z30, Z21, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z22, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z23, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z24, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z25, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z26, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + 
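+ // 9 inputs x 3 outputs needs 27 tables; they stay resident in Z0-Z26, leaving Z27-Z29 as output accumulators, Z30 for the current input block, and Z31 as scratch.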
VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z30, Z0, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z1, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z2, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z30, Z3, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z4, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z5, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z30, Z6, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z7, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z8, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z30, Z9, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z10, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z11, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z30, Z12, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z13, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z14, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z30, Z15, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z16, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z17, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z30, Z18, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z19, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z20, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z30, Z21, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z22, Z31 + VXORPD Z28, Z31, Z28 + 
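+ // Each 64-byte product lands in scratch Z31, so the running outputs in Z27-Z29 are only ever updated by XOR.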
VGF2P8AFFINEQB $0x00, Z30, Z23, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z30, Z24, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z30, Z25, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z30, Z26, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64Xor_loop + VZEROUPPER + +mulGFNI_9x3_64Xor_end: + RET + // func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 @@ -47100,6 +72456,434 @@ mulAvxTwo_9x4_loop: mulAvxTwo_9x4_end: RET +// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x4_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, 
Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z4 + ADDQ $0x40, R11 + VBROADCASTF32X2 224(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 232(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 240(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 248(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z4 + ADDQ $0x40, AX + VBROADCASTF32X2 256(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 264(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 272(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 280(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + VMOVDQU64 Z0, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z1, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z2, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64_loop + VZEROUPPER + +mulGFNI_9x4_64_end: + RET + +// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ 
n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z0 + VMOVDQU64 (R14), Z1 + VMOVDQU64 (R15), Z2 + VMOVDQU64 (R12), Z3 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z4 + ADDQ $0x40, R11 + VBROADCASTF32X2 224(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 232(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 240(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD 
Z2, Z5, Z2 + VBROADCASTF32X2 248(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z4 + ADDQ $0x40, AX + VBROADCASTF32X2 256(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 264(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 272(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 280(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + VMOVDQU64 Z0, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z1, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z2, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z3, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64Xor_loop + VZEROUPPER + +mulGFNI_9x4_64Xor_end: + RET + // func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 @@ -47764,6 +73548,471 @@ mulAvxTwo_9x5_loop: mulAvxTwo_9x5_end: RET +// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x5_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + 
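+ // 9x5 needs 45 tables, more than the 32 ZMM registers, so each table is re-broadcast from the matrix on every use.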
VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z5 + ADDQ $0x40, R11 + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z5 + ADDQ $0x40, R12 + VBROADCASTF32X2 280(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 288(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 296(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 304(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 312(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 320(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 328(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 336(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 344(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 352(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64_loop + VZEROUPPER + +mulGFNI_9x5_64_end: + RET + +// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 + // Loading no tables to registers + // 
Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z0 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z1 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z2 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z3 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z4 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 
5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z5 + ADDQ $0x40, R11 + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z5 + ADDQ $0x40, R12 + VBROADCASTF32X2 280(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 288(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 296(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 304(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 312(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 320(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 328(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 336(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 344(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 352(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64Xor_loop + VZEROUPPER + +mulGFNI_9x5_64Xor_end: + RET + // func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 @@ -48517,6 +74766,530 @@ mulAvxTwo_9x6_loop: mulAvxTwo_9x6_end: RET +// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x6_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64_loop: + // Load and process 64 
bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z6 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 
(R11), Z6 + ADDQ $0x40, R11 + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z6 + ADDQ $0x40, R12 + VBROADCASTF32X2 336(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 344(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 352(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 360(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 368(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 376(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 384(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 392(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 400(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 408(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 416(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 424(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64_loop + VZEROUPPER + +mulGFNI_9x6_64_end: + RET + +// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z0 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z1 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z2 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z3 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z4 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z5 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + 
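+ // Xor variant: the six outputs were pre-loaded above, so input 0 accumulates into them instead of overwriting.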
VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z6 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z6 + ADDQ $0x40, R11 + 
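+ // Input 6 uses tables 36-41 (offsets 288-328) to update all six outputs.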
VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z6 + ADDQ $0x40, R12 + VBROADCASTF32X2 336(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 344(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 352(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 360(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 368(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 376(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 384(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 392(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 400(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 408(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 416(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 424(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64Xor_loop + VZEROUPPER + +mulGFNI_9x6_64Xor_end: + RET + // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 @@ -49366,6 +76139,589 @@ mulAvxTwo_9x7_loop: mulAvxTwo_9x7_end: RET +// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x7_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + 
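+ // The first input initializes the seven accumulators Z0-Z6 directly; later inputs XOR their products in.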
VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + 
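+ // Tables at 320(CX) and 328(CX) complete input 5's contribution to outputs 5 and 6.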
VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z7 + ADDQ $0x40, R11 + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z7 + ADDQ $0x40, R12 + VBROADCASTF32X2 392(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 400(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 408(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 416(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 424(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 432(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 440(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 448(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 456(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 464(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 472(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 480(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 488(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 496(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64_loop + VZEROUPPER + +mulGFNI_9x7_64_end: + RET + +// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + 
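+ // Note: output pointers get no offset here; with destinations kept on the stack, stores index by the running offset in R14 instead.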
ADDQ R14, DX + +mulGFNI_9x7_64Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z0 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z1 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z2 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z3 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z4 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z5 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB 
$0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z7 + ADDQ $0x40, R11 + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z7 + ADDQ $0x40, R12 + VBROADCASTF32X2 392(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 400(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 408(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 416(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 424(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 432(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 440(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 448(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 456(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 464(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 472(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 480(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 488(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 496(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64Xor_loop + VZEROUPPER + 
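+	// Editorial note (descriptive only, not part of the generated output):
+	// every per-input step above is the same three-instruction GFNI pattern:
+	// VBROADCASTF32X2 broadcasts one 8-byte GF(2^8) multiplication matrix
+	// from the table at CX, VGF2P8AFFINEQB applies it to 64 input bytes,
+	// and VXORPD folds the product into the corresponding output accumulator.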
+mulGFNI_9x7_64Xor_end: + RET + // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 @@ -50311,6 +77667,648 @@ mulAvxTwo_9x8_loop: mulAvxTwo_9x8_end: RET +// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x8_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 
208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z8 + ADDQ $0x40, R11 + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z8 + ADDQ $0x40, R12 + VBROADCASTF32X2 448(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 456(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 464(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 472(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 480(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 488(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 496(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 504(CX), Z9 + 
VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 512(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 520(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 528(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 536(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 544(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 552(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 560(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 568(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64_loop + VZEROUPPER + +mulGFNI_9x8_64_end: + RET + +// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z0 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z1 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z2 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z3 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z4 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z5 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z6 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, 
Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, 
Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z8 + ADDQ $0x40, R11 + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z8 + ADDQ $0x40, R12 + VBROADCASTF32X2 448(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 456(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 464(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 472(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 480(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 488(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 496(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 504(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 512(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 520(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 528(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 536(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 544(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 552(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 560(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 568(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64Xor_loop + VZEROUPPER + +mulGFNI_9x8_64Xor_end: + RET + // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 @@ -51352,6 +79350,707 @@ mulAvxTwo_9x9_loop: mulAvxTwo_9x9_end: RET +// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x9_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers 
estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), 
Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z9 + ADDQ $0x40, R11 + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z9 + ADDQ $0x40, R12 + VBROADCASTF32X2 504(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 512(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 520(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 528(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + 
VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 536(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 544(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 552(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 560(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 568(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 576(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 584(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 592(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 600(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 608(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 616(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 624(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 632(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 640(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z8, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64_loop + VZEROUPPER + +mulGFNI_9x9_64_end: + RET + +// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z0 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z1 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z2 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z3 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z4 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z5 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z6 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z7 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z8 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + 
VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, 
Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z9 + ADDQ $0x40, R11 + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z9 + ADDQ $0x40, R12 + VBROADCASTF32X2 504(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 512(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 520(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 528(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 536(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 544(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 552(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 560(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 568(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VBROADCASTF32X2 576(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 
584(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 592(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 600(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 608(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 616(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 624(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 632(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 640(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z8, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64Xor_loop + VZEROUPPER + +mulGFNI_9x9_64Xor_end: + RET + // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 @@ -52489,6 +81188,766 @@ mulAvxTwo_9x10_loop: mulAvxTwo_9x10_end: RET +// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x10_64(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + 
VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB 
$0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VBROADCASTF32X2 480(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 488(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 496(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 504(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 512(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 520(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 528(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 536(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 544(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 552(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z10 + ADDQ $0x40, R12 + VBROADCASTF32X2 560(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 568(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 576(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 584(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 592(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 600(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 608(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 616(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 624(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 632(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 640(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 648(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 656(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 664(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 672(CX), Z11 + VGF2P8AFFINEQB 
$0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 680(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 688(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 696(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 704(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 712(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z9, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64_loop + VZEROUPPER + +mulGFNI_9x10_64_end: + RET + +// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z0 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z1 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z2 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z3 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z4 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z5 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z6 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z7 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z8 + MOVQ 216(R13), R15 + VMOVDQU64 (R15)(R14*1), Z9 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 8(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 16(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 24(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 32(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 40(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 48(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 56(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 64(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 72(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 10 outputs 
+ VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VBROADCASTF32X2 80(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 88(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 96(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 104(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 112(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 120(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 128(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 136(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 144(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 152(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VBROADCASTF32X2 160(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 168(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 176(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 184(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 192(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 200(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 208(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 216(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 224(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 232(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VBROADCASTF32X2 240(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 248(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 256(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 264(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 272(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 280(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 288(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 296(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 304(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 312(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VBROADCASTF32X2 320(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 328(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 336(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 344(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 352(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 360(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + 
VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 368(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 376(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 384(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 392(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VBROADCASTF32X2 400(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 408(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 416(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 424(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 432(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 440(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 448(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 456(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 464(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 472(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VBROADCASTF32X2 480(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 488(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 496(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 504(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 512(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 520(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 528(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 536(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 544(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 552(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z10 + ADDQ $0x40, R12 + VBROADCASTF32X2 560(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 568(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 576(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 584(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 592(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 600(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 608(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 616(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 624(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 632(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VBROADCASTF32X2 640(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + 
VXORPD Z0, Z11, Z0 + VBROADCASTF32X2 648(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z1, Z11, Z1 + VBROADCASTF32X2 656(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z2, Z11, Z2 + VBROADCASTF32X2 664(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z3, Z11, Z3 + VBROADCASTF32X2 672(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z4, Z11, Z4 + VBROADCASTF32X2 680(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z5, Z11, Z5 + VBROADCASTF32X2 688(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z6, Z11, Z6 + VBROADCASTF32X2 696(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z7, Z11, Z7 + VBROADCASTF32X2 704(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z8, Z11, Z8 + VBROADCASTF32X2 712(CX), Z11 + VGF2P8AFFINEQB $0x00, Z10, Z11, Z11 + VXORPD Z9, Z11, Z9 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z9, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64Xor_loop + VZEROUPPER + +mulGFNI_9x10_64Xor_end: + RET + // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 @@ -53518,6 +82977,258 @@ mulAvxTwo_10x1_64_loop: mulAvxTwo_10x1_64_end: RET +// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z11, Z0, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z11, Z1, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z11, Z2, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z11, 
Z3, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z11, Z4, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z11, Z5, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z11, Z6, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z11, Z7, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z11, Z8, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z11, Z9, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64_loop + VZEROUPPER + +mulGFNI_10x1_64_end: + RET + +// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R13), Z10 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z11, Z0, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z11, Z1, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z11, Z2, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z11, Z3, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z11, Z4, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z11, Z5, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z11, Z6, Z11 + VXORPD Z10, 
Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z11, Z7, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z11, Z8, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z11, Z9, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64Xor_loop + VZEROUPPER + +mulGFNI_10x1_64Xor_end: + RET + // func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 @@ -54514,6 +84225,326 @@ mulAvxTwo_10x2_64_loop: mulAvxTwo_10x2_64_end: RET +// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z22, Z0, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z1, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z22, Z2, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z3, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z22, Z4, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z5, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z22, Z6, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z7, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z22, Z8, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z9, Z23 + VXORPD Z21, Z23, Z21 + + // 
Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z22, Z10, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z11, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z22, Z12, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z13, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z22, Z14, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z15, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z22, Z16, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z17, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z22, Z18, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z19, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64_loop + VZEROUPPER + +mulGFNI_10x2_64_end: + RET + +// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R14), Z20 + VMOVDQU64 (R13), Z21 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z22, Z0, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z1, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z22, Z2, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z3, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z22, Z4, Z23 + 
VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z5, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z22, Z6, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z7, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z22, Z8, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z9, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z22, Z10, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z11, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z22, Z12, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z13, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z22, Z14, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z15, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z22, Z16, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z17, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z22, Z18, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z22, Z19, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64Xor_loop + VZEROUPPER + +mulGFNI_10x2_64Xor_end: + RET + // func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 @@ -55797,6 +85828,402 @@ mulAvxTwo_10x3_64_loop: mulAvxTwo_10x3_64_end: RET +// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z0 + VBROADCASTF32X2 8(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z1 + VBROADCASTF32X2 16(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z2 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z3 + 
ADDQ $0x40, BX + VBROADCASTF32X2 24(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 32(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 40(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z3 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 56(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 64(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z3 + ADDQ $0x40, DI + VBROADCASTF32X2 72(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 80(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 88(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z3 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 104(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 112(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z3 + ADDQ $0x40, R9 + VBROADCASTF32X2 120(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 128(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 136(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z3 + ADDQ $0x40, R10 + VBROADCASTF32X2 144(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 152(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 160(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z3 + ADDQ $0x40, R11 + VBROADCASTF32X2 168(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 176(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 184(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z3 + ADDQ $0x40, R12 + VBROADCASTF32X2 192(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 200(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 208(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z3 + ADDQ $0x40, AX + VBROADCASTF32X2 216(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 224(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 232(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Store 3 outputs + VMOVDQU64 Z0, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z1, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination 
kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z0 + VMOVDQU64 (R15), Z1 + VMOVDQU64 (R13), Z2 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VBROADCASTF32X2 (CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 8(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 16(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z3 + ADDQ $0x40, BX + VBROADCASTF32X2 24(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 32(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 40(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z3 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 56(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 64(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z3 + ADDQ $0x40, DI + VBROADCASTF32X2 72(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 80(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 88(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z3 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 104(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 112(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z3 + ADDQ $0x40, R9 + VBROADCASTF32X2 120(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 128(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 136(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z3 + ADDQ $0x40, R10 + VBROADCASTF32X2 144(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 152(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 160(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z3 + ADDQ $0x40, R11 + VBROADCASTF32X2 168(CX), Z4 + 
VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 176(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 184(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z3 + ADDQ $0x40, R12 + VBROADCASTF32X2 192(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 200(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 208(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z3 + ADDQ $0x40, AX + VBROADCASTF32X2 216(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z0, Z4, Z0 + VBROADCASTF32X2 224(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z1, Z4, Z1 + VBROADCASTF32X2 232(CX), Z4 + VGF2P8AFFINEQB $0x00, Z3, Z4, Z4 + VXORPD Z2, Z4, Z2 + + // Store 3 outputs + VMOVDQU64 Z0, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z1, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z2, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64Xor_loop + VZEROUPPER + +mulGFNI_10x3_64Xor_end: + RET + // func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 @@ -56848,6 +87275,448 @@ mulAvxTwo_10x4_loop: mulAvxTwo_10x4_end: RET +// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x4_64(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 
64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z4 + ADDQ $0x40, R11 + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z4 + ADDQ $0x40, R12 + VBROADCASTF32X2 224(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 232(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 240(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 248(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z4 + ADDQ $0x40, R13 + VBROADCASTF32X2 256(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 264(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 272(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 280(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 288(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 296(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 304(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 312(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64_loop + VZEROUPPER + +mulGFNI_10x4_64_end: + RET + +// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 + // 
Loading no tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z0 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z1 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z2 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z3 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 8(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 16(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 24(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z4 + ADDQ $0x40, SI + VBROADCASTF32X2 32(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 40(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 48(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 56(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z4 + ADDQ $0x40, DI + VBROADCASTF32X2 64(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 72(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 80(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 88(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z4 + ADDQ $0x40, R8 + VBROADCASTF32X2 96(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 104(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 112(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 120(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z4 + ADDQ $0x40, R9 + VBROADCASTF32X2 128(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 136(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 144(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 152(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z4 + ADDQ $0x40, R10 + VBROADCASTF32X2 160(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 168(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 176(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 184(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and 
process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z4 + ADDQ $0x40, R11 + VBROADCASTF32X2 192(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 200(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 208(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 216(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z4 + ADDQ $0x40, R12 + VBROADCASTF32X2 224(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 232(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 240(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 248(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z4 + ADDQ $0x40, R13 + VBROADCASTF32X2 256(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 264(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 272(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 280(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VBROADCASTF32X2 288(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z0, Z5, Z0 + VBROADCASTF32X2 296(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z1, Z5, Z1 + VBROADCASTF32X2 304(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z2, Z5, Z2 + VBROADCASTF32X2 312(CX), Z5 + VGF2P8AFFINEQB $0x00, Z4, Z5, Z5 + VXORPD Z3, Z5, Z3 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64Xor_loop + VZEROUPPER + +mulGFNI_10x4_64Xor_end: + RET + // func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 @@ -57568,6 +88437,513 @@ mulAvxTwo_10x5_loop: mulAvxTwo_10x5_end: RET +// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x5_64(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, 
Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z5 + ADDQ $0x40, R11 + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z5 + ADDQ $0x40, R12 + VBROADCASTF32X2 280(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 288(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 296(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + 
VBROADCASTF32X2 304(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 312(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z5 + ADDQ $0x40, R13 + VBROADCASTF32X2 320(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 328(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 336(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 344(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 352(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 360(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 368(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 376(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 384(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 392(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64_loop + VZEROUPPER + +mulGFNI_10x5_64_end: + RET + +// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z0 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z1 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z2 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z3 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z4 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 8(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 16(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 24(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 32(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VBROADCASTF32X2 40(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 48(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 56(CX), Z6 + 
VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 64(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 72(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z5 + ADDQ $0x40, DI + VBROADCASTF32X2 80(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 88(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 96(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 104(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 112(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z5 + ADDQ $0x40, R8 + VBROADCASTF32X2 120(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 128(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 136(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 144(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 152(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z5 + ADDQ $0x40, R9 + VBROADCASTF32X2 160(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 168(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 176(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 184(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 192(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z5 + ADDQ $0x40, R10 + VBROADCASTF32X2 200(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 208(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 216(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 224(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 232(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z5 + ADDQ $0x40, R11 + VBROADCASTF32X2 240(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 248(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 256(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 264(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 272(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z5 + ADDQ $0x40, R12 + VBROADCASTF32X2 280(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 288(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 296(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 304(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 312(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z5 + ADDQ $0x40, R13 + VBROADCASTF32X2 320(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + 
VBROADCASTF32X2 328(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 336(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 344(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 352(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VBROADCASTF32X2 360(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z0, Z6, Z0 + VBROADCASTF32X2 368(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z1, Z6, Z1 + VBROADCASTF32X2 376(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z2, Z6, Z2 + VBROADCASTF32X2 384(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z3, Z6, Z3 + VBROADCASTF32X2 392(CX), Z6 + VGF2P8AFFINEQB $0x00, Z5, Z6, Z6 + VXORPD Z4, Z6, Z4 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64Xor_loop + VZEROUPPER + +mulGFNI_10x5_64Xor_end: + RET + // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 @@ -58394,6 +89770,578 @@ mulAvxTwo_10x6_loop: mulAvxTwo_10x6_end: RET +// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x6_64(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + 
VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z6 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z6 + ADDQ $0x40, R11 + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z6 + ADDQ $0x40, R12 + VBROADCASTF32X2 336(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 344(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 352(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 360(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 368(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 376(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // 
Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z6 + ADDQ $0x40, R13 + VBROADCASTF32X2 384(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 392(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 400(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 408(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 416(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 424(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 432(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 440(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 448(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 456(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 464(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 472(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z5, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64_loop + VZEROUPPER + +mulGFNI_10x6_64_end: + RET + +// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z0 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z1 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z2 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z3 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z4 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z5 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 8(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 16(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 24(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 32(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 40(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VBROADCASTF32X2 48(CX), 
Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 56(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 64(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 72(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 80(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 88(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VBROADCASTF32X2 96(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 104(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 112(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 120(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 128(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 136(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z6 + ADDQ $0x40, R8 + VBROADCASTF32X2 144(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 152(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 160(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 168(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 176(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 184(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z6 + ADDQ $0x40, R9 + VBROADCASTF32X2 192(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 200(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 208(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 216(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 224(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 232(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z6 + ADDQ $0x40, R10 + VBROADCASTF32X2 240(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 248(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 256(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 264(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 272(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 280(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z6 + ADDQ $0x40, R11 + VBROADCASTF32X2 288(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 296(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 304(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 312(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 320(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 328(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from 
input 7 to 6 outputs + VMOVDQU64 (R12), Z6 + ADDQ $0x40, R12 + VBROADCASTF32X2 336(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 344(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 352(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 360(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 368(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 376(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z6 + ADDQ $0x40, R13 + VBROADCASTF32X2 384(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 392(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 400(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 408(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 416(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 424(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VBROADCASTF32X2 432(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z0, Z7, Z0 + VBROADCASTF32X2 440(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z1, Z7, Z1 + VBROADCASTF32X2 448(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z2, Z7, Z2 + VBROADCASTF32X2 456(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z3, Z7, Z3 + VBROADCASTF32X2 464(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z4, Z7, Z4 + VBROADCASTF32X2 472(CX), Z7 + VGF2P8AFFINEQB $0x00, Z6, Z7, Z7 + VXORPD Z5, Z7, Z5 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z5, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64Xor_loop + VZEROUPPER + +mulGFNI_10x6_64Xor_end: + RET + // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 @@ -59326,6 +91274,643 @@ mulAvxTwo_10x7_loop: mulAvxTwo_10x7_end: RET +// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x7_64(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + 
VGF2P8AFFINEQB $0x00, Z7, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 
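+	// Each VBROADCASTF32X2 replicates one 64-bit table entry (an 8x8 bit
+	// matrix over GF(2)) into all eight qwords of a ZMM register.
+	// VGF2P8AFFINEQB with imm8=0 then applies that matrix to every input
+	// byte, i.e. multiplies it by a fixed GF(2^8) constant, and VXORPD
+	// adds the product into the accumulator (XOR is addition in GF(2^8)).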
+ VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z7 + ADDQ $0x40, R11 + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z7 + ADDQ $0x40, R12 + VBROADCASTF32X2 392(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 400(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 408(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 416(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 424(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 432(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 440(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z7 + ADDQ $0x40, R13 + VBROADCASTF32X2 448(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 456(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 464(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 472(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 480(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 488(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 496(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 504(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 512(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 520(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 528(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 536(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 544(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 552(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z6, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64_loop + VZEROUPPER + +mulGFNI_10x7_64_end: + RET + +// func mulGFNI_10x7_64Xor(matrix []uint64, 
in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z0 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z1 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z2 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z3 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z4 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z5 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z6 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 8(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 16(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 24(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 32(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 40(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 48(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VBROADCASTF32X2 56(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 64(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 72(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 80(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 88(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 96(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 104(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VBROADCASTF32X2 112(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 120(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 128(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 136(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 144(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 152(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 160(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VBROADCASTF32X2 168(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 176(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 184(CX), Z8 + 
VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 192(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 200(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 208(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 216(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z7 + ADDQ $0x40, R9 + VBROADCASTF32X2 224(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 232(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 240(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 248(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 256(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 264(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 272(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z7 + ADDQ $0x40, R10 + VBROADCASTF32X2 280(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 288(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 296(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 304(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 312(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 320(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 328(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z7 + ADDQ $0x40, R11 + VBROADCASTF32X2 336(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 344(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 352(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 360(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 368(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 376(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 384(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z7 + ADDQ $0x40, R12 + VBROADCASTF32X2 392(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 400(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 408(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 416(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 424(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 432(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 440(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z7 + ADDQ $0x40, R13 + VBROADCASTF32X2 448(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 456(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 464(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 472(CX), Z8 + 
VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 480(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 488(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 496(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VBROADCASTF32X2 504(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z0, Z8, Z0 + VBROADCASTF32X2 512(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z1, Z8, Z1 + VBROADCASTF32X2 520(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z2, Z8, Z2 + VBROADCASTF32X2 528(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z3, Z8, Z3 + VBROADCASTF32X2 536(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z4, Z8, Z4 + VBROADCASTF32X2 544(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z5, Z8, Z5 + VBROADCASTF32X2 552(CX), Z8 + VGF2P8AFFINEQB $0x00, Z7, Z8, Z8 + VXORPD Z6, Z8, Z6 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z6, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64Xor_loop + VZEROUPPER + +mulGFNI_10x7_64Xor_end: + RET + // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 @@ -60364,6 +92949,708 @@ mulAvxTwo_10x8_loop: mulAvxTwo_10x8_end: RET +// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x8_64(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + 
VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + 
// Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z8 + ADDQ $0x40, R11 + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z8 + ADDQ $0x40, R12 + VBROADCASTF32X2 448(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 456(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 464(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 472(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 480(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 488(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 496(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 504(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z8 + ADDQ $0x40, R13 + VBROADCASTF32X2 512(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 520(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 528(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 536(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 544(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 552(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 560(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 568(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 576(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 584(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 592(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 600(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 608(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 616(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 624(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 632(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z7, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, 
R15 + DECQ AX + JNZ mulGFNI_10x8_64_loop + VZEROUPPER + +mulGFNI_10x8_64_end: + RET + +// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z0 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z1 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z2 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z3 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z4 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z5 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z6 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z7 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 8(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 16(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 24(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 32(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 40(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 48(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 56(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VBROADCASTF32X2 64(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 72(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 80(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 88(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 96(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 104(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 112(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 120(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VBROADCASTF32X2 128(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 136(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 144(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 152(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 160(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 168(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 176(CX), Z9 + 
VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 184(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VBROADCASTF32X2 192(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 200(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 208(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 216(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 224(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 232(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 240(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 248(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VBROADCASTF32X2 256(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 264(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 272(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 280(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 288(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 296(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 304(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 312(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z8 + ADDQ $0x40, R10 + VBROADCASTF32X2 320(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 328(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 336(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 344(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 352(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 360(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 368(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 376(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z8 + ADDQ $0x40, R11 + VBROADCASTF32X2 384(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 392(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 400(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 408(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 416(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 424(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 432(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 440(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z8 + ADDQ $0x40, R12 + VBROADCASTF32X2 448(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 456(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 464(CX), Z9 + 
VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 472(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 480(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 488(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 496(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 504(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z8 + ADDQ $0x40, R13 + VBROADCASTF32X2 512(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 520(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 528(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 536(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 544(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 552(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 560(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 568(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VBROADCASTF32X2 576(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z0, Z9, Z0 + VBROADCASTF32X2 584(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z1, Z9, Z1 + VBROADCASTF32X2 592(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z2, Z9, Z2 + VBROADCASTF32X2 600(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z3, Z9, Z3 + VBROADCASTF32X2 608(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z4, Z9, Z4 + VBROADCASTF32X2 616(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z5, Z9, Z5 + VBROADCASTF32X2 624(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z6, Z9, Z6 + VBROADCASTF32X2 632(CX), Z9 + VGF2P8AFFINEQB $0x00, Z8, Z9, Z9 + VXORPD Z7, Z9, Z7 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z7, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64Xor_loop + VZEROUPPER + +mulGFNI_10x8_64Xor_end: + RET + // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 @@ -61508,6 +94795,773 @@ mulAvxTwo_10x9_loop: mulAvxTwo_10x9_end: RET +// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·mulGFNI_10x9_64(SB), $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + 
ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VBROADCASTF32X2 (CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z0 + VBROADCASTF32X2 8(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z1 + VBROADCASTF32X2 16(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z2 + VBROADCASTF32X2 24(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z3 + VBROADCASTF32X2 32(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z4 + VBROADCASTF32X2 40(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z5 + VBROADCASTF32X2 48(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z6 + VBROADCASTF32X2 56(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z7 + VBROADCASTF32X2 64(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z8 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VBROADCASTF32X2 72(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 80(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 88(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 96(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 104(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 112(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 120(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 128(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 136(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VBROADCASTF32X2 144(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 152(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 160(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 168(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 176(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 184(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 192(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 200(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 208(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VBROADCASTF32X2 216(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 224(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 232(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 240(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 248(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 256(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 264(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 272(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 280(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 
bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VBROADCASTF32X2 288(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 296(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 304(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 312(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 320(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 328(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 336(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 344(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 352(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VBROADCASTF32X2 360(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 368(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 376(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 384(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 392(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 400(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 408(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 416(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 424(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z9 + ADDQ $0x40, R11 + VBROADCASTF32X2 432(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 440(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 448(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 456(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 464(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 472(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 480(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 488(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 496(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z9 + ADDQ $0x40, R12 + VBROADCASTF32X2 504(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z0, Z10, Z0 + VBROADCASTF32X2 512(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z1, Z10, Z1 + VBROADCASTF32X2 520(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z2, Z10, Z2 + VBROADCASTF32X2 528(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z3, Z10, Z3 + VBROADCASTF32X2 536(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z4, Z10, Z4 + VBROADCASTF32X2 544(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z5, Z10, Z5 + VBROADCASTF32X2 552(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z6, Z10, Z6 + VBROADCASTF32X2 560(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + VXORPD Z7, Z10, Z7 + VBROADCASTF32X2 568(CX), Z10 + VGF2P8AFFINEQB $0x00, Z9, Z10, Z10 + 
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 8 to 9 outputs
+	VMOVDQU64 (R13), Z9
+	ADDQ $0x40, R13
+	VBROADCASTF32X2 576(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 584(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 592(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 600(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 608(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 616(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 624(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 632(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 640(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 9 to 9 outputs
+	VMOVDQU64 (DX), Z9
+	ADDQ $0x40, DX
+	VBROADCASTF32X2 648(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 656(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 664(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 672(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 680(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 688(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 696(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 704(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 712(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Store 9 outputs
+	MOVQ (R14), BP
+	VMOVDQU64 Z0, (BP)(R15*1)
+	MOVQ 24(R14), BP
+	VMOVDQU64 Z1, (BP)(R15*1)
+	MOVQ 48(R14), BP
+	VMOVDQU64 Z2, (BP)(R15*1)
+	MOVQ 72(R14), BP
+	VMOVDQU64 Z3, (BP)(R15*1)
+	MOVQ 96(R14), BP
+	VMOVDQU64 Z4, (BP)(R15*1)
+	MOVQ 120(R14), BP
+	VMOVDQU64 Z5, (BP)(R15*1)
+	MOVQ 144(R14), BP
+	VMOVDQU64 Z6, (BP)(R15*1)
+	MOVQ 168(R14), BP
+	VMOVDQU64 Z7, (BP)(R15*1)
+	MOVQ 192(R14), BP
+	VMOVDQU64 Z8, (BP)(R15*1)
+
+	// Prepare for next loop
+	ADDQ $0x40, R15
+	DECQ AX
+	JNZ mulGFNI_10x9_64_loop
+	VZEROUPPER
+
+mulGFNI_10x9_64_end:
+	RET
+
+// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_10x9_64Xor(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 101 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_10x9_64Xor_end
+	MOVQ in_base+24(FP), DX
+	MOVQ (DX), BX
+	MOVQ 24(DX), SI
+	MOVQ 48(DX), DI
+	MOVQ 72(DX), R8
+	MOVQ 96(DX), R9
+	MOVQ 120(DX), R10
+	MOVQ 144(DX), R11
+	MOVQ 168(DX), R12
+	MOVQ 192(DX), R13
+	MOVQ 216(DX), DX
+	MOVQ out_base+48(FP), R14
+	MOVQ out_base+48(FP), R14
+	MOVQ start+72(FP), R15
+
+	// Add start offset to input
+	ADDQ R15, BX
+	ADDQ R15, SI
+	ADDQ R15, DI
+	ADDQ R15, R8
+	ADDQ R15, R9
+	ADDQ R15, R10
+	ADDQ R15, R11
+	ADDQ R15, R12
+	ADDQ R15, R13
+	ADDQ R15, DX
+
+mulGFNI_10x9_64Xor_loop:
+	// Load 9 outputs
+	MOVQ (R14), BP
+	VMOVDQU64 (BP)(R15*1), Z0
+	MOVQ 24(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z1
+	MOVQ 48(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z2
+	MOVQ 72(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z3
+	MOVQ 96(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z4
+	MOVQ 120(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z5
+	MOVQ 144(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z6
+	MOVQ 168(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z7
+	MOVQ 192(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z8
+
+	// Load and process 64 bytes from input 0 to 9 outputs
+	VMOVDQU64 (BX), Z9
+	ADDQ $0x40, BX
+	VBROADCASTF32X2 (CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 8(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 16(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 24(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 32(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 40(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 48(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 56(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 64(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 1 to 9 outputs
+	VMOVDQU64 (SI), Z9
+	ADDQ $0x40, SI
+	VBROADCASTF32X2 72(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 80(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 88(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 96(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 104(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 112(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 120(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 128(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 136(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 2 to 9 outputs
+	VMOVDQU64 (DI), Z9
+	ADDQ $0x40, DI
+	VBROADCASTF32X2 144(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 152(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 160(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 168(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 176(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 184(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 192(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 200(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 208(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 3 to 9 outputs
+	VMOVDQU64 (R8), Z9
+	ADDQ $0x40, R8
+	VBROADCASTF32X2 216(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 224(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 232(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 240(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 248(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 256(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 264(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 272(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 280(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 4 to 9 outputs
+	VMOVDQU64 (R9), Z9
+	ADDQ $0x40, R9
+	VBROADCASTF32X2 288(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 296(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 304(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 312(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 320(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 328(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 336(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 344(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 352(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 5 to 9 outputs
+	VMOVDQU64 (R10), Z9
+	ADDQ $0x40, R10
+	VBROADCASTF32X2 360(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 368(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 376(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 384(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 392(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 400(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 408(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 416(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 424(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 6 to 9 outputs
+	VMOVDQU64 (R11), Z9
+	ADDQ $0x40, R11
+	VBROADCASTF32X2 432(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 440(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 448(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 456(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 464(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 472(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 480(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 488(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 496(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 7 to 9 outputs
+	VMOVDQU64 (R12), Z9
+	ADDQ $0x40, R12
+	VBROADCASTF32X2 504(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 512(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 520(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 528(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 536(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 544(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 552(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 560(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 568(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 8 to 9 outputs
+	VMOVDQU64 (R13), Z9
+	ADDQ $0x40, R13
+	VBROADCASTF32X2 576(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 584(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 592(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 600(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 608(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 616(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 624(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 632(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 640(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Load and process 64 bytes from input 9 to 9 outputs
+	VMOVDQU64 (DX), Z9
+	ADDQ $0x40, DX
+	VBROADCASTF32X2 648(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z0, Z10, Z0
+	VBROADCASTF32X2 656(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z1, Z10, Z1
+	VBROADCASTF32X2 664(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z2, Z10, Z2
+	VBROADCASTF32X2 672(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z3, Z10, Z3
+	VBROADCASTF32X2 680(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z4, Z10, Z4
+	VBROADCASTF32X2 688(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z5, Z10, Z5
+	VBROADCASTF32X2 696(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z6, Z10, Z6
+	VBROADCASTF32X2 704(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z7, Z10, Z7
+	VBROADCASTF32X2 712(CX), Z10
+	VGF2P8AFFINEQB $0x00, Z9, Z10, Z10
+	VXORPD Z8, Z10, Z8
+
+	// Store 9 outputs
+	MOVQ (R14), BP
+	VMOVDQU64 Z0, (BP)(R15*1)
+	MOVQ 24(R14), BP
+	VMOVDQU64 Z1, (BP)(R15*1)
+	MOVQ 48(R14), BP
+	VMOVDQU64 Z2, (BP)(R15*1)
+	MOVQ 72(R14), BP
+	VMOVDQU64 Z3, (BP)(R15*1)
+	MOVQ 96(R14), BP
+	VMOVDQU64 Z4, (BP)(R15*1)
+	MOVQ 120(R14), BP
+	VMOVDQU64 Z5, (BP)(R15*1)
+	MOVQ 144(R14), BP
+	VMOVDQU64 Z6, (BP)(R15*1)
+	MOVQ 168(R14), BP
+	VMOVDQU64 Z7, (BP)(R15*1)
+	MOVQ 192(R14), BP
+	VMOVDQU64 Z8, (BP)(R15*1)
+
+	// Prepare for next loop
+	ADDQ $0x40, R15
+	DECQ AX
+	JNZ mulGFNI_10x9_64Xor_loop
+	VZEROUPPER
+
+mulGFNI_10x9_64Xor_end:
+	RET
+
 // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
 // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88
@@ -62758,6 +96812,838 @@ mulAvxTwo_10x10_loop:
 mulAvxTwo_10x10_end:
 	RET
 
+// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_10x10_64(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 112 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_10x10_64_end
+	MOVQ in_base+24(FP), DX
+	MOVQ (DX), BX
+	MOVQ 24(DX), SI
+	MOVQ 48(DX), DI
+	MOVQ 72(DX), R8
+	MOVQ 96(DX), R9
+	MOVQ 120(DX), R10
+	MOVQ 144(DX), R11
+	MOVQ 168(DX), R12
+	MOVQ 192(DX), R13
+	MOVQ 216(DX), DX
+	MOVQ out_base+48(FP), R14
+	MOVQ out_base+48(FP), R14
+	MOVQ start+72(FP), R15
+
+	// Add start offset to input
+	ADDQ R15, BX
+	ADDQ R15, SI
+	ADDQ R15, DI
+	ADDQ R15, R8
+	ADDQ R15, R9
+	ADDQ R15, R10
+	ADDQ R15, R11
+	ADDQ R15, R12
+	ADDQ R15, R13
+	ADDQ R15, DX
+
+mulGFNI_10x10_64_loop:
+	// Load and process 64 bytes from input 0 to 10 outputs
+	VMOVDQU64 (BX), Z10
+	ADDQ $0x40, BX
+	VBROADCASTF32X2 (CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z0
+	VBROADCASTF32X2 8(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z1
+	VBROADCASTF32X2 16(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z2
+	VBROADCASTF32X2 24(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z3
+	VBROADCASTF32X2 32(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z4
+	VBROADCASTF32X2 40(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z5
+	VBROADCASTF32X2 48(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z6
+	VBROADCASTF32X2 56(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z7
+	VBROADCASTF32X2 64(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z8
+	VBROADCASTF32X2 72(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z9
+
+	// Load and process 64 bytes from input 1 to 10 outputs
+	VMOVDQU64 (SI), Z10
+	ADDQ $0x40, SI
+	VBROADCASTF32X2 80(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 88(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 96(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 104(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 112(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 120(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 128(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 136(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 144(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 152(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 2 to 10 outputs
+	VMOVDQU64 (DI), Z10
+	ADDQ $0x40, DI
+	VBROADCASTF32X2 160(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 168(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 176(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 184(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 192(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 200(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 208(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 216(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 224(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 232(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 3 to 10 outputs
+	VMOVDQU64 (R8), Z10
+	ADDQ $0x40, R8
+	VBROADCASTF32X2 240(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 248(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 256(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 264(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 272(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 280(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 288(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 296(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 304(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 312(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 4 to 10 outputs
+	VMOVDQU64 (R9), Z10
+	ADDQ $0x40, R9
+	VBROADCASTF32X2 320(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 328(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 336(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 344(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 352(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 360(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 368(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 376(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 384(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 392(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 5 to 10 outputs
+	VMOVDQU64 (R10), Z10
+	ADDQ $0x40, R10
+	VBROADCASTF32X2 400(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 408(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 416(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 424(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 432(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 440(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 448(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 456(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 464(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 472(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 6 to 10 outputs
+	VMOVDQU64 (R11), Z10
+	ADDQ $0x40, R11
+	VBROADCASTF32X2 480(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 488(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 496(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 504(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 512(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 520(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 528(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 536(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 544(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 552(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 7 to 10 outputs
+	VMOVDQU64 (R12), Z10
+	ADDQ $0x40, R12
+	VBROADCASTF32X2 560(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 568(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 576(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 584(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 592(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 600(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 608(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 616(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 624(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 632(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 8 to 10 outputs
+	VMOVDQU64 (R13), Z10
+	ADDQ $0x40, R13
+	VBROADCASTF32X2 640(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 648(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 656(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 664(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 672(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 680(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 688(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 696(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 704(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 712(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 9 to 10 outputs
+	VMOVDQU64 (DX), Z10
+	ADDQ $0x40, DX
+	VBROADCASTF32X2 720(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 728(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 736(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 744(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 752(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 760(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 768(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 776(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 784(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 792(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Store 10 outputs
+	MOVQ (R14), BP
+	VMOVDQU64 Z0, (BP)(R15*1)
+	MOVQ 24(R14), BP
+	VMOVDQU64 Z1, (BP)(R15*1)
+	MOVQ 48(R14), BP
+	VMOVDQU64 Z2, (BP)(R15*1)
+	MOVQ 72(R14), BP
+	VMOVDQU64 Z3, (BP)(R15*1)
+	MOVQ 96(R14), BP
+	VMOVDQU64 Z4, (BP)(R15*1)
+	MOVQ 120(R14), BP
+	VMOVDQU64 Z5, (BP)(R15*1)
+	MOVQ 144(R14), BP
+	VMOVDQU64 Z6, (BP)(R15*1)
+	MOVQ 168(R14), BP
+	VMOVDQU64 Z7, (BP)(R15*1)
+	MOVQ 192(R14), BP
+	VMOVDQU64 Z8, (BP)(R15*1)
+	MOVQ 216(R14), BP
+	VMOVDQU64 Z9, (BP)(R15*1)
+
+	// Prepare for next loop
+	ADDQ $0x40, R15
+	DECQ AX
+	JNZ mulGFNI_10x10_64_loop
+	VZEROUPPER
+
+mulGFNI_10x10_64_end:
+	RET
+
+// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F
+TEXT ·mulGFNI_10x10_64Xor(SB), $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 112 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_10x10_64Xor_end
+	MOVQ in_base+24(FP), DX
+	MOVQ (DX), BX
+	MOVQ 24(DX), SI
+	MOVQ 48(DX), DI
+	MOVQ 72(DX), R8
+	MOVQ 96(DX), R9
+	MOVQ 120(DX), R10
+	MOVQ 144(DX), R11
+	MOVQ 168(DX), R12
+	MOVQ 192(DX), R13
+	MOVQ 216(DX), DX
+	MOVQ out_base+48(FP), R14
+	MOVQ out_base+48(FP), R14
+	MOVQ start+72(FP), R15
+
+	// Add start offset to input
+	ADDQ R15, BX
+	ADDQ R15, SI
+	ADDQ R15, DI
+	ADDQ R15, R8
+	ADDQ R15, R9
+	ADDQ R15, R10
+	ADDQ R15, R11
+	ADDQ R15, R12
+	ADDQ R15, R13
+	ADDQ R15, DX
+
+mulGFNI_10x10_64Xor_loop:
+	// Load 10 outputs
+	MOVQ (R14), BP
+	VMOVDQU64 (BP)(R15*1), Z0
+	MOVQ 24(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z1
+	MOVQ 48(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z2
+	MOVQ 72(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z3
+	MOVQ 96(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z4
+	MOVQ 120(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z5
+	MOVQ 144(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z6
+	MOVQ 168(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z7
+	MOVQ 192(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z8
+	MOVQ 216(R14), BP
+	VMOVDQU64 (BP)(R15*1), Z9
+
+	// Load and process 64 bytes from input 0 to 10 outputs
+	VMOVDQU64 (BX), Z10
+	ADDQ $0x40, BX
+	VBROADCASTF32X2 (CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 8(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 16(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 24(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 32(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 40(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 48(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 56(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 64(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 72(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 1 to 10 outputs
+	VMOVDQU64 (SI), Z10
+	ADDQ $0x40, SI
+	VBROADCASTF32X2 80(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 88(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 96(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 104(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 112(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 120(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 128(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 136(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 144(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 152(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 2 to 10 outputs
+	VMOVDQU64 (DI), Z10
+	ADDQ $0x40, DI
+	VBROADCASTF32X2 160(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 168(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 176(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 184(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 192(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 200(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 208(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 216(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 224(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 232(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 3 to 10 outputs
+	VMOVDQU64 (R8), Z10
+	ADDQ $0x40, R8
+	VBROADCASTF32X2 240(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 248(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 256(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 264(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 272(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 280(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 288(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 296(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 304(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 312(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 4 to 10 outputs
+	VMOVDQU64 (R9), Z10
+	ADDQ $0x40, R9
+	VBROADCASTF32X2 320(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 328(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 336(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 344(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 352(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 360(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 368(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 376(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 384(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 392(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 5 to 10 outputs
+	VMOVDQU64 (R10), Z10
+	ADDQ $0x40, R10
+	VBROADCASTF32X2 400(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 408(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 416(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 424(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 432(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 440(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 448(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 456(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 464(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 472(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 6 to 10 outputs
+	VMOVDQU64 (R11), Z10
+	ADDQ $0x40, R11
+	VBROADCASTF32X2 480(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 488(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 496(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 504(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 512(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 520(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 528(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 536(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 544(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 552(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 7 to 10 outputs
+	VMOVDQU64 (R12), Z10
+	ADDQ $0x40, R12
+	VBROADCASTF32X2 560(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 568(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 576(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 584(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 592(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 600(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 608(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 616(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 624(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 632(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 8 to 10 outputs
+	VMOVDQU64 (R13), Z10
+	ADDQ $0x40, R13
+	VBROADCASTF32X2 640(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 648(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 656(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 664(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 672(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 680(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 688(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 696(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 704(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 712(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Load and process 64 bytes from input 9 to 10 outputs
+	VMOVDQU64 (DX), Z10
+	ADDQ $0x40, DX
+	VBROADCASTF32X2 720(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z0, Z11, Z0
+	VBROADCASTF32X2 728(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z1, Z11, Z1
+	VBROADCASTF32X2 736(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z2, Z11, Z2
+	VBROADCASTF32X2 744(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z3, Z11, Z3
+	VBROADCASTF32X2 752(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z4, Z11, Z4
+	VBROADCASTF32X2 760(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z5, Z11, Z5
+	VBROADCASTF32X2 768(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z6, Z11, Z6
+	VBROADCASTF32X2 776(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z7, Z11, Z7
+	VBROADCASTF32X2 784(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z8, Z11, Z8
+	VBROADCASTF32X2 792(CX), Z11
+	VGF2P8AFFINEQB $0x00, Z10, Z11, Z11
+	VXORPD Z9, Z11, Z9
+
+	// Store 10 outputs
+	MOVQ (R14), BP
+	VMOVDQU64 Z0, (BP)(R15*1)
+	MOVQ 24(R14), BP
+	VMOVDQU64 Z1, (BP)(R15*1)
+	MOVQ 48(R14), BP
+	VMOVDQU64 Z2, (BP)(R15*1)
+	MOVQ 72(R14), BP
+	VMOVDQU64 Z3, (BP)(R15*1)
+	MOVQ 96(R14), BP
+	VMOVDQU64 Z4, (BP)(R15*1)
+	MOVQ 120(R14), BP
+	VMOVDQU64 Z5, (BP)(R15*1)
+	MOVQ 144(R14), BP
+	VMOVDQU64 Z6, (BP)(R15*1)
+	MOVQ 168(R14), BP
+	VMOVDQU64 Z7, (BP)(R15*1)
+	MOVQ 192(R14), BP
+	VMOVDQU64 Z8, (BP)(R15*1)
+	MOVQ 216(R14), BP
+	VMOVDQU64 Z9, (BP)(R15*1)
+
+	// Prepare for next loop
+	ADDQ $0x40, R15
+	DECQ AX
+	JNZ mulGFNI_10x10_64Xor_loop
+	VZEROUPPER
+
+mulGFNI_10x10_64Xor_end:
+	RET
+
 // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
 // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2
 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88
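The GFNI kernels above replace the AVX2 path's pair of nibble-table VPSHUFB lookups with a single VGF2P8AFFINEQB per (input, output) pair: every 64-bit word that VBROADCASTF32X2 pulls from `matrix` is an 8x8 bit-matrix over GF(2) encoding multiplication by one field constant. A minimal scalar sketch of one byte lane, assuming the architecturally documented layout for GF2P8AFFINEQB (row i of the matrix stored in byte 7-i of the qword, imm8 = 0); the constant and function names are illustrative, not taken from the generated tables:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// gf2p8affineByte models one byte lane of VGF2P8AFFINEQB with imm8 = 0:
	// result bit i is the parity of (matrix row i AND the source byte).
	func gf2p8affineByte(matrix uint64, src byte) byte {
		var out byte
		for i := 0; i < 8; i++ {
			row := byte(matrix >> (8 * (7 - i))) // row i lives in byte 7-i
			out |= byte(bits.OnesCount8(row&src)&1) << i
		}
		return out
	}

	func main() {
		// The identity bit-matrix maps every byte to itself; the real
		// table entries encode multiplication by GF(2^8) constants.
		const identity = uint64(0x0102040810204080)
		fmt.Printf("%#02x\n", gf2p8affineByte(identity, 0x57)) // prints 0x57
	}

One VGF2P8AFFINEQB applies this transform to all 64 bytes of a ZMM register, so a full (input, output) term costs only a broadcast, an affine multiply, and an XOR, with no table registers tied up.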
diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go
index 3078114b..ffc1bb1c 100644
--- a/galois_gen_switch_amd64.go
+++ b/galois_gen_switch_amd64.go
@@ -692,3 +692,679 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
 	}
 	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
 }
+
+func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+	n := (stop - start) & avxSizeMask
+
+	switch len(in) {
+	case 1:
+		switch len(out) {
+		case 1:
+			mulGFNI_1x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_1x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_1x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_1x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_1x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_1x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_1x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_1x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_1x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_1x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 2:
+		switch len(out) {
+		case 1:
+			mulGFNI_2x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_2x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_2x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_2x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_2x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_2x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_2x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_2x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_2x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_2x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 3:
+		switch len(out) {
+		case 1:
+			mulGFNI_3x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_3x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_3x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_3x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_3x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_3x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_3x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_3x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_3x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_3x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 4:
+		switch len(out) {
+		case 1:
+			mulGFNI_4x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_4x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_4x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_4x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_4x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_4x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_4x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_4x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_4x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_4x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 5:
+		switch len(out) {
+		case 1:
+			mulGFNI_5x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_5x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_5x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_5x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_5x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_5x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_5x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_5x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_5x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_5x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 6:
+		switch len(out) {
+		case 1:
+			mulGFNI_6x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_6x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_6x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_6x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_6x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_6x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_6x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_6x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_6x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_6x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 7:
+		switch len(out) {
+		case 1:
+			mulGFNI_7x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_7x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_7x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_7x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_7x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_7x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_7x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_7x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_7x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_7x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 8:
+		switch len(out) {
+		case 1:
+			mulGFNI_8x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_8x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_8x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_8x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_8x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_8x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_8x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_8x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_8x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_8x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 9:
+		switch len(out) {
+		case 1:
+			mulGFNI_9x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_9x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_9x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_9x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_9x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_9x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_9x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_9x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_9x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_9x10_64(matrix, in, out, start, n)
+			return n
+		}
+	case 10:
+		switch len(out) {
+		case 1:
+			mulGFNI_10x1_64(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_10x2_64(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_10x3_64(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_10x4_64(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_10x5_64(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_10x6_64(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_10x7_64(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_10x8_64(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_10x9_64(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_10x10_64(matrix, in, out, start, n)
+			return n
+		}
+	}
+	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+	n := (stop - start) & avxSizeMask
+
+	switch len(in) {
+	case 1:
+		switch len(out) {
+		case 1:
+			mulGFNI_1x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_1x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_1x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_1x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_1x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_1x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_1x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_1x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_1x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_1x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 2:
+		switch len(out) {
+		case 1:
+			mulGFNI_2x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_2x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_2x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_2x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_2x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_2x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_2x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_2x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_2x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_2x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 3:
+		switch len(out) {
+		case 1:
+			mulGFNI_3x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_3x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_3x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_3x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_3x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_3x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_3x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_3x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_3x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_3x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 4:
+		switch len(out) {
+		case 1:
+			mulGFNI_4x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_4x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_4x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_4x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_4x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_4x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_4x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_4x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_4x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_4x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 5:
+		switch len(out) {
+		case 1:
+			mulGFNI_5x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_5x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_5x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_5x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_5x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_5x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_5x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_5x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_5x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_5x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 6:
+		switch len(out) {
+		case 1:
+			mulGFNI_6x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_6x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_6x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_6x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_6x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_6x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_6x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_6x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_6x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_6x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 7:
+		switch len(out) {
+		case 1:
+			mulGFNI_7x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_7x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_7x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_7x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_7x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_7x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_7x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_7x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_7x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_7x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 8:
+		switch len(out) {
+		case 1:
+			mulGFNI_8x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_8x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_8x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_8x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_8x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_8x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_8x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_8x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_8x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_8x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 9:
+		switch len(out) {
+		case 1:
+			mulGFNI_9x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_9x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_9x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_9x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_9x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_9x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_9x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_9x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_9x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_9x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	case 10:
+		switch len(out) {
+		case 1:
+			mulGFNI_10x1_64Xor(matrix, in, out, start, n)
+			return n
+		case 2:
+			mulGFNI_10x2_64Xor(matrix, in, out, start, n)
+			return n
+		case 3:
+			mulGFNI_10x3_64Xor(matrix, in, out, start, n)
+			return n
+		case 4:
+			mulGFNI_10x4_64Xor(matrix, in, out, start, n)
+			return n
+		case 5:
+			mulGFNI_10x5_64Xor(matrix, in, out, start, n)
+			return n
+		case 6:
+			mulGFNI_10x6_64Xor(matrix, in, out, start, n)
+			return n
+		case 7:
+			mulGFNI_10x7_64Xor(matrix, in, out, start, n)
+			return n
+		case 8:
+			mulGFNI_10x8_64Xor(matrix, in, out, start, n)
+			return n
+		case 9:
+			mulGFNI_10x9_64Xor(matrix, in, out, start, n)
+			return n
+		case 10:
+			mulGFNI_10x10_64Xor(matrix, in, out, start, n)
+			return n
+		}
+	}
+	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
+}
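Both dispatchers return the number of bytes actually processed: `n` is masked to the kernels' block size before the call, so any unaligned tail per shard is left for the generic byte-wise path, mirroring how codeSomeShards consumes the return value below. A small sketch of that contract inside this package (the name encodeAligned is illustrative; the matrix is assumed to be packed input-major, entry i*len(out)+j for input i and output j, as the kernels read it):

	// encodeAligned runs the GFNI fast path on the aligned prefix and
	// reports where the byte-wise fallback must take over.
	func encodeAligned(matrix []uint64, in, out [][]byte) int {
		done := galMulSlicesGFNI(matrix, in, out, 0, len(in[0]))
		// bytes [done:len(in[0])) of every shard still need the generic path
		return done
	}

galMulSlicesGFNIXor follows the same contract but accumulates into the existing output bytes instead of overwriting them, which is what the per-block loop in codeSomeShards needs after the first input chunk.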
diff --git a/leopard8.go b/leopard8.go
index 53ec0bc6..136da672 100644
--- a/leopard8.go
+++ b/leopard8.go
@@ -769,10 +769,6 @@ func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte,
 		memclr(work[i])
 	}
 
-	// I tried splitting up the first few layers into L3-cache sized blocks but
-	// found that it only provides about 5% performance boost, which is not
-	// worth the extra complexity.
-
 	// Decimation in time: Unroll 2 layers at a time
 	dist := 1
 	dist4 := 4
diff --git a/options.go b/options.go
index e2fc09d6..a18ddfd1 100644
--- a/options.go
+++ b/options.go
@@ -15,15 +15,15 @@ type options struct {
 	shardSize int
 	perRound  int
 
-	useAVX512, useAVX2, useSSSE3, useSSE2 bool
-	useJerasureMatrix                     bool
-	usePAR1Matrix                         bool
-	useCauchy                             bool
-	fastOneParity                         bool
-	inversionCache                        bool
-	forcedInversionCache                  bool
-	customMatrix                          [][]byte
-	withLeopard                           leopardMode
+	useGFNI, useAVX512, useAVX2, useSSSE3, useSSE2 bool
+	useJerasureMatrix                              bool
+	usePAR1Matrix                                  bool
+	useCauchy                                      bool
+	fastOneParity                                  bool
+	inversionCache                                 bool
+	forcedInversionCache                           bool
+	customMatrix                                   [][]byte
+	withLeopard                                    leopardMode
 
 	// stream options
 	concReads  bool
@@ -42,6 +42,7 @@ var defaultOptions = options{
 	useSSE2:   cpuid.CPU.Supports(cpuid.SSE2),
 	useAVX2:   cpuid.CPU.Supports(cpuid.AVX2),
 	useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL),
+	useGFNI:   cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL, cpuid.GFNI, cpuid.AVX512DQ),
 }
 
 // leopardMode controls the use of leopard GF in encoding and decoding.
@@ -181,6 +182,14 @@ func WithAVX512(enabled bool) Option {
 	}
 }
 
+// WithGFNI allows to enable/disable AVX512+GFNI instructions.
+// If not set, GFNI will be turned on or off automatically based on CPU ID information.
+func WithGFNI(enabled bool) Option {
+	return func(o *options) {
+		o.useGFNI = enabled
+	}
+}
+
 // WithJerasureMatrix causes the encoder to build the Reed-Solomon-Vandermonde
 // matrix in the same way as done by the Jerasure library.
 // The first row and column of the coding matrix only contains 1's in this method
diff --git a/reedsolomon.go b/reedsolomon.go
index 2e7d8af1..05ad882e 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -753,6 +753,12 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
 		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
 }
 
+func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool {
+	return avx2CodeGen && r.o.useGFNI &&
+		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
+		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
+}
+
 // Multiplies a subset of rows from a coding matrix by a full set of
 // input totalShards to produce some output totalShards.
 // 'matrixRows' is The rows from the matrix to use.
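WithGFNI follows the pattern of the existing CPU-feature switches, so the new path can be pinned on or off regardless of what CPUID reports; combined with canGFNI above, that makes A/B benchmarking of the two code paths straightforward. A usage sketch from outside the package (shard counts illustrative):

	// Build one encoder with auto-detection and one with the GFNI
	// kernels forced off, e.g. to compare against the AVX2 path.
	func newEncoders() (gfni, avx2 reedsolomon.Encoder, err error) {
		gfni, err = reedsolomon.New(10, 4) // follows CPUID detection
		if err != nil {
			return
		}
		avx2, err = reedsolomon.New(10, 4, reedsolomon.WithGFNI(false))
		return
	}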
@@ -783,12 +789,18 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 	if end > len(inputs[0]) {
 		end = len(inputs[0])
 	}
-	if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
+	if r.canGFNI(byteCount, len(inputs), len(outputs)) {
+		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+		m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:])
+		start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
+		end = len(inputs[0])
+	} else if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
 		m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
 		start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
 		r.mPool.Put(m)
 		end = len(inputs[0])
 	} else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) {
+		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
 		end = len(inputs[0])
 		inIdx := 0
 		m := r.mPool.Get().([]byte)
@@ -806,11 +818,20 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 			if len(outPer) > maxAvx2Outputs {
 				outPer = outPer[:maxAvx2Outputs]
 			}
-			m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
-			if inIdx == 0 {
-				galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
+			if r.o.useGFNI {
+				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
+				if inIdx == 0 {
+					galMulSlicesGFNI(m, inPer, outPer, 0, byteCount)
+				} else {
+					galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount)
+				}
 			} else {
-				galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
+				m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
+				if inIdx == 0 {
+					galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
+				} else {
+					galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
+				}
 			}
 			start = byteCount & avxSizeMask
 			outIdx += len(outPer)
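With the dispatch in codeSomeShards in place, nothing changes for callers: Encode routes matrix multiplication through the GFNI kernels whenever canGFNI holds for the shard geometry and size. An end-to-end sketch using the public API (shard sizes illustrative):

	package main

	import (
		"fmt"
		"log"

		"github.com/klauspost/reedsolomon"
	)

	func main() {
		enc, err := reedsolomon.New(10, 4) // useGFNI defaults to CPUID detection
		if err != nil {
			log.Fatal(err)
		}
		shards := make([][]byte, 10+4)
		for i := range shards {
			shards[i] = make([]byte, 64<<10) // data shards would be filled by the caller
		}
		if err := enc.Encode(shards); err != nil {
			log.Fatal(err)
		}
		ok, err := enc.Verify(shards)
		fmt.Println("parity ok:", ok, err)
	}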