diff --git a/_gen/gf8.go b/_gen/gf8.go index e715de7a..a3e1c3d6 100644 --- a/_gen/gf8.go +++ b/_gen/gf8.go @@ -117,264 +117,265 @@ func genGF8() { VZEROUPPER() RET() } - { - x := [8]int{} - for skipMask := range x[:] { - var suffix = "avx2_" + fmt.Sprint(skipMask) - if ctx.avx512 { - suffix = "avx512_" + fmt.Sprint(skipMask) - } - TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) - Pragma("noescape") - var t01, t23, t02 table256 - // Load and expand tables - - if (skipMask & 1) == 0 { - tablePtr := Load(Param("t01"), GP64()) - t01.Lo, t01.Hi = YMM(), YMM() - // We need one register when loading all. - if !ctx.avx512 && skipMask == 0 { - t01.loadLo128 = &Mem{Base: tablePtr, Disp: 0} - } else { - VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t01.Lo) + + x := [8]int{} + for skipMask := range x[:] { + for _, avx512 := range []bool{false, true} { + ctx.avx512 = avx512 + { + var suffix = "avx2_" + fmt.Sprint(skipMask) + if ctx.avx512 { + suffix = "avx512_" + fmt.Sprint(skipMask) } - VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t01.Hi) - } - if (skipMask & 2) == 0 { - tablePtr := Load(Param("t23"), GP64()) - t23.Lo, t23.Hi = YMM(), YMM() - VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t23.Lo) - VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t23.Hi) - } - if (skipMask & 4) == 0 { - tablePtr := Load(Param("t02"), GP64()) - t02.Lo, t02.Hi = YMM(), YMM() - VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t02.Lo) - VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t02.Hi) - } - dist := Load(Param("dist"), GP64()) - - var work [4]reg.GPVirtual - workTable := Load(Param("work").Base(), GP64()) // &work[0] - bytes := GP64() - MOVQ(Mem{Base: workTable, Disp: 8}, bytes) - - offset := GP64() - XORQ(offset, offset) - for i := range work { - work[i] = GP64() - // work[i] = &workTable[dist*i] - MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) - if i < len(work)-1 { - ADDQ(dist, offset) + TEXT("ifftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) + Pragma("noescape") + var t01, t23, t02 table256 + // Load and expand tables + + if (skipMask & 1) == 0 { + tablePtr := Load(Param("t01"), GP64()) + t01.Lo, t01.Hi = YMM(), YMM() + // We need one register when loading all. 
+ if !ctx.avx512 && skipMask == 0 { + t01.loadLo128 = &Mem{Base: tablePtr, Disp: 0} + } else { + VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t01.Lo) + } + VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t01.Hi) + } + if (skipMask & 2) == 0 { + tablePtr := Load(Param("t23"), GP64()) + t23.Lo, t23.Hi = YMM(), YMM() + VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t23.Lo) + VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t23.Hi) + } + if (skipMask & 4) == 0 { + tablePtr := Load(Param("t02"), GP64()) + t02.Lo, t02.Hi = YMM(), YMM() + VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t02.Lo) + VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t02.Hi) + } + dist := Load(Param("dist"), GP64()) + + var work [4]reg.GPVirtual + workTable := Load(Param("work").Base(), GP64()) // &work[0] + bytes := GP64() + MOVQ(Mem{Base: workTable, Disp: 8}, bytes) + + offset := GP64() + XORQ(offset, offset) + for i := range work { + work[i] = GP64() + // work[i] = &workTable[dist*i] + MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if i < len(work)-1 { + ADDQ(dist, offset) + } } - } - // Generate mask - ctx.clrMask = YMM() - tmpMask := GP64() - MOVQ(U32(15), tmpMask) - MOVQ(tmpMask, ctx.clrMask.AsX()) - VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) - - Label("loop") - var workReg [4]reg.VecVirtual - var workReg2 [4]reg.VecVirtual - - workReg[0] = YMM() - workReg[1] = YMM() - workReg2[0] = YMM() - workReg2[1] = YMM() - - VMOVDQU(Mem{Base: work[0], Disp: 0}, workReg[0]) - VMOVDQU(Mem{Base: work[1], Disp: 0}, workReg[1]) - VMOVDQU(Mem{Base: work[0], Disp: 32}, workReg2[0]) - VMOVDQU(Mem{Base: work[1], Disp: 32}, workReg2[1]) - - // work1_reg = _mm256_xor_si256(work0_reg, work1_reg); - VPXOR(workReg[1], workReg[0], workReg[1]) - VPXOR(workReg2[1], workReg2[0], workReg2[1]) - if (skipMask & 1) == 0 { - t01.prepare() - leo8MulAdd256(ctx, workReg[0], workReg[1], t01) - leo8MulAdd256(ctx, workReg2[0], workReg2[1], t01) - } + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + + Label("loop") + var workReg [4]reg.VecVirtual + var workReg2 [4]reg.VecVirtual + + workReg[0] = YMM() + workReg[1] = YMM() + workReg2[0] = YMM() + workReg2[1] = YMM() + + VMOVDQU(Mem{Base: work[0], Disp: 0}, workReg[0]) + VMOVDQU(Mem{Base: work[1], Disp: 0}, workReg[1]) + VMOVDQU(Mem{Base: work[0], Disp: 32}, workReg2[0]) + VMOVDQU(Mem{Base: work[1], Disp: 32}, workReg2[1]) + + // work1_reg = _mm256_xor_si256(work0_reg, work1_reg); + VPXOR(workReg[1], workReg[0], workReg[1]) + VPXOR(workReg2[1], workReg2[0], workReg2[1]) + if (skipMask & 1) == 0 { + t01.prepare() + leo8MulAdd256(ctx, workReg[0], workReg[1], t01) + leo8MulAdd256(ctx, workReg2[0], workReg2[1], t01) + } - workReg[2] = YMM() - workReg[3] = YMM() - workReg2[2] = YMM() - workReg2[3] = YMM() - VMOVDQU(Mem{Base: work[2], Disp: 0}, workReg[2]) - VMOVDQU(Mem{Base: work[3], Disp: 0}, workReg[3]) - VMOVDQU(Mem{Base: work[2], Disp: 32}, workReg2[2]) - VMOVDQU(Mem{Base: work[3], Disp: 32}, workReg2[3]) - - //work3_reg = _mm256_xor_si256(work2_reg, work3_reg) - VPXOR(workReg[2], workReg[3], workReg[3]) - VPXOR(workReg2[2], workReg2[3], workReg2[3]) - if (skipMask & 2) == 0 { - leo8MulAdd256(ctx, workReg[2], workReg[3], t23) - leo8MulAdd256(ctx, workReg2[2], workReg2[3], t23) - } + workReg[2] = YMM() + workReg[3] = YMM() + workReg2[2] = YMM() + workReg2[3] = YMM() + VMOVDQU(Mem{Base: work[2], Disp: 0}, workReg[2]) + VMOVDQU(Mem{Base: work[3], Disp: 0}, workReg[3]) + VMOVDQU(Mem{Base: 
work[2], Disp: 32}, workReg2[2]) + VMOVDQU(Mem{Base: work[3], Disp: 32}, workReg2[3]) + + //work3_reg = _mm256_xor_si256(work2_reg, work3_reg) + VPXOR(workReg[2], workReg[3], workReg[3]) + VPXOR(workReg2[2], workReg2[3], workReg2[3]) + if (skipMask & 2) == 0 { + leo8MulAdd256(ctx, workReg[2], workReg[3], t23) + leo8MulAdd256(ctx, workReg2[2], workReg2[3], t23) + } - // Second layer: - // work2_reg = _mm256_xor_si256(work0_reg, work2_reg); - // work3_reg = _mm256_xor_si256(work1_reg, work3_reg); - VPXOR(workReg[0], workReg[2], workReg[2]) - VPXOR(workReg[1], workReg[3], workReg[3]) - VPXOR(workReg2[0], workReg2[2], workReg2[2]) - VPXOR(workReg2[1], workReg2[3], workReg2[3]) - - if (skipMask & 4) == 0 { - leo8MulAdd256(ctx, workReg[0], workReg[2], t02) - leo8MulAdd256(ctx, workReg[1], workReg[3], t02) - leo8MulAdd256(ctx, workReg2[0], workReg2[2], t02) - leo8MulAdd256(ctx, workReg2[1], workReg2[3], t02) - } + // Second layer: + // work2_reg = _mm256_xor_si256(work0_reg, work2_reg); + // work3_reg = _mm256_xor_si256(work1_reg, work3_reg); + VPXOR(workReg[0], workReg[2], workReg[2]) + VPXOR(workReg[1], workReg[3], workReg[3]) + VPXOR(workReg2[0], workReg2[2], workReg2[2]) + VPXOR(workReg2[1], workReg2[3], workReg2[3]) + + if (skipMask & 4) == 0 { + leo8MulAdd256(ctx, workReg[0], workReg[2], t02) + leo8MulAdd256(ctx, workReg[1], workReg[3], t02) + leo8MulAdd256(ctx, workReg2[0], workReg2[2], t02) + leo8MulAdd256(ctx, workReg2[1], workReg2[3], t02) + } - // Store + Next loop: - for i := range work { - VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) - VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) - ADDQ(U8(64), work[i]) - } + // Store + Next loop: + for i := range work { + VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) + ADDQ(U8(64), work[i]) + } - SUBQ(U8(64), bytes) - JA(LabelRef("loop")) + SUBQ(U8(64), bytes) + JA(LabelRef("loop")) - VZEROUPPER() - RET() - } - } - { - x := [8]int{} - for skipMask := range x[:] { - var suffix = "avx2_" + fmt.Sprint(skipMask) - if ctx.avx512 { - suffix = "avx512_" + fmt.Sprint(skipMask) + VZEROUPPER() + RET() } - TEXT("fftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) - Pragma("noescape") - var t01, t23, t02 table256 - // Load and expand tables - - if (skipMask & 2) == 0 { - tablePtr := Load(Param("t01"), GP64()) - t01.Lo, t01.Hi = YMM(), YMM() - if !ctx.avx512 && skipMask == 0 { - t01.loadLo128 = &Mem{Base: tablePtr, Disp: 0} - } else { - // We need additional registers - VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t01.Lo) + { + var suffix = "avx2_" + fmt.Sprint(skipMask) + if ctx.avx512 { + suffix = "avx512_" + fmt.Sprint(skipMask) } - VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t01.Hi) - } - if (skipMask & 4) == 0 { - tablePtr := Load(Param("t23"), GP64()) - t23.Lo, t23.Hi = YMM(), YMM() - if !ctx.avx512 && skipMask == 0 { - t23.loadLo128 = &Mem{Base: tablePtr, Disp: 0} - } else { - VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t23.Lo) + TEXT("fftDIT48_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, t01, t23, t02 *[2*16]uint8)")) + Pragma("noescape") + var t01, t23, t02 table256 + // Load and expand tables + + if (skipMask & 2) == 0 { + tablePtr := Load(Param("t01"), GP64()) + t01.Lo, t01.Hi = YMM(), YMM() + if !ctx.avx512 && skipMask == 0 { + t01.loadLo128 = &Mem{Base: tablePtr, Disp: 0} + } else { + // We need additional registers + VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t01.Lo) + } + VBROADCASTI128(Mem{Base: 
tablePtr, Disp: 16}, t01.Hi) } - VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t23.Hi) - } - if (skipMask & 1) == 0 { - tablePtr := Load(Param("t02"), GP64()) + if (skipMask & 4) == 0 { + tablePtr := Load(Param("t23"), GP64()) + t23.Lo, t23.Hi = YMM(), YMM() + if !ctx.avx512 && skipMask == 0 { + t23.loadLo128 = &Mem{Base: tablePtr, Disp: 0} + } else { + VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t23.Lo) + } + VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t23.Hi) + } + if (skipMask & 1) == 0 { + tablePtr := Load(Param("t02"), GP64()) - t02.Lo, t02.Hi = YMM(), YMM() - VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t02.Lo) - VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t02.Hi) - } - dist := Load(Param("dist"), GP64()) - - var work [4]reg.GPVirtual - workTable := Load(Param("work").Base(), GP64()) // &work[0] - bytes := GP64() - MOVQ(Mem{Base: workTable, Disp: 8}, bytes) - - offset := GP64() - XORQ(offset, offset) - for i := range work { - work[i] = GP64() - // work[i] = &workTable[dist*i] - MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) - if i < len(work)-1 { - ADDQ(dist, offset) + t02.Lo, t02.Hi = YMM(), YMM() + VBROADCASTI128(Mem{Base: tablePtr, Disp: 0}, t02.Lo) + VBROADCASTI128(Mem{Base: tablePtr, Disp: 16}, t02.Hi) + } + dist := Load(Param("dist"), GP64()) + + var work [4]reg.GPVirtual + workTable := Load(Param("work").Base(), GP64()) // &work[0] + bytes := GP64() + MOVQ(Mem{Base: workTable, Disp: 8}, bytes) + + offset := GP64() + XORQ(offset, offset) + for i := range work { + work[i] = GP64() + // work[i] = &workTable[dist*i] + MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if i < len(work)-1 { + ADDQ(dist, offset) + } } - } - // Generate mask - ctx.clrMask = YMM() - tmpMask := GP64() - MOVQ(U32(15), tmpMask) - MOVQ(tmpMask, ctx.clrMask.AsX()) - VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) - Label("loop") - var workReg [4]reg.VecVirtual - var workReg2 [4]reg.VecVirtual + Label("loop") + var workReg [4]reg.VecVirtual + var workReg2 [4]reg.VecVirtual - for i := range workReg { - workReg[i] = YMM() - workReg2[i] = YMM() - } + for i := range workReg { + workReg[i] = YMM() + workReg2[i] = YMM() + } - VMOVDQU(Mem{Base: work[0], Disp: 0}, workReg[0]) - VMOVDQU(Mem{Base: work[0], Disp: 32}, workReg2[0]) - VMOVDQU(Mem{Base: work[2], Disp: 0}, workReg[2]) - VMOVDQU(Mem{Base: work[2], Disp: 32}, workReg2[2]) - VMOVDQU(Mem{Base: work[1], Disp: 0}, workReg[1]) - VMOVDQU(Mem{Base: work[1], Disp: 32}, workReg2[1]) - VMOVDQU(Mem{Base: work[3], Disp: 0}, workReg[3]) - VMOVDQU(Mem{Base: work[3], Disp: 32}, workReg2[3]) - - // work1_reg = _mm256_xor_si256(work0_reg, work1_reg); - if (skipMask & 1) == 0 { - leo8MulAdd256(ctx, workReg[0], workReg[2], t02) - leo8MulAdd256(ctx, workReg2[0], workReg2[2], t02) - - leo8MulAdd256(ctx, workReg[1], workReg[3], t02) - leo8MulAdd256(ctx, workReg2[1], workReg2[3], t02) - } - // work2_reg = _mm256_xor_si256(work0_reg, work2_reg); - // work3_reg = _mm256_xor_si256(work1_reg, work3_reg); - VPXOR(workReg[0], workReg[2], workReg[2]) - VPXOR(workReg[1], workReg[3], workReg[3]) - VPXOR(workReg2[0], workReg2[2], workReg2[2]) - VPXOR(workReg2[1], workReg2[3], workReg2[3]) - - // Second layer: - if (skipMask & 2) == 0 { - t01.prepare() - leo8MulAdd256(ctx, workReg[0], workReg[1], t01) - leo8MulAdd256(ctx, workReg2[0], workReg2[1], t01) - } - //work1_reg = _mm256_xor_si256(work0_reg, 
work1_reg); - VPXOR(workReg[1], workReg[0], workReg[1]) - VPXOR(workReg2[1], workReg2[0], workReg2[1]) - - if (skipMask & 4) == 0 { - t23.prepare() - leo8MulAdd256(ctx, workReg[2], workReg[3], t23) - leo8MulAdd256(ctx, workReg2[2], workReg2[3], t23) - } - // work3_reg = _mm256_xor_si256(work2_reg, work3_reg); - VPXOR(workReg[2], workReg[3], workReg[3]) - VPXOR(workReg2[2], workReg2[3], workReg2[3]) - - // Store + Next loop: - for i := range work { - VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) - VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) - ADDQ(U8(64), work[i]) - } + VMOVDQU(Mem{Base: work[0], Disp: 0}, workReg[0]) + VMOVDQU(Mem{Base: work[0], Disp: 32}, workReg2[0]) + VMOVDQU(Mem{Base: work[2], Disp: 0}, workReg[2]) + VMOVDQU(Mem{Base: work[2], Disp: 32}, workReg2[2]) + VMOVDQU(Mem{Base: work[1], Disp: 0}, workReg[1]) + VMOVDQU(Mem{Base: work[1], Disp: 32}, workReg2[1]) + VMOVDQU(Mem{Base: work[3], Disp: 0}, workReg[3]) + VMOVDQU(Mem{Base: work[3], Disp: 32}, workReg2[3]) + + // work1_reg = _mm256_xor_si256(work0_reg, work1_reg); + if (skipMask & 1) == 0 { + leo8MulAdd256(ctx, workReg[0], workReg[2], t02) + leo8MulAdd256(ctx, workReg2[0], workReg2[2], t02) + + leo8MulAdd256(ctx, workReg[1], workReg[3], t02) + leo8MulAdd256(ctx, workReg2[1], workReg2[3], t02) + } + // work2_reg = _mm256_xor_si256(work0_reg, work2_reg); + // work3_reg = _mm256_xor_si256(work1_reg, work3_reg); + VPXOR(workReg[0], workReg[2], workReg[2]) + VPXOR(workReg[1], workReg[3], workReg[3]) + VPXOR(workReg2[0], workReg2[2], workReg2[2]) + VPXOR(workReg2[1], workReg2[3], workReg2[3]) + + // Second layer: + if (skipMask & 2) == 0 { + t01.prepare() + leo8MulAdd256(ctx, workReg[0], workReg[1], t01) + leo8MulAdd256(ctx, workReg2[0], workReg2[1], t01) + } + //work1_reg = _mm256_xor_si256(work0_reg, work1_reg); + VPXOR(workReg[1], workReg[0], workReg[1]) + VPXOR(workReg2[1], workReg2[0], workReg2[1]) + + if (skipMask & 4) == 0 { + t23.prepare() + leo8MulAdd256(ctx, workReg[2], workReg[3], t23) + leo8MulAdd256(ctx, workReg2[2], workReg2[3], t23) + } + // work3_reg = _mm256_xor_si256(work2_reg, work3_reg); + VPXOR(workReg[2], workReg[3], workReg[3]) + VPXOR(workReg2[2], workReg2[3], workReg2[3]) + + // Store + Next loop: + for i := range work { + VMOVDQU(workReg[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workReg2[i], Mem{Base: work[i], Disp: 32}) + ADDQ(U8(64), work[i]) + } - SUBQ(U8(64), bytes) - JA(LabelRef("loop")) + SUBQ(U8(64), bytes) + JA(LabelRef("loop")) - VZEROUPPER() - RET() + VZEROUPPER() + RET() + } } } } @@ -384,12 +385,20 @@ func leo8MulAdd256(ctx gf8ctx, x, y reg.VecVirtual, table table256) { Comment("LEO_MULADD_256") lo, hi := YMM(), YMM() - VPAND(y, ctx.clrMask, lo) + if ctx.avx512 { + VPANDD(y, ctx.clrMask, lo) + } else { + VPAND(y, ctx.clrMask, lo) + } VPSRLQ(U8(4), y, hi) VPSHUFB(lo, table.Lo, lo) // Do high - VPAND(hi, ctx.clrMask, hi) + if ctx.avx512 { + VPANDD(hi, ctx.clrMask, hi) + } else { + VPAND(hi, ctx.clrMask, hi) + } VPSHUFB(hi, table.Hi, hi) if ctx.avx512 { VPTERNLOGD(U8(0x96), lo, hi, x) diff --git a/galois_amd64.go b/galois_amd64.go index 8fc0debb..fd3eea7f 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -223,6 +223,42 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio return } + if o.useAVX512 { + // Note that these currently require that length is multiple of 64. 
+ t01 := &multiply256LUT8[log_m01] + t23 := &multiply256LUT8[log_m23] + t02 := &multiply256LUT8[log_m02] + if log_m01 == modulus8 { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_avx512_7(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx512_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_avx512_5(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx512_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_avx512_6(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx512_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_avx512_4(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx512_0(work, dist*24, t01, t23, t02) + } + } + } + return + } if o.useAVX2 { // Note that these currently require that length is multiple of 64. t01 := &multiply256LUT8[log_m01] @@ -342,10 +378,46 @@ func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *option return } - t01 := &multiply256LUT8[log_m01] - t23 := &multiply256LUT8[log_m23] - t02 := &multiply256LUT8[log_m02] + if o.useAVX512 { + t01 := &multiply256LUT8[log_m01] + t23 := &multiply256LUT8[log_m23] + t02 := &multiply256LUT8[log_m02] + // Note that these currently require that length is multiple of 64. + if log_m02 == modulus8 { + if log_m01 == modulus8 { + if log_m23 == modulus8 { + fftDIT48_avx512_7(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx512_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus8 { + fftDIT48_avx512_5(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx512_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m01 == modulus8 { + if log_m23 == modulus8 { + fftDIT48_avx512_6(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx512_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus8 { + fftDIT48_avx512_4(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx512_0(work, dist*24, t01, t23, t02) + } + } + } + return + } if o.useAVX2 { + t01 := &multiply256LUT8[log_m01] + t23 := &multiply256LUT8[log_m23] + t02 := &multiply256LUT8[log_m02] // Note that these currently require that length is multiple of 64. 
if log_m02 == modulus8 { if log_m01 == modulus8 { diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 3574ea3b..664a34c3 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -1560,46 +1560,94 @@ func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8) func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func ifftDIT48_avx512_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func fftDIT48_avx512_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func ifftDIT48_avx512_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func fftDIT48_avx512_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape -func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +func ifftDIT48_avx512_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx512_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) //go:noescape func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_avx512_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx512_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_avx512_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx512_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + 
//go:noescape func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_avx512_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx512_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +//go:noescape +func ifftDIT48_avx512_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx512_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + //go:noescape func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx512_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx512_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index ad64a695..e9736998 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -67827,6 +67827,381 @@ loop: VZEROUPPER RET +// func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_0(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 16(AX), Y0 + MOVQ t23+40(FP), CX + VBROADCASTI128 16(CX), Y1 + MOVQ t02+48(FP), DX + VBROADCASTI128 (DX), Y2 + VBROADCASTI128 16(DX), Y3 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y10 + VMOVDQU (R9), Y7 + VMOVDQU 32(R9), Y8 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + VBROADCASTI128 (AX), Y13 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y14 + VPSRLQ $0x04, Y7, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y0, Y15 + XOR3WAY( $0x00, Y14, Y15, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y14 + VPSRLQ $0x04, Y8, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y0, Y15 + XOR3WAY( $0x00, Y14, Y15, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + VBROADCASTI128 (CX), Y13 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y1, Y15 + XOR3WAY( 
$0x00, Y14, Y15, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y14 + VPSRLQ $0x04, Y12, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y1, Y15 + XOR3WAY( $0x00, Y14, Y15, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (R9) + VMOVDQU Y8, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y9, (R10) + VMOVDQU Y10, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y11, (DX) + VMOVDQU Y12, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx512_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx512_0(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y4 + VBROADCASTI128 16(AX), Y5 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X6 + VPBROADCASTB X6, Y6 + +loop: + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU 32(SI), Y9 + VMOVDQU 32(DI), Y10 + VPXOR Y8, Y7, Y8 + VPXOR Y10, Y9, Y10 + + // LEO_MULADD_256 + VPANDD Y8, Y6, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y6, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y7 + + // LEO_MULADD_256 + VPANDD Y10, Y6, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y6, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y9 + VMOVDQU (R8), Y11 + VMOVDQU (AX), Y12 + VMOVDQU 32(R8), Y13 + VMOVDQU 32(AX), Y14 + VPXOR Y11, Y12, Y12 + VPXOR Y13, Y14, Y14 + + // LEO_MULADD_256 + VPANDD Y12, Y6, Y15 + VPSRLQ $0x04, Y12, Y16 + VPSHUFB Y15, Y2, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y3, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y11 + + // LEO_MULADD_256 + VPANDD Y14, Y6, Y15 + VPSRLQ $0x04, Y14, Y16 + VPSHUFB Y15, Y2, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y3, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y13 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VPXOR Y9, Y13, Y13 + VPXOR Y10, Y14, Y14 + + // LEO_MULADD_256 + VPANDD Y11, Y6, Y15 + VPSRLQ $0x04, Y11, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y7 + + // LEO_MULADD_256 + VPANDD Y12, Y6, Y15 + VPSRLQ $0x04, Y12, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y8 + + // LEO_MULADD_256 + VPANDD Y13, Y6, Y15 + VPSRLQ $0x04, Y13, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y9 + + // LEO_MULADD_256 + VPANDD Y14, Y6, Y15 + VPSRLQ $0x04, Y14, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y10 + VMOVDQU Y7, (SI) + VMOVDQU Y9, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y8, (DI) + VMOVDQU Y10, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y11, (R8) + VMOVDQU Y13, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y12, (AX) + VMOVDQU Y14, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx512_0(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ 
t02+48(FP), AX + VBROADCASTI128 (AX), Y4 + VBROADCASTI128 16(AX), Y5 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X6 + VPBROADCASTB X6, Y6 + +loop: + VMOVDQU (SI), Y7 + VMOVDQU 32(SI), Y8 + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y12 + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y10 + VMOVDQU (AX), Y13 + VMOVDQU 32(AX), Y14 + + // LEO_MULADD_256 + VPANDD Y11, Y6, Y15 + VPSRLQ $0x04, Y11, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y7 + + // LEO_MULADD_256 + VPANDD Y12, Y6, Y15 + VPSRLQ $0x04, Y12, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y8 + + // LEO_MULADD_256 + VPANDD Y13, Y6, Y15 + VPSRLQ $0x04, Y13, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y9 + + // LEO_MULADD_256 + VPANDD Y14, Y6, Y15 + VPSRLQ $0x04, Y14, Y16 + VPSHUFB Y15, Y4, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y5, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y9, Y13, Y13 + VPXOR Y8, Y12, Y12 + VPXOR Y10, Y14, Y14 + + // LEO_MULADD_256 + VPANDD Y9, Y6, Y15 + VPSRLQ $0x04, Y9, Y16 + VPSHUFB Y15, Y0, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y1, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y7 + + // LEO_MULADD_256 + VPANDD Y10, Y6, Y15 + VPSRLQ $0x04, Y10, Y16 + VPSHUFB Y15, Y0, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y1, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y8 + VPXOR Y9, Y7, Y9 + VPXOR Y10, Y8, Y10 + + // LEO_MULADD_256 + VPANDD Y13, Y6, Y15 + VPSRLQ $0x04, Y13, Y16 + VPSHUFB Y15, Y2, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y3, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y11 + + // LEO_MULADD_256 + VPANDD Y14, Y6, Y15 + VPSRLQ $0x04, Y14, Y16 + VPSHUFB Y15, Y2, Y15 + VPANDD Y16, Y6, Y16 + VPSHUFB Y16, Y3, Y16 + VPTERNLOGD $0x96, Y15, Y16, Y12 + VPXOR Y11, Y13, Y13 + VPXOR Y12, Y14, Y14 + VMOVDQU Y7, (SI) + VMOVDQU Y8, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y9, (DI) + VMOVDQU Y10, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y11, (R8) + VMOVDQU Y12, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y13, (AX) + VMOVDQU Y14, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + // func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·ifftDIT48_avx2_1(SB), NOSPLIT, $0-56 @@ -67933,13 +68308,13 @@ loop: VZEROUPPER RET -// func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56 +TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56 MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 - MOVQ t02+48(FP), AX + MOVQ t23+40(FP), AX VBROADCASTI128 (AX), Y2 VBROADCASTI128 16(AX), Y3 MOVQ dist+24(FP), AX @@ -67959,53 +68334,35 @@ TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56 loop: VMOVDQU (SI), Y5 - VMOVDQU (DI), Y6 - VMOVDQU 32(SI), Y7 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 VMOVDQU 32(DI), Y8 - VPXOR Y6, Y5, Y6 - VPXOR Y8, Y7, Y8 - - // LEO_MULADD_256 - VPAND Y6, Y4, Y9 - VPSRLQ $0x04, Y6, Y10 - VPSHUFB Y9, Y0, Y9 - VPAND Y10, Y4, Y10 - VPSHUFB Y10, Y1, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + 
VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 // LEO_MULADD_256 - VPAND Y8, Y4, Y9 - VPSRLQ $0x04, Y8, Y10 - VPSHUFB Y9, Y0, Y9 - VPAND Y10, Y4, Y10 - VPSHUFB Y10, Y1, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) - VMOVDQU (R8), Y9 - VMOVDQU (AX), Y10 - VMOVDQU 32(R8), Y11 - VMOVDQU 32(AX), Y12 - VPXOR Y9, Y10, Y10 - VPXOR Y11, Y12, Y12 - VPXOR Y5, Y9, Y9 - VPXOR Y6, Y10, Y10 - VPXOR Y7, Y11, Y11 - VPXOR Y8, Y12, Y12 - - // LEO_MULADD_256 - VPAND Y9, Y4, Y13 - VPSRLQ $0x04, Y9, Y14 - VPSHUFB Y13, Y2, Y13 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 + VPSHUFB Y14, Y1, Y14 XOR3WAY( $0x00, Y13, Y14, Y5) // LEO_MULADD_256 - VPAND Y10, Y4, Y13 - VPSRLQ $0x04, Y10, Y14 - VPSHUFB Y13, Y2, Y13 + VPAND Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 + VPSHUFB Y14, Y1, Y14 XOR3WAY( $0x00, Y13, Y14, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 // LEO_MULADD_256 VPAND Y11, Y4, Y13 @@ -68013,7 +68370,7 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y7) + XOR3WAY( $0x00, Y13, Y14, Y9) // LEO_MULADD_256 VPAND Y12, Y4, Y13 @@ -68021,17 +68378,19 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y8) + XOR3WAY( $0x00, Y13, Y14, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 VMOVDQU Y5, (SI) - VMOVDQU Y7, 32(SI) + VMOVDQU Y6, 32(SI) ADDQ $0x40, SI - VMOVDQU Y6, (DI) + VMOVDQU Y7, (DI) VMOVDQU Y8, 32(DI) ADDQ $0x40, DI VMOVDQU Y9, (R8) - VMOVDQU Y11, 32(R8) + VMOVDQU Y10, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y10, (AX) + VMOVDQU Y11, (AX) VMOVDQU Y12, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX @@ -68039,12 +68398,15 @@ loop: VZEROUPPER RET -// func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_avx512_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT48_avx2_3(SB), NOSPLIT, $0-56 - MOVQ t02+48(FP), AX +TEXT ·ifftDIT48_avx512_1(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX @@ -68057,82 +68419,188 @@ TEXT ·ifftDIT48_avx2_3(SB), NOSPLIT, $0-56 ADDQ AX, BX MOVQ (CX)(BX*1), AX MOVQ $0x0000000f, CX - MOVQ CX, X2 - VPBROADCASTB X2, Y2 + MOVQ CX, X4 + VPBROADCASTB X4, Y4 loop: - VMOVDQU (SI), Y3 - VMOVDQU (DI), Y4 - VMOVDQU 32(SI), Y5 - VMOVDQU 32(DI), Y6 - VPXOR Y4, Y3, Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 VPXOR Y6, Y5, Y6 - VMOVDQU (R8), Y7 - VMOVDQU (AX), Y8 - VMOVDQU 32(R8), Y9 - VMOVDQU 32(AX), Y10 - VPXOR Y7, Y8, Y8 + VPXOR Y8, Y7, Y8 + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 VPXOR Y9, Y10, Y10 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 + VPXOR Y11, Y12, Y12 + + // LEO_MULADD_256 + VPANDD Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y9 + + // LEO_MULADD_256 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y11 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPANDD Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 
+ VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y5 + + // LEO_MULADD_256 + VPANDD Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y6 + + // LEO_MULADD_256 + VPANDD Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y7 + + // LEO_MULADD_256 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y8 + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx512_1(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 // LEO_MULADD_256 - VPAND Y7, Y2, Y11 - VPSRLQ $0x04, Y7, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPANDD Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y5 // LEO_MULADD_256 - VPAND Y8, Y2, Y11 - VPSRLQ $0x04, Y8, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPANDD Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y6 + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 // LEO_MULADD_256 - VPAND Y9, Y2, Y11 - VPSRLQ $0x04, Y9, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPANDD Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y9 // LEO_MULADD_256 - VPAND Y10, Y2, Y11 - VPSRLQ $0x04, Y10, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) - VMOVDQU Y3, (SI) - VMOVDQU Y5, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y4, (DI) - VMOVDQU Y6, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y7, (R8) - VMOVDQU Y9, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y8, (AX) - VMOVDQU Y10, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JA loop + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y10 + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop VZEROUPPER 
RET -// func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56 +TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56 MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 - MOVQ t23+40(FP), AX + MOVQ t02+48(FP), AX VBROADCASTI128 (AX), Y2 VBROADCASTI128 16(AX), Y3 MOVQ dist+24(FP), AX @@ -68179,6 +68647,18 @@ loop: VMOVDQU 32(AX), Y12 VPXOR Y9, Y10, Y10 VPXOR Y11, Y12, Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) // LEO_MULADD_256 VPAND Y10, Y4, Y13 @@ -68186,7 +68666,15 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y9) + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) // LEO_MULADD_256 VPAND Y12, Y4, Y13 @@ -68194,11 +68682,7 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y11) - VPXOR Y5, Y9, Y9 - VPXOR Y6, Y10, Y10 - VPXOR Y7, Y11, Y11 - VPXOR Y8, Y12, Y12 + XOR3WAY( $0x00, Y13, Y14, Y8) VMOVDQU Y5, (SI) VMOVDQU Y7, 32(SI) ADDQ $0x40, SI @@ -68216,12 +68700,15 @@ loop: VZEROUPPER RET -// func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56 +TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56 MOVQ t23+40(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX @@ -68234,65 +68721,309 @@ TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56 ADDQ AX, BX MOVQ (CX)(BX*1), AX MOVQ $0x0000000f, CX - MOVQ CX, X2 - VPBROADCASTB X2, Y2 + MOVQ CX, X4 + VPBROADCASTB X4, Y4 loop: - VMOVDQU (SI), Y3 - VMOVDQU (DI), Y4 - VMOVDQU 32(SI), Y5 - VMOVDQU 32(DI), Y6 - VPXOR Y4, Y3, Y4 - VPXOR Y6, Y5, Y6 - VMOVDQU (R8), Y7 - VMOVDQU (AX), Y8 - VMOVDQU 32(R8), Y9 - VMOVDQU 32(AX), Y10 - VPXOR Y7, Y8, Y8 - VPXOR Y9, Y10, Y10 + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 // LEO_MULADD_256 - VPAND Y8, Y2, Y11 - VPSRLQ $0x04, Y8, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) // LEO_MULADD_256 - VPAND Y10, Y2, Y11 - VPSRLQ $0x04, Y10, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + 
+ // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 VPXOR Y6, Y10, Y10 - VMOVDQU Y3, (SI) - VMOVDQU Y5, 32(SI) + VPXOR Y8, Y12, Y12 + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) ADDQ $0x40, SI - VMOVDQU Y4, (DI) - VMOVDQU Y6, 32(DI) + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) ADDQ $0x40, DI - VMOVDQU Y7, (R8) - VMOVDQU Y9, 32(R8) + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y8, (AX) - VMOVDQU Y10, 32(AX) + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JA loop VZEROUPPER RET -// func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_avx512_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT48_avx2_6(SB), NOSPLIT, $0-56 +TEXT ·ifftDIT48_avx512_2(SB), NOSPLIT, $0-56 MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + + // LEO_MULADD_256 + VPANDD Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPANDD Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + VPTERNLOGD $0x96, Y9, Y10, Y5 + + // LEO_MULADD_256 + VPANDD Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPANDD Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + VPTERNLOGD $0x96, Y9, Y10, Y7 + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPANDD Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y5 + + // LEO_MULADD_256 + VPANDD Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y6 + + // LEO_MULADD_256 + VPANDD Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y7 + + // LEO_MULADD_256 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y8 + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) 
+// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx512_2(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + + // LEO_MULADD_256 + VPANDD Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y5 + + // LEO_MULADD_256 + VPANDD Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y6 + + // LEO_MULADD_256 + VPANDD Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y7 + + // LEO_MULADD_256 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + + // LEO_MULADD_256 + VPANDD Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y9 + + // LEO_MULADD_256 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y10 + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_3(SB), NOSPLIT, $0-56 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX @@ -68315,22 +69046,6 @@ loop: VMOVDQU 32(DI), Y6 VPXOR Y4, Y3, Y4 VPXOR Y6, Y5, Y6 - - // LEO_MULADD_256 - VPAND Y4, Y2, Y7 - VPSRLQ $0x04, Y4, Y8 - VPSHUFB Y7, Y0, Y7 - VPAND Y8, Y2, Y8 - VPSHUFB Y8, Y1, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) - - // LEO_MULADD_256 - VPAND Y6, Y2, Y7 - VPSRLQ $0x04, Y6, Y8 - VPSHUFB Y7, Y0, Y7 - VPAND Y8, Y2, Y8 - VPSHUFB Y8, Y1, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) VMOVDQU (R8), Y7 VMOVDQU (AX), Y8 VMOVDQU 32(R8), Y9 @@ -68341,6 +69056,38 @@ loop: VPXOR Y4, Y8, Y8 VPXOR Y5, Y9, Y9 VPXOR Y6, Y10, Y10 + + // LEO_MULADD_256 + VPAND Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + 
VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU Y3, (SI) VMOVDQU Y5, 32(SI) ADDQ $0x40, SI @@ -68358,100 +69105,288 @@ loop: VZEROUPPER RET -// func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) -// Requires: AVX, AVX2, SSE2 -TEXT ·ifftDIT48_avx2_7(SB), NOSPLIT, $0-56 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 +// func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_3(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 loop: - VMOVDQU (SI), Y0 - VMOVDQU (DI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU 32(DI), Y3 - VPXOR Y1, Y0, Y1 - VPXOR Y3, Y2, Y3 - VMOVDQU (R8), Y4 - VMOVDQU (AX), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU 32(AX), Y7 - VPXOR Y4, Y5, Y5 - VPXOR Y6, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 VPXOR Y3, Y7, Y7 - VMOVDQU Y0, (SI) - VMOVDQU Y2, 32(SI) + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) ADDQ $0x40, SI - VMOVDQU Y1, (DI) - VMOVDQU Y3, 32(DI) + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) ADDQ $0x40, DI - VMOVDQU Y4, (R8) - VMOVDQU Y6, 32(R8) + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y5, (AX) - VMOVDQU Y7, 32(AX) + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JA loop VZEROUPPER RET -// func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_avx512_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_0(SB), NOSPLIT, $0-56 +TEXT ·ifftDIT48_avx512_3(SB), NOSPLIT, $0-56 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + 
VPXOR Y6, Y10, Y10 + + // LEO_MULADD_256 + VPANDD Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y3 + + // LEO_MULADD_256 + VPANDD Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y4 + + // LEO_MULADD_256 + VPANDD Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y5 + + // LEO_MULADD_256 + VPANDD Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y6 + VMOVDQU Y3, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx512_3(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + + // LEO_MULADD_256 + VPANDD Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y7 + + // LEO_MULADD_256 + VPANDD Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y8 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56 MOVQ t01+32(FP), AX - VBROADCASTI128 16(AX), Y0 - MOVQ t23+40(FP), CX - VBROADCASTI128 16(CX), Y1 - MOVQ t02+48(FP), DX - VBROADCASTI128 (DX), Y2 - VBROADCASTI128 16(DX), Y3 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - MOVQ $0x0000000f, BX - MOVQ BX, X4 + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 VPBROADCASTB X4, Y4 loop: - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y10 - 
VMOVDQU (R9), Y7 - VMOVDQU 32(R9), Y8 - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y12 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 // LEO_MULADD_256 - VPAND Y9, Y4, Y13 - VPSRLQ $0x04, Y9, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y5) + VPAND Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 // LEO_MULADD_256 VPAND Y10, Y4, Y13 @@ -68459,89 +69394,43 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y6) + XOR3WAY( $0x00, Y13, Y14, Y9) // LEO_MULADD_256 - VPAND Y11, Y4, Y13 - VPSRLQ $0x04, Y11, Y14 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y7) - - // LEO_MULADD_256 - VPAND Y12, Y4, Y13 - VPSRLQ $0x04, Y12, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y8) - VPXOR Y5, Y9, Y9 - VPXOR Y7, Y11, Y11 - VPXOR Y6, Y10, Y10 - VPXOR Y8, Y12, Y12 - VBROADCASTI128 (AX), Y13 - - // LEO_MULADD_256 - VPAND Y7, Y4, Y14 - VPSRLQ $0x04, Y7, Y15 - VPSHUFB Y14, Y13, Y14 - VPAND Y15, Y4, Y15 - VPSHUFB Y15, Y0, Y15 - XOR3WAY( $0x00, Y14, Y15, Y5) - - // LEO_MULADD_256 - VPAND Y8, Y4, Y14 - VPSRLQ $0x04, Y8, Y15 - VPSHUFB Y14, Y13, Y14 - VPAND Y15, Y4, Y15 - VPSHUFB Y15, Y0, Y15 - XOR3WAY( $0x00, Y14, Y15, Y6) - VPXOR Y7, Y5, Y7 - VPXOR Y8, Y6, Y8 - VBROADCASTI128 (CX), Y13 - - // LEO_MULADD_256 - VPAND Y11, Y4, Y14 - VPSRLQ $0x04, Y11, Y15 - VPSHUFB Y14, Y13, Y14 - VPAND Y15, Y4, Y15 - VPSHUFB Y15, Y1, Y15 - XOR3WAY( $0x00, Y14, Y15, Y9) - - // LEO_MULADD_256 - VPAND Y12, Y4, Y14 - VPSRLQ $0x04, Y12, Y15 - VPSHUFB Y14, Y13, Y14 - VPAND Y15, Y4, Y15 - VPSHUFB Y15, Y1, Y15 - XOR3WAY( $0x00, Y14, Y15, Y10) - VPXOR Y9, Y11, Y11 - VPXOR Y10, Y12, Y12 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) + XOR3WAY( $0x00, Y13, Y14, Y11) + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y7, (R9) - VMOVDQU Y8, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y9, (R10) - VMOVDQU Y10, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y11, (DX) - VMOVDQU Y12, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX JA loop VZEROUPPER RET -// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56 +TEXT ·fftDIT48_avx2_4(SB), NOSPLIT, $0-56 MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 - MOVQ t23+40(FP), AX + MOVQ t02+48(FP), AX VBROADCASTI128 (AX), Y2 VBROADCASTI128 16(AX), Y3 MOVQ dist+24(FP), AX @@ -68568,28 +69457,22 @@ loop: VMOVDQU 32(DI), Y8 VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y12 - VPXOR Y5, Y9, Y9 - VPXOR Y7, Y11, Y11 - VPXOR Y6, Y10, Y10 - VPXOR Y8, Y12, Y12 // 
LEO_MULADD_256 - VPAND Y7, Y4, Y13 - VPSRLQ $0x04, Y7, Y14 - VPSHUFB Y13, Y0, Y13 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y1, Y14 + VPSHUFB Y14, Y3, Y14 XOR3WAY( $0x00, Y13, Y14, Y5) // LEO_MULADD_256 - VPAND Y8, Y4, Y13 - VPSRLQ $0x04, Y8, Y14 - VPSHUFB Y13, Y0, Y13 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y1, Y14 + VPSHUFB Y14, Y3, Y14 XOR3WAY( $0x00, Y13, Y14, Y6) - VPXOR Y7, Y5, Y7 - VPXOR Y8, Y6, Y8 // LEO_MULADD_256 VPAND Y11, Y4, Y13 @@ -68597,7 +69480,7 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y9) + XOR3WAY( $0x00, Y13, Y14, Y7) // LEO_MULADD_256 VPAND Y12, Y4, Y13 @@ -68605,7 +69488,29 @@ loop: VPSHUFB Y13, Y2, Y13 VPAND Y14, Y4, Y14 VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y10) + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 VPXOR Y9, Y11, Y11 VPXOR Y10, Y12, Y12 VMOVDQU Y5, (SI) @@ -68625,10 +69530,100 @@ loop: VZEROUPPER RET -// func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_avx512_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56 +TEXT ·ifftDIT48_avx512_4(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + + // LEO_MULADD_256 + VPANDD Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPANDD Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + VPTERNLOGD $0x96, Y9, Y10, Y5 + + // LEO_MULADD_256 + VPANDD Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPANDD Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + VPTERNLOGD $0x96, Y9, Y10, Y7 + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 + + // LEO_MULADD_256 + VPANDD Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y9 + + // LEO_MULADD_256 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y11 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_4(work [][]byte, 
dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx512_4(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 MOVQ t02+48(FP), AX @@ -68660,81 +69655,152 @@ loop: VMOVDQU 32(AX), Y12 // LEO_MULADD_256 - VPAND Y9, Y4, Y13 - VPSRLQ $0x04, Y9, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y5) + VPANDD Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y5 // LEO_MULADD_256 - VPAND Y10, Y4, Y13 - VPSRLQ $0x04, Y10, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y6) + VPANDD Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y6 // LEO_MULADD_256 - VPAND Y11, Y4, Y13 - VPSRLQ $0x04, Y11, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y7) + VPANDD Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y7 // LEO_MULADD_256 - VPAND Y12, Y4, Y13 - VPSRLQ $0x04, Y12, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y8) - VPXOR Y5, Y9, Y9 - VPXOR Y7, Y11, Y11 - VPXOR Y6, Y10, Y10 - VPXOR Y8, Y12, Y12 - VPXOR Y7, Y5, Y7 - VPXOR Y8, Y6, Y8 + VPANDD Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPANDD Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y5 + + // LEO_MULADD_256 + VPANDD Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPANDD Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + VPTERNLOGD $0x96, Y13, Y14, Y6 + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 // LEO_MULADD_256 - VPAND Y11, Y4, Y13 - VPSRLQ $0x04, Y11, Y14 - VPSHUFB Y13, Y0, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y1, Y14 - XOR3WAY( $0x00, Y13, Y14, Y9) + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) // LEO_MULADD_256 - VPAND 
Y12, Y4, Y13 - VPSRLQ $0x04, Y12, Y14 - VPSHUFB Y13, Y0, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y1, Y14 - XOR3WAY( $0x00, Y13, Y14, Y10) - VPXOR Y9, Y11, Y11 - VPXOR Y10, Y12, Y12 - VMOVDQU Y5, (SI) - VMOVDQU Y6, 32(SI) + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y5, 32(SI) ADDQ $0x40, SI - VMOVDQU Y7, (DI) - VMOVDQU Y8, 32(DI) + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) ADDQ $0x40, DI - VMOVDQU Y9, (R8) - VMOVDQU Y10, 32(R8) + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y11, (AX) - VMOVDQU Y12, 32(AX) + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX JA loop VZEROUPPER RET -// func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_3(SB), NOSPLIT, $0-56 - MOVQ t23+40(FP), AX +TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 MOVQ dist+24(FP), AX @@ -68765,24 +69831,24 @@ loop: VPXOR Y5, Y9, Y9 VPXOR Y4, Y8, Y8 VPXOR Y6, Y10, Y10 - VPXOR Y5, Y3, Y5 - VPXOR Y6, Y4, Y6 // LEO_MULADD_256 - VPAND Y9, Y2, Y11 - VPSRLQ $0x04, Y9, Y12 + VPAND Y5, Y2, Y11 + VPSRLQ $0x04, Y5, Y12 VPSHUFB Y11, Y0, Y11 VPAND Y12, Y2, Y12 VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + XOR3WAY( $0x00, Y11, Y12, Y3) // LEO_MULADD_256 - VPAND Y10, Y2, Y11 - VPSRLQ $0x04, Y10, Y12 + VPAND Y6, Y2, Y11 + VPSRLQ $0x04, Y6, Y12 VPSHUFB Y11, Y0, Y11 VPAND Y12, Y2, Y12 VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 VPXOR Y7, Y9, Y9 VPXOR Y8, Y10, Y10 VMOVDQU Y3, (SI) @@ -68802,15 +69868,12 @@ loop: VZEROUPPER RET -// func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func ifftDIT48_avx512_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_4(SB), NOSPLIT, $0-56 - MOVQ t01+32(FP), AX +TEXT ·ifftDIT48_avx512_5(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 - MOVQ t02+48(FP), AX - VBROADCASTI128 (AX), Y2 - VBROADCASTI128 16(AX), Y3 MOVQ dist+24(FP), AX MOVQ work_base+0(FP), CX MOVQ 8(CX), DX @@ -68823,94 +69886,62 @@ TEXT ·fftDIT48_avx2_4(SB), NOSPLIT, $0-56 ADDQ AX, BX MOVQ (CX)(BX*1), AX MOVQ $0x0000000f, CX - MOVQ CX, X4 - VPBROADCASTB X4, Y4 + MOVQ CX, X2 + VPBROADCASTB X2, Y2 loop: - VMOVDQU (SI), Y5 - VMOVDQU 32(SI), Y6 - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y10 - VMOVDQU (DI), Y7 - VMOVDQU 32(DI), Y8 - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y12 - - // LEO_MULADD_256 - VPAND Y9, Y4, Y13 - VPSRLQ $0x04, Y9, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y5) - - // LEO_MULADD_256 - VPAND Y10, Y4, Y13 - VPSRLQ $0x04, Y10, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y6) - - // LEO_MULADD_256 - VPAND Y11, Y4, Y13 - VPSRLQ $0x04, Y11, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, Y14 - XOR3WAY( $0x00, Y13, Y14, Y7) - - // LEO_MULADD_256 - VPAND Y12, Y4, Y13 - VPSRLQ $0x04, Y12, Y14 - VPSHUFB Y13, Y2, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y3, 
Y14 - XOR3WAY( $0x00, Y13, Y14, Y8) - VPXOR Y5, Y9, Y9 - VPXOR Y7, Y11, Y11 - VPXOR Y6, Y10, Y10 - VPXOR Y8, Y12, Y12 + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 // LEO_MULADD_256 - VPAND Y7, Y4, Y13 - VPSRLQ $0x04, Y7, Y14 - VPSHUFB Y13, Y0, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y1, Y14 - XOR3WAY( $0x00, Y13, Y14, Y5) + VPANDD Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y7 // LEO_MULADD_256 - VPAND Y8, Y4, Y13 - VPSRLQ $0x04, Y8, Y14 - VPSHUFB Y13, Y0, Y13 - VPAND Y14, Y4, Y14 - VPSHUFB Y14, Y1, Y14 - XOR3WAY( $0x00, Y13, Y14, Y6) - VPXOR Y7, Y5, Y7 - VPXOR Y8, Y6, Y8 - VPXOR Y9, Y11, Y11 - VPXOR Y10, Y12, Y12 - VMOVDQU Y5, (SI) - VMOVDQU Y6, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y7, (DI) - VMOVDQU Y8, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y9, (R8) - VMOVDQU Y10, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y11, (AX) - VMOVDQU Y12, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JA loop + VPANDD Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y9 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop VZEROUPPER RET -// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// func fftDIT48_avx512_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56 +TEXT ·fftDIT48_avx512_5(SB), NOSPLIT, $0-56 MOVQ t01+32(FP), AX VBROADCASTI128 (AX), Y0 VBROADCASTI128 16(AX), Y1 @@ -68944,34 +69975,105 @@ loop: VPXOR Y6, Y10, Y10 // LEO_MULADD_256 - VPAND Y5, Y2, Y11 - VPSRLQ $0x04, Y5, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPANDD Y5, Y2, Y11 + VPSRLQ $0x04, Y5, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y3 // LEO_MULADD_256 - VPAND Y6, Y2, Y11 - VPSRLQ $0x04, Y6, Y12 - VPSHUFB Y11, Y0, Y11 - VPAND Y12, Y2, Y12 - VPSHUFB Y12, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) - VPXOR Y5, Y3, Y5 - VPXOR Y6, Y4, Y6 - VPXOR Y7, Y9, Y9 - VPXOR Y8, Y10, Y10 + VPANDD Y6, Y2, Y11 + VPSRLQ $0x04, Y6, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y4 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_6(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + 
MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + + // LEO_MULADD_256 + VPAND Y4, Y2, Y7 + VPSRLQ $0x04, Y4, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y7 + VPSRLQ $0x04, Y6, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 VMOVDQU Y3, (SI) - VMOVDQU Y4, 32(SI) + VMOVDQU Y5, 32(SI) ADDQ $0x40, SI - VMOVDQU Y5, (DI) + VMOVDQU Y4, (DI) VMOVDQU Y6, 32(DI) ADDQ $0x40, DI VMOVDQU Y7, (R8) - VMOVDQU Y8, 32(R8) + VMOVDQU Y9, 32(R8) ADDQ $0x40, R8 - VMOVDQU Y9, (AX) + VMOVDQU Y8, (AX) VMOVDQU Y10, 32(AX) ADDQ $0x40, AX SUBQ $0x40, DX @@ -69066,6 +70168,216 @@ loop: VZEROUPPER RET +// func ifftDIT48_avx512_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx512_6(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + + // LEO_MULADD_256 + VPANDD Y4, Y2, Y7 + VPSRLQ $0x04, Y4, Y8 + VPSHUFB Y7, Y0, Y7 + VPANDD Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + VPTERNLOGD $0x96, Y7, Y8, Y3 + + // LEO_MULADD_256 + VPANDD Y6, Y2, Y7 + VPSRLQ $0x04, Y6, Y8 + VPSHUFB Y7, Y0, Y7 + VPANDD Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + VPTERNLOGD $0x96, Y7, Y8, Y5 + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx512_6(SB), NOSPLIT, $0-56 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + + // LEO_MULADD_256 + VPANDD Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y3 + + // LEO_MULADD_256 + VPANDD Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + 
VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y4 + + // LEO_MULADD_256 + VPANDD Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y5 + + // LEO_MULADD_256 + VPANDD Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPANDD Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + VPTERNLOGD $0x96, Y11, Y12, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·ifftDIT48_avx2_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (SI), Y0 + VMOVDQU (DI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU 32(DI), Y3 + VPXOR Y1, Y0, Y1 + VPXOR Y3, Y2, Y3 + VMOVDQU (R8), Y4 + VMOVDQU (AX), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y4, Y5, Y5 + VPXOR Y6, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y1, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + // func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) // Requires: AVX, AVX2, SSE2 TEXT ·fftDIT48_avx2_7(SB), NOSPLIT, $0-56 @@ -69117,3 +70429,107 @@ loop: JA loop VZEROUPPER RET + +// func ifftDIT48_avx512_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·ifftDIT48_avx512_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (SI), Y0 + VMOVDQU (DI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU 32(DI), Y3 + VPXOR Y1, Y0, Y1 + VPXOR Y3, Y2, Y3 + VMOVDQU (R8), Y4 + VMOVDQU (AX), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y4, Y5, Y5 + VPXOR Y6, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y1, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx512_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·fftDIT48_avx512_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ 
(CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y2, Y6, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y3, Y7, Y7 + VPXOR Y2, Y0, Y2 + VPXOR Y3, Y1, Y3 + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET
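
Editor's note: every LEO_MULADD_256 block in the generated routines above is the same GF(2^8) multiply-accumulate, unrolled over 64-byte chunks. The low and high nibble of each source byte index two 16-entry product tables (the *[2*16]uint8 parameters), and both lookups are XORed into the destination — via XOR3WAY on the AVX2 paths and VPTERNLOGD $0x96 on the AVX512 paths. Below is a minimal scalar sketch in Go, assuming only the table layout visible in the function signatures; the package name, leoMulAdd8, and its byte loop are illustrative and are not part of the generated file.

package gf8sketch // hypothetical package, for illustration only

// leoMulAdd8 mirrors one LEO_MULADD_256 step per byte:
// dst[i] ^= lo[src[i]&0x0f] ^ hi[src[i]>>4].
func leoMulAdd8(dst, src []byte, table *[2 * 16]uint8) {
	lo := table[:16]  // products keyed by the low nibble (VPSHUFB against Y0/Y2)
	hi := table[16:]  // products keyed by the high nibble (VPSHUFB against Y1/Y3)
	for i, v := range src {
		dst[i] ^= lo[v&0x0f] ^ hi[v>>4]
	}
}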