-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: Expose remaining AVX512-VBMI2 hardware instructions #88946
Comments
Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics Issue Details. Background and motivation: The AVX512-VBMI PermuteVar64x8(x2) intrinsics are already present, and the AVX512-VBMI2 Compress & Expand intrinsics have been approved and will soon be added as part of the new vector mask proposal. There is little reason, if any, not to add the remaining instructions from the aforementioned instruction sets.
API Proposal
namespace System.Runtime.Intrinsics.X86
{
/// <summary>
/// Proposed API surface for the AVX512-VBMI multishift instruction (VPMULTISHIFTQB),
/// layered on top of <see cref="Avx512BW"/>.
/// </summary>
// NOTE(review): every member body below refers to itself; presumably the [Intrinsic]
// attribute lets the JIT substitute the real implementation — confirm.
[Intrinsic]
public abstract class Avx512Vbmi : Avx512BW
{
/// <summary>Reports whether this instruction set is supported.</summary>
public static new bool IsSupported { get => IsSupported; }
/// <summary>64-bit-only members; the proposal adds none beyond the support check.</summary>
[Intrinsic]
public new abstract class X64 : Avx512BW.X64
{
/// <summary>Reports whether the 64-bit variant of this instruction set is supported.</summary>
public static new bool IsSupported { get => IsSupported; }
}
/// <summary>128-bit and 256-bit (vector length) overloads of the instructions in this set.</summary>
[Intrinsic]
public new abstract class VL : Avx512BW.VL
{
/// <summary>Reports whether the vector-length variant of this instruction set is supported.</summary>
public static new bool IsSupported { get => IsSupported; }
/// <summary>
/// __m128i _mm_multishift_epi64_epi8 (__m128i a, __m128i b)
/// VPMULTISHIFTQB xmm, xmm, xmm
/// </summary>
public static Vector128<byte> MultipleShift(Vector128<byte> control, Vector128<byte> source) => MultipleShift(control, source);
/// <summary>
/// __m256i _mm256_multishift_epi64_epi8 (__m256i a, __m256i b)
/// VPMULTISHIFTQB ymm, ymm, ymm
/// </summary>
public static Vector256<byte> MultipleShift(Vector256<byte> control, Vector256<byte> source) => MultipleShift(control, source);
}
/// <summary>
/// __m512i _mm512_multishift_epi64_epi8 (__m512i a, __m512i b)
/// VPMULTISHIFTQB zmm, zmm, zmm
/// </summary>
public static Vector512<byte> MultipleShift(Vector512<byte> control, Vector512<byte> source) => MultipleShift(control, source);
}
/// <summary>
/// Proposed API surface for the AVX512-VBMI2 concatenate-and-shift instructions
/// (VPSHLD*/VPSHRD* with an immediate count, VPSHLDV*/VPSHRDV* with a per-element count),
/// layered on top of <see cref="Avx512BW"/>.
/// </summary>
// NOTE(review): every member body below refers to itself; presumably the [Intrinsic]
// attribute lets the JIT substitute the real implementation — confirm.
// NOTE(review): Intel documents the shrdi/shrdv intrinsics as concatenating b (high half) with
// a (low half), the opposite of shldi/shldv; confirm that the (upper, lower) parameter order of
// the ConcatenateShiftRightLower* overloads maps onto the instruction operands as intended.
[Intrinsic]
public abstract class Avx512Vbmi2 : Avx512BW
{
/// <summary>Reports whether this instruction set is supported.</summary>
public static new bool IsSupported { get => IsSupported; }
/// <summary>64-bit-only members; the proposal adds none beyond the support check.</summary>
[Intrinsic]
public new abstract class X64 : Avx512BW.X64
{
/// <summary>Reports whether the 64-bit variant of this instruction set is supported.</summary>
public static new bool IsSupported { get => IsSupported; }
}
/// <summary>128-bit and 256-bit (vector length) overloads of the instructions in this set.</summary>
[Intrinsic]
public new abstract class VL : Avx512BW.VL
{
/// <summary>Reports whether the vector-length variant of this instruction set is supported.</summary>
public static new bool IsSupported { get => IsSupported; }
/// <summary>
/// __m128i _mm_shldi_epi16 (__m128i a, __m128i b, int imm8)
/// VPSHLDW xmm, xmm, xmm, imm8
/// </summary>
public static Vector128<ushort> ConcatenateShiftLeftUpper(Vector128<ushort> upper, Vector128<ushort> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m256i _mm256_shldi_epi16 (__m256i a, __m256i b, int imm8)
/// VPSHLDW ymm, ymm, ymm, imm8
/// </summary>
public static Vector256<ushort> ConcatenateShiftLeftUpper(Vector256<ushort> upper, Vector256<ushort> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m128i _mm_shldi_epi32 (__m128i a, __m128i b, int imm8)
/// VPSHLDD xmm, xmm, xmm, imm8
/// </summary>
public static Vector128<uint> ConcatenateShiftLeftUpper(Vector128<uint> upper, Vector128<uint> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m256i _mm256_shldi_epi32 (__m256i a, __m256i b, int imm8)
/// VPSHLDD ymm, ymm, ymm, imm8
/// </summary>
public static Vector256<uint> ConcatenateShiftLeftUpper(Vector256<uint> upper, Vector256<uint> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m128i _mm_shldi_epi64 (__m128i a, __m128i b, int imm8)
/// VPSHLDQ xmm, xmm, xmm, imm8
/// </summary>
public static Vector128<ulong> ConcatenateShiftLeftUpper(Vector128<ulong> upper, Vector128<ulong> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m256i _mm256_shldi_epi64 (__m256i a, __m256i b, int imm8)
/// VPSHLDQ ymm, ymm, ymm, imm8
/// </summary>
public static Vector256<ulong> ConcatenateShiftLeftUpper(Vector256<ulong> upper, Vector256<ulong> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m128i _mm_shrdi_epi16 (__m128i a, __m128i b, int imm8)
/// VPSHRDW xmm, xmm, xmm, imm8
/// </summary>
public static Vector128<ushort> ConcatenateShiftRightLower(Vector128<ushort> upper, Vector128<ushort> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m256i _mm256_shrdi_epi16 (__m256i a, __m256i b, int imm8)
/// VPSHRDW ymm, ymm, ymm, imm8
/// </summary>
public static Vector256<ushort> ConcatenateShiftRightLower(Vector256<ushort> upper, Vector256<ushort> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m128i _mm_shrdi_epi32 (__m128i a, __m128i b, int imm8)
/// VPSHRDD xmm, xmm, xmm, imm8
/// </summary>
public static Vector128<uint> ConcatenateShiftRightLower(Vector128<uint> upper, Vector128<uint> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m256i _mm256_shrdi_epi32 (__m256i a, __m256i b, int imm8)
/// VPSHRDD ymm, ymm, ymm, imm8
/// </summary>
public static Vector256<uint> ConcatenateShiftRightLower(Vector256<uint> upper, Vector256<uint> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m128i _mm_shrdi_epi64 (__m128i a, __m128i b, int imm8)
/// VPSHRDQ xmm, xmm, xmm, imm8
/// </summary>
public static Vector128<ulong> ConcatenateShiftRightLower(Vector128<ulong> upper, Vector128<ulong> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m256i _mm256_shrdi_epi64 (__m256i a, __m256i b, int imm8)
/// VPSHRDQ ymm, ymm, ymm, imm8
/// </summary>
public static Vector256<ulong> ConcatenateShiftRightLower(Vector256<ulong> upper, Vector256<ulong> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m128i _mm_shldv_epi16 (__m128i a, __m128i b, __m128i c)
/// VPSHLDVW xmm, xmm, xmm
/// </summary>
public static Vector128<ushort> ConcatenateShiftLeftUpperVariable(Vector128<ushort> upper, Vector128<ushort> lower, Vector128<ushort> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m256i _mm256_shldv_epi16 (__m256i a, __m256i b, __m256i c)
/// VPSHLDVW ymm, ymm, ymm
/// </summary>
public static Vector256<ushort> ConcatenateShiftLeftUpperVariable(Vector256<ushort> upper, Vector256<ushort> lower, Vector256<ushort> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m128i _mm_shldv_epi32 (__m128i a, __m128i b, __m128i c)
/// VPSHLDVD xmm, xmm, xmm
/// </summary>
public static Vector128<uint> ConcatenateShiftLeftUpperVariable(Vector128<uint> upper, Vector128<uint> lower, Vector128<uint> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m256i _mm256_shldv_epi32 (__m256i a, __m256i b, __m256i c)
/// VPSHLDVD ymm, ymm, ymm
/// </summary>
public static Vector256<uint> ConcatenateShiftLeftUpperVariable(Vector256<uint> upper, Vector256<uint> lower, Vector256<uint> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m128i _mm_shldv_epi64 (__m128i a, __m128i b, __m128i c)
/// VPSHLDVQ xmm, xmm, xmm
/// </summary>
public static Vector128<ulong> ConcatenateShiftLeftUpperVariable(Vector128<ulong> upper, Vector128<ulong> lower, Vector128<ulong> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m256i _mm256_shldv_epi64 (__m256i a, __m256i b, __m256i c)
/// VPSHLDVQ ymm, ymm, ymm
/// </summary>
public static Vector256<ulong> ConcatenateShiftLeftUpperVariable(Vector256<ulong> upper, Vector256<ulong> lower, Vector256<ulong> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m128i _mm_shrdv_epi16 (__m128i a, __m128i b, __m128i c)
/// VPSHRDVW xmm, xmm, xmm
/// </summary>
public static Vector128<ushort> ConcatenateShiftRightLowerVariable(Vector128<ushort> upper, Vector128<ushort> lower, Vector128<ushort> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m256i _mm256_shrdv_epi16 (__m256i a, __m256i b, __m256i c)
/// VPSHRDVW ymm, ymm, ymm
/// </summary>
public static Vector256<ushort> ConcatenateShiftRightLowerVariable(Vector256<ushort> upper, Vector256<ushort> lower, Vector256<ushort> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m128i _mm_shrdv_epi32 (__m128i a, __m128i b, __m128i c)
/// VPSHRDVD xmm, xmm, xmm
/// </summary>
public static Vector128<uint> ConcatenateShiftRightLowerVariable(Vector128<uint> upper, Vector128<uint> lower, Vector128<uint> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m256i _mm256_shrdv_epi32 (__m256i a, __m256i b, __m256i c)
/// VPSHRDVD ymm, ymm, ymm
/// </summary>
public static Vector256<uint> ConcatenateShiftRightLowerVariable(Vector256<uint> upper, Vector256<uint> lower, Vector256<uint> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m128i _mm_shrdv_epi64 (__m128i a, __m128i b, __m128i c)
/// VPSHRDVQ xmm, xmm, xmm
/// </summary>
public static Vector128<ulong> ConcatenateShiftRightLowerVariable(Vector128<ulong> upper, Vector128<ulong> lower, Vector128<ulong> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m256i _mm256_shrdv_epi64 (__m256i a, __m256i b, __m256i c)
/// VPSHRDVQ ymm, ymm, ymm
/// </summary>
public static Vector256<ulong> ConcatenateShiftRightLowerVariable(Vector256<ulong> upper, Vector256<ulong> lower, Vector256<ulong> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
}
/// <summary>
/// __m512i _mm512_shldi_epi16 (__m512i a, __m512i b, int imm8)
/// VPSHLDW zmm, zmm, zmm, imm8
/// </summary>
public static Vector512<ushort> ConcatenateShiftLeftUpper(Vector512<ushort> upper, Vector512<ushort> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m512i _mm512_shldi_epi32 (__m512i a, __m512i b, int imm8)
/// VPSHLDD zmm, zmm, zmm, imm8
/// </summary>
public static Vector512<uint> ConcatenateShiftLeftUpper(Vector512<uint> upper, Vector512<uint> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m512i _mm512_shldi_epi64 (__m512i a, __m512i b, int imm8)
/// VPSHLDQ zmm, zmm, zmm, imm8
/// </summary>
public static Vector512<ulong> ConcatenateShiftLeftUpper(Vector512<ulong> upper, Vector512<ulong> lower, [ConstantExpected] byte count) => ConcatenateShiftLeftUpper(upper, lower, count);
/// <summary>
/// __m512i _mm512_shrdi_epi16 (__m512i a, __m512i b, int imm8)
/// VPSHRDW zmm, zmm, zmm, imm8
/// </summary>
public static Vector512<ushort> ConcatenateShiftRightLower(Vector512<ushort> upper, Vector512<ushort> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m512i _mm512_shrdi_epi32 (__m512i a, __m512i b, int imm8)
/// VPSHRDD zmm, zmm, zmm, imm8
/// </summary>
public static Vector512<uint> ConcatenateShiftRightLower(Vector512<uint> upper, Vector512<uint> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m512i _mm512_shrdi_epi64 (__m512i a, __m512i b, int imm8)
/// VPSHRDQ zmm, zmm, zmm, imm8
/// </summary>
public static Vector512<ulong> ConcatenateShiftRightLower(Vector512<ulong> upper, Vector512<ulong> lower, [ConstantExpected] byte count) => ConcatenateShiftRightLower(upper, lower, count);
/// <summary>
/// __m512i _mm512_shldv_epi16 (__m512i a, __m512i b, __m512i c)
/// VPSHLDVW zmm, zmm, zmm
/// </summary>
public static Vector512<ushort> ConcatenateShiftLeftUpperVariable(Vector512<ushort> upper, Vector512<ushort> lower, Vector512<ushort> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m512i _mm512_shldv_epi32 (__m512i a, __m512i b, __m512i c)
/// VPSHLDVD zmm, zmm, zmm
/// </summary>
public static Vector512<uint> ConcatenateShiftLeftUpperVariable(Vector512<uint> upper, Vector512<uint> lower, Vector512<uint> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m512i _mm512_shldv_epi64 (__m512i a, __m512i b, __m512i c)
/// VPSHLDVQ zmm, zmm, zmm
/// </summary>
public static Vector512<ulong> ConcatenateShiftLeftUpperVariable(Vector512<ulong> upper, Vector512<ulong> lower, Vector512<ulong> count) => ConcatenateShiftLeftUpperVariable(upper, lower, count);
/// <summary>
/// __m512i _mm512_shrdv_epi16 (__m512i a, __m512i b, __m512i c)
/// VPSHRDVW zmm, zmm, zmm
/// </summary>
public static Vector512<ushort> ConcatenateShiftRightLowerVariable(Vector512<ushort> upper, Vector512<ushort> lower, Vector512<ushort> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m512i _mm512_shrdv_epi32 (__m512i a, __m512i b, __m512i c)
/// VPSHRDVD zmm, zmm, zmm
/// </summary>
public static Vector512<uint> ConcatenateShiftRightLowerVariable(Vector512<uint> upper, Vector512<uint> lower, Vector512<uint> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
/// <summary>
/// __m512i _mm512_shrdv_epi64 (__m512i a, __m512i b, __m512i c)
/// VPSHRDVQ zmm, zmm, zmm
/// </summary>
public static Vector512<ulong> ConcatenateShiftRightLowerVariable(Vector512<ulong> upper, Vector512<ulong> lower, Vector512<ulong> count) => ConcatenateShiftRightLowerVariable(upper, lower, count);
}
}
API Usage
// Avx512Vbmi2.ConcatenateShift(LeftUpper/RightLower)(...) example
Vector512<ushort> upper_data = GetData(), lower_data = GetData();
// Each 16-bit result element holds the low 8 bits of the matching upper_data element in its high byte
// and the high 8 bits of the matching lower_data element in its low byte.
// For a general count, each element is (upper << count) | (lower >> (16 - count)): the low (16 - count) bits
// of the upper_data element followed by the high count bits of the lower_data element.
var result = Avx512Vbmi2.ConcatenateShiftLeftUpper(upper_data, lower_data, 8);
// Avx512Vbmi.MultipleShift(...) example
// MultipleShift takes Vector512<byte> operands, so the data and the broadcast 64-bit control are reinterpreted as bytes.
Vector512<byte> d = GetData().AsByte(), control = Vector512.Create(0x0C_1C_2C_3C_04_14_24_34UL).AsByte();
// The control bytes (least significant first) are the bit offsets 52, 36, 20, 4, 60, 44, 28, 12,
// so every 64-bit part of the result has this structure (lowest byte first):
// (d[59:52], d[43:36], d[27:20], d[11:4], (d[3:0] << 4) | d[63:60], d[51:44], d[35:28], d[19:12])
var shifted = Avx512Vbmi.MultipleShift(control, d);
Alternative Designs
N/A
Risks
N/A
|
|
I have deleted the duplicated |
Need to give some consideration around the suggested names, but this will be a .NET 9 change regardless; so marking with needs-further-triage for the moment. |
Note that AMD Zen 4 and later, as well as Intel Ice Lake and later, support VBMI2. As motivation, this would make it possible to support fast Unicode transcoding functions in C#, like what is done in the simdutf library (which is part of the Node.js runtime). See https://arxiv.org/pdf/2212.05098.pdf cc @EgorBo |
Background and motivation
The AVX512-VBMI2 Compress & Expand intrinsics have been approved and will soon be added as part of the new vector mask proposal. There is little reason, if any, not to add the remaining instructions from the AVX512-VBMI2 instruction set.
Notice:
lower << count
and upper >> count
, which produce the same result and the other operand is unused.
API Proposal
API Usage
Alternative Designs
N/A
Risks
N/A
The text was updated successfully, but these errors were encountered: