Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Add support for byte/sbyte SIMD multiplication on XArch #86811

Merged
merged 23 commits into from
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
5c6fdc1
Add support for byte/sbyte SIMD multiply on XArch
BladeWise May 24, 2023
b065946
Fix formatting
BladeWise May 26, 2023
6846733
Replace gtCloneExpr with fgMakeMultiUse (review)
BladeWise May 26, 2023
e6193d8
Merge branch 'main' into feature/simd-byte-multiply
BladeWise May 27, 2023
1fa1b6c
Merge branch 'main' into feature/simd-byte-multiply
BladeWise May 27, 2023
516b94f
Attempt to fix codepath for AVX512
BladeWise May 29, 2023
c621093
Merge branch 'main' into feature/simd-byte-multiply
BladeWise May 29, 2023
c5e5911
Merge branch 'main' into feature/simd-byte-multiply
BladeWise May 30, 2023
a5162a7
Merge branch 'main' into feature/simd-byte-multiply
BladeWise May 30, 2023
3dfc0b9
Merge branch 'main' into feature/simd-byte-multiply
BladeWise May 31, 2023
339cb2f
Merge branch 'main' into feature/simd-byte-multiply
BladeWise Jun 5, 2023
c2b5e2e
Merge branch 'main' into feature/simd-byte-multiply
BladeWise Jun 13, 2023
9471e8d
Merge remote-tracking branch 'upstream/main' into feature/simd-byte-m…
BladeWise Jul 10, 2023
97ca828
Merge branch 'main' into feature/simd-byte-multiply
BladeWise Sep 30, 2023
43120bf
Merge remote-tracking branch 'upstream/main' into feature/simd-byte-m…
BladeWise Sep 30, 2023
56680db
Merge remote-tracking branch 'upstream/main' into feature/simd-byte-m…
BladeWise Sep 30, 2023
3ac62c6
Fix renamed method to check for SIMD 512 support
BladeWise Sep 30, 2023
024a87c
Avoid to re-use variables for newly created nodes
BladeWise Oct 4, 2023
81b1e07
Merge remote-tracking branch 'upstream/main' into feature/simd-byte-m…
BladeWise Oct 4, 2023
22912c1
Merge remote-tracking branch 'upstream/main' into feature/simd-byte-m…
BladeWise Oct 6, 2023
b1a9686
Add detailed comments
BladeWise Oct 6, 2023
74bbff9
Merge remote-tracking branch 'upstream/main' into feature/simd-byte-m…
BladeWise Oct 6, 2023
cdde321
Fix formatting
BladeWise Oct 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 188 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20197,6 +20197,194 @@ GenTree* Compiler::gtNewSimdBinOpNode(

switch (simdBaseType)
{
case TYP_BYTE:
case TYP_UBYTE:
Comment on lines +20200 to +20201
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like there is some precedent for inserting additional IR nodes in this function, but this is quite a lot more than the other cases, and these new cases return without falling through like the others. Is this the right place for this code? Or should this be a separate function that is called before getting here?

{
assert((simdSize != 64) || IsBaselineVector512IsaSupportedDebugOnly());

CorInfoType widenedSimdBaseJitType;
NamedIntrinsic widenIntrinsic;
NamedIntrinsic narrowIntrinsic;
var_types widenedType;
unsigned widenedSimdSize;

if (simdSize == 32 && IsBaselineVector512IsaSupportedOpportunistically())
{
// Input is SIMD32 [U]Byte and AVX512BW is supported:
// - Widen inputs as SIMD64 [U]Short
// - Multiply widened inputs (SIMD64 [U]Short) as widened product (SIMD64 [U]Short)
// - Narrow widened product (SIMD64 [U]Short) as SIMD32 [U]Byte
if (simdBaseType == TYP_BYTE)
{
widenedSimdBaseJitType = CORINFO_TYPE_SHORT;
widenIntrinsic = NI_AVX512BW_ConvertToVector512Int16;
narrowIntrinsic = NI_AVX512BW_ConvertToVector256SByte;
}
else
{
widenedSimdBaseJitType = CORINFO_TYPE_USHORT;
widenIntrinsic = NI_AVX512BW_ConvertToVector512UInt16;
narrowIntrinsic = NI_AVX512BW_ConvertToVector256Byte;
}

widenedType = TYP_SIMD64;
widenedSimdSize = 64;

// Vector512<ushort> widenedOp1 = Avx512BW.ConvertToVector512UInt16(op1)
GenTree* widenedOp1 = gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic,
simdBaseJitType, widenedSimdSize);

// Vector512<ushort> widenedOp2 = Avx512BW.ConvertToVector512UInt16(op2)
GenTree* widenedOp2 = gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic,
simdBaseJitType, widenedSimdSize);

// Vector512<ushort> widenedProduct = widenedOp1 * widenedOp2;
GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2,
widenedSimdBaseJitType, widenedSimdSize);

// Vector256<byte> product = Avx512BW.ConvertToVector256Byte(widenedProduct)
return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic, widenedSimdBaseJitType,
widenedSimdSize);
}
else if (simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
if (IsBaselineVector512IsaSupportedOpportunistically())
{
// Input is SIMD16 [U]Byte and AVX512BW_VL is supported:
// - Widen inputs as SIMD32 [U]Short
// - Multiply widened inputs (SIMD32 [U]Short) as widened product (SIMD32 [U]Short)
// - Narrow widened product (SIMD32 [U]Short) as SIMD16 [U]Byte
widenIntrinsic = NI_AVX2_ConvertToVector256Int16;

if (simdBaseType == TYP_BYTE)
{
widenedSimdBaseJitType = CORINFO_TYPE_SHORT;
narrowIntrinsic = NI_AVX512BW_VL_ConvertToVector128SByte;
}
else
{
widenedSimdBaseJitType = CORINFO_TYPE_USHORT;
narrowIntrinsic = NI_AVX512BW_VL_ConvertToVector128Byte;
}

widenedType = TYP_SIMD32;
widenedSimdSize = 32;

// Vector256<ushort> widenedOp1 = Avx2.ConvertToVector256Int16(op1).AsUInt16()
GenTree* widenedOp1 = gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic,
simdBaseJitType, widenedSimdSize);

// Vector256<ushort> widenedOp2 = Avx2.ConvertToVector256Int16(op2).AsUInt16()
GenTree* widenedOp2 = gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic,
simdBaseJitType, widenedSimdSize);

// Vector256<ushort> widenedProduct = widenedOp1 * widenedOp2
GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2,
widenedSimdBaseJitType, widenedSimdSize);

// Vector128<byte> product = Avx512BW.VL.ConvertToVector128Byte(widenedProduct)
return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic,
widenedSimdBaseJitType, widenedSimdSize);
}
else
{
// Input is SIMD16 [U]Byte and AVX512BW_VL is NOT supported (only AVX2 will be used):
// - Widen inputs as SIMD32 [U]Short
// - Multiply widened inputs (SIMD32 [U]Short) as widened product (SIMD32 [U]Short)
// - Mask widened product (SIMD32 [U]Short) to select relevant bits
// - Pack masked product so that relevant bits are packed together in upper and lower halves
// - Shuffle packed product so that relevant bits are placed together in the lower half
// - Select lower (SIMD16 [U]Byte) from shuffled product (SIMD32 [U]Short)
widenedSimdBaseJitType =
simdBaseType == TYP_BYTE ? CORINFO_TYPE_SHORT : CORINFO_TYPE_USHORT;
widenIntrinsic = NI_AVX2_ConvertToVector256Int16;
widenedType = TYP_SIMD32;
widenedSimdSize = 32;

// Vector256<ushort> widenedOp1 = Avx2.ConvertToVector256Int16(op1).AsUInt16()
GenTree* widenedOp1 =
gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic, simdBaseJitType, simdSize);

// Vector256<ushort> widenedOp2 = Avx2.ConvertToVector256Int16(op2).AsUInt16()
GenTree* widenedOp2 =
gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic, simdBaseJitType, simdSize);

// Vector256<ushort> widenedProduct = widenedOp1 * widenedOp2
GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2,
widenedSimdBaseJitType, widenedSimdSize);

// Vector256<ushort> vecCon1 = Vector256.Create(0x00FF00FF00FF00FF).AsUInt16()
GenTreeVecCon* vecCon1 = gtNewVconNode(widenedType);

for (unsigned i = 0; i < (widenedSimdSize / 8); i++)
{
vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
}

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL));

// Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
widenedSimdBaseJitType, widenedSimdSize);
GenTree* maskedProductDup = fgMakeMultiUse(&maskedProduct);

// Vector256<ulong> packedProduct = Avx2.PackUnsignedSaturate(maskedProduct,
// maskedProduct).AsUInt64()
GenTree* packedProduct =
gtNewSimdHWIntrinsicNode(widenedType, maskedProduct, maskedProductDup,
NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
widenedSimdSize);

CorInfoType permuteBaseJitType =
(simdBaseType == TYP_BYTE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG;

// Vector256<byte> shuffledProduct = Avx2.Permute4x64(packedProduct, 0xD8).AsByte()
GenTree* shuffledProduct =
gtNewSimdHWIntrinsicNode(widenedType, packedProduct, gtNewIconNode(SHUFFLE_WYZX),
NI_AVX2_Permute4x64, permuteBaseJitType, widenedSimdSize);

// Vector128<byte> product = shuffledProduct.getLower()
return gtNewSimdGetLowerNode(type, shuffledProduct, simdBaseJitType, widenedSimdSize);
}
}

// No special handling could be performed, apply fallback logic:
// - Widen both inputs lower and upper halves as [U]Short (using helper method)
// - Multiply corresponding widened input halves together as widened product halves
// - Narrow widened product halves as [U]Byte (using helper method)
widenedSimdBaseJitType = simdBaseType == TYP_BYTE ? CORINFO_TYPE_SHORT : CORINFO_TYPE_USHORT;

// op1Dup = op1
GenTree* op1Dup = fgMakeMultiUse(&op1);

// op2Dup = op2
GenTree* op2Dup = fgMakeMultiUse(&op2);

// Vector256<ushort> lowerOp1 = Avx2.ConvertToVector256Int16(op1.GetLower()).AsUInt16()
GenTree* lowerOp1 = gtNewSimdWidenLowerNode(type, op1, simdBaseJitType, simdSize);

// Vector256<ushort> lowerOp2 = Avx2.ConvertToVector256Int16(op2.GetLower()).AsUInt16()
GenTree* lowerOp2 = gtNewSimdWidenLowerNode(type, op2, simdBaseJitType, simdSize);

// Vector256<ushort> lowerProduct = lowerOp1 * lowerOp2
GenTree* lowerProduct =
gtNewSimdBinOpNode(GT_MUL, type, lowerOp1, lowerOp2, widenedSimdBaseJitType, simdSize);

// Vector256<ushort> upperOp1 = Avx2.ConvertToVector256Int16(op1.GetUpper()).AsUInt16()
GenTree* upperOp1 = gtNewSimdWidenUpperNode(type, op1Dup, simdBaseJitType, simdSize);

// Vector256<ushort> upperOp2 = Avx2.ConvertToVector256Int16(op2.GetUpper()).AsUInt16()
GenTree* upperOp2 = gtNewSimdWidenUpperNode(type, op2Dup, simdBaseJitType, simdSize);

// Vector256<ushort> upperProduct = upperOp1 * upperOp2
GenTree* upperProduct =
gtNewSimdBinOpNode(GT_MUL, type, upperOp1, upperOp2, widenedSimdBaseJitType, simdSize);

// Narrow and merge halves using helper method
return gtNewSimdNarrowNode(type, lowerProduct, upperProduct, simdBaseJitType, simdSize);
}

case TYP_SHORT:
case TYP_USHORT:
{
Expand Down
6 changes: 0 additions & 6 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2429,12 +2429,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

assert(simdSize != 64 || IsBaselineVector512IsaSupportedDebugOnly());

if ((simdBaseType == TYP_BYTE) || (simdBaseType == TYP_UBYTE))
BladeWise marked this conversation as resolved.
Show resolved Hide resolved
{
// TODO-XARCH-CQ: We should support byte/sbyte multiplication
break;
}

if (varTypeIsLong(simdBaseType))
{
if (simdSize != 64 && !compOpportunisticallyDependsOn(InstructionSet_AVX512DQ_VL))
Expand Down
Loading