Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5405,6 +5405,9 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)

switch (intrinsicId)
{
case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
case NI_SSE2_ConvertToInt32:
case NI_SSE2_ConvertToUInt32:
case NI_SSE2_X64_ConvertToInt64:
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/decomposelongs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1702,11 +1702,15 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
case NI_Vector128_GetElement:
case NI_Vector256_GetElement:
case NI_Vector512_GetElement:
{
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
}

default:
{
noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition");
break;
}
}

return nullptr;
Expand Down
46 changes: 45 additions & 1 deletion src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19067,6 +19067,9 @@ bool GenTree::isContainableHWIntrinsic() const
}

case NI_Vector128_GetElement:
case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
case NI_SSE2_ConvertToInt32:
case NI_SSE2_ConvertToUInt32:
case NI_SSE2_X64_ConvertToInt64:
Expand Down Expand Up @@ -22057,27 +22060,56 @@ GenTree* Compiler::gtNewSimdGetElementNode(
assert(varTypeIsArithmetic(simdBaseType));

#if defined(TARGET_XARCH)
bool useToScalar = op2->IsIntegralConst(0);

#if defined(TARGET_X86)
// We handle decomposition via GetElement for simplicity
useToScalar &= !varTypeIsLong(simdBaseType);
#endif // TARGET_X86

if (useToScalar)
{
intrinsicId = NI_Vector128_ToScalar;

if (simdSize == 64)
{
intrinsicId = NI_Vector512_ToScalar;
}
else if (simdSize == 32)
{
intrinsicId = NI_Vector256_ToScalar;
}

return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize);
}

switch (simdBaseType)
{
// Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
case TYP_LONG:
case TYP_ULONG:
{
// Using software fallback if simdBaseType is not supported by hardware
assert(compIsaSupportedDebugOnly(InstructionSet_SSE41));
break;
}

case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
{
assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));
break;
}

default:
{
unreached();
}
}

if (simdSize == 64)
Expand All @@ -22089,6 +22121,18 @@ GenTree* Compiler::gtNewSimdGetElementNode(
intrinsicId = NI_Vector256_GetElement;
}
#elif defined(TARGET_ARM64)
if (op2->IsIntegralConst(0))
{
intrinsicId = NI_Vector128_ToScalar;

if (simdSize == 8)
{
intrinsicId = NI_Vector64_ToScalar;
}

return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize);
}

if (simdSize == 8)
{
intrinsicId = NI_Vector64_GetElement;
Expand Down
10 changes: 8 additions & 2 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1276,14 +1276,20 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
{
assert(varTypeIsFloating(baseType));

if (op1->isContained() || op1->isUsedFromSpillTemp())
{
if (varTypeIsIntegral(baseType))
{
// We just want to emit a standard read from memory
ins = ins_Move_Extend(baseType, false);
attr = emitTypeSize(baseType);
}
genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1);
}
else
{
assert(varTypeIsFloating(baseType));

// Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
emit->emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
}
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal,
HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToVector512, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
Expand Down Expand Up @@ -226,7 +226,7 @@ HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal,
HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, ToVector512, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
Expand Down Expand Up @@ -328,7 +328,7 @@ HARDWARE_INTRINSIC(Vector512, StoreAligned,
HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Subtract, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WidenLower, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WidenUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WithElement, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
Expand Down
30 changes: 23 additions & 7 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1678,34 +1678,48 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

op2 = impStackTop(0).val;

switch (simdBaseType)
{
// Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
case TYP_LONG:
case TYP_ULONG:
if (!compExactlyDependsOn(InstructionSet_SSE41))
{
bool useToScalar = op2->IsIntegralConst(0);

#if defined(TARGET_X86)
useToScalar &= !varTypeIsLong(simdBaseType);
#endif // TARGET_X86

if (!useToScalar && !compExactlyDependsOn(InstructionSet_SSE41))
{
// Using software fallback if simdBaseType is not supported by hardware
return nullptr;
}
break;
}

case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
{
// short/ushort/float/double is supported by SSE2
break;
}

default:
{
unreached();
}
}

GenTree* op2 = impPopStack().val;
GenTree* op1 = impSIMDPopStack();
impPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
break;
Expand Down Expand Up @@ -2543,16 +2557,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 1);

op1 = impSIMDPopStack();

#if defined(TARGET_X86)
if (varTypeIsLong(simdBaseType))
{
// TODO-XARCH-CQ: It may be beneficial to decompose this operation
// Create a GetElement node which handles decomposition
op2 = gtNewIconNode(0);
retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
break;
}
#endif // TARGET_X86

// TODO-XARCH-CQ: It may be beneficial to import this as GetElement(0)
op1 = impSIMDPopStack();
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
break;
}
Expand Down
Loading