1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
@@ -6249,6 +6249,7 @@ class Compiler
void fgConvertBBToThrowBB(BasicBlock* block);

bool fgCastNeeded(GenTree* tree, var_types toType);
bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false);

void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB);
void fgLoopCallMark();
184 changes: 166 additions & 18 deletions src/coreclr/jit/decomposelongs.cpp
@@ -138,6 +138,8 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
// On x86, long->floating casts are implemented in DecomposeCast.
// Those nodes, plus any nodes that produce a long, will be examined.
if (!tree->TypeIs(TYP_LONG) &&
!(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree)))
#else
@@ -159,6 +161,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
// Here we do a conservative check for specific cases where it is certain the load/store
// can be contained. In those cases, we can skip decomposition.
//
// We also look for longs consumed directly by a long->floating cast. These can skip
// decomposition because the cast is implemented using HWIntrinsics.

GenTree* user = use.User();

@@ -582,44 +587,187 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use)
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
if (varTypeIsFloating(dstType))
if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType))
{
// We will reach this path only if morph did not convert the cast to a helper call,
// meaning we can perform the cast using SIMD instructions.
// The sequence this creates is simply:
// AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar()

NamedIntrinsic intrinsicId = NI_Illegal;
GenTree* srcOp = cast->CastOp();
var_types dstType = cast->CastToType();
CorInfoType baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
CorInfoType baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;

assert(!cast->gtOverflow());
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512));

intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;
GenTree* srcOp = cast->CastOp();
GenTree* castResult = nullptr;
LIR::Range castRange = LIR::EmptyRange();
CorInfoType srcBaseType = CORINFO_TYPE_UNDEF;
CorInfoType dstBaseType = CORINFO_TYPE_UNDEF;

GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16);
GenTree* convert =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16);
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16);
if (varTypeIsFloating(srcType))
[Review thread on this line]

Member:
Do we know if the compiler is CSEing this check with the above check as expected?

-- Asking since manually caching might be a way to win some throughput back.

Member (Author):
Not for sure, but I generally assume C++ compilers will handle 'obvious' ones like this. It should be noted, the throughput hit to x86 directly correlates with the number of casts that are now inlined.

i.e. the only significant throughput hit is on the coreclr_tests collection

[image]

which is also the one that had the most casts in it

[image]

Member:
👍. The biggest concern is the TP hit to minopts. It may be desirable to leave that using the helper there so that floating-point heavy code doesn't start up slower.

Member (Author):
I think #117512 will reduce the hit a bit.

> It may be desirable to leave that using the helper there so that floating-point heavy code doesn't start up slower.

This is interesting, because the same argument could apply to the complicated saturating logic that we have for x64 as well. #97529 introduced a similar throughput regression, and although it was done for correctness instead of perf, the throughput hit could have been avoided by using the helper in minopts there too.

Member:
> the throughput hit could have been avoided by using the helper in minopts there too.

AFAIR, the JIT throughput hit there ended up being very minimal (and often an improvement). It was the perf score and code output size that regressed, which was expected.

If there was a significant perf score hit to minopts, then yes the same would apply here and it would likely be beneficial to ensure that is doing the "better" thing as well.

{
srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
dstBaseType = (dstType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
}
else
{
srcBaseType = (srcType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG;
dstBaseType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE;
}

Range().InsertAfter(cast, createScalar, convert, toScalar);
Range().Remove(cast);
// This creates the equivalent of the following C# code:
// var srcVec = Vector128.CreateScalarUnsafe(castOp);

GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcBaseType, 16);
castRange.InsertAtEnd(srcVector);

if (createScalar->IsCnsVec())
if (srcVector->IsCnsVec())
{
Range().Remove(srcOp);
}

if (varTypeIsFloating(dstType))
{
// long->floating casts don't require any kind of fixup. We simply use the vector
// form of the instructions, because the scalar form is not supported on 32-bit.

NamedIntrinsic intrinsicId =
(dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double;

castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
}
else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
{
// Likewise, the AVX10.2 saturating floating->long instructions give the correct result,
// but we have to use the vector form.

NamedIntrinsic intrinsicId = (dstType == TYP_ULONG)
? NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation
: NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;

castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16);
}
else if (dstType == TYP_ULONG)
{
// AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so
// we only need to fix up negative or NaN values before conversion.
//
// maxs[sd] will take the value from the second operand if the first operand's value is
// NaN, which allows us to fix up both negative and NaN values with a single instruction.
//
// This creates the equivalent of the following C# code:
// var fixupVal = Sse.MaxScalar(srcVec, Vector128<T>.Zero);
// castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal);

GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
GenTree* fixupVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar,
srcBaseType, 16);

castRange.InsertAtEnd(zero);
castRange.InsertAtEnd(fixupVal);

castResult =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal,
NI_AVX512_ConvertToVector128UInt64WithTruncation, srcBaseType, 16);
}
else
{
assert(dstType == TYP_LONG);

// We will use the input value multiple times, so we replace it with a lclVar.
LIR::Use srcUse;
LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse);
srcUse.ReplaceWithLclVar(m_compiler);
srcVector = srcUse.Def();

// We fix up NaN values by masking in zero during conversion. Negative saturation is handled
// correctly by the conversion instructions. Positive saturation is handled after conversion,
// because MaxValue is not precisely representable in the floating format.
//
// This creates roughly the equivalent of the following C# code:
// var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
// var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
// convertResult = Vector128.ConditionalSelect(nanMask, convert, Vector128<long>.Zero);

GenTree* srcClone = m_compiler->gtClone(srcVector);
GenTree* compareMode =
m_compiler->gtNewIconNode(static_cast<int32_t>(FloatComparisonMode::OrderedNonSignaling));
GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
NI_AVX512_CompareScalarMask, srcBaseType, 16);

castRange.InsertAtEnd(srcClone);
castRange.InsertAtEnd(compareMode);
castRange.InsertAtEnd(nanMask);

srcClone = m_compiler->gtClone(srcVector);
GenTree* convertResult =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
NI_AVX512_ConvertToVector128Int64WithTruncation, srcBaseType, 16);

castRange.InsertAtEnd(srcClone);
castRange.InsertAtEnd(convertResult);

nanMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, nanMask, dstBaseType, 16);
GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
convertResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, nanMask, convertResult, zero, dstBaseType, 16);

castRange.InsertAtEnd(nanMask);
castRange.InsertAtEnd(zero);
castRange.InsertAtEnd(convertResult);

// Now we handle saturation of the result for positive overflow.
//
// This creates roughly the equivalent of the following C# code:
// var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
// var maxFloatingValue = Vector128.Create(9223372036854775808.0);
// var compareMax = Avx.CompareScalar(srcVec, maxFloatingValue, compareMode);
// var maxLong = Vector128<long>.AllOnes >>> 1;
// castResult = Vector128.ConditionalSelect(compareMax, maxLong, convertResult);

compareMode = m_compiler->gtNewIconNode(
static_cast<int32_t>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));

GenTreeVecCon* maxFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
maxFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0);

srcClone = m_compiler->gtClone(srcVector);
GenTree* compareMax =
m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, maxFloatingValue, compareMode,
NI_AVX512_CompareScalarMask, srcBaseType, 16);

castRange.InsertAtEnd(srcClone);
castRange.InsertAtEnd(maxFloatingValue);
castRange.InsertAtEnd(compareMode);
castRange.InsertAtEnd(compareMax);

GenTree* allOnes = m_compiler->gtNewAllBitsSetConNode(TYP_SIMD16);
GenTree* one = m_compiler->gtNewIconNode(1);
GenTree* maxLong = m_compiler->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, allOnes, one, dstBaseType, 16);

castRange.InsertAtEnd(allOnes);
castRange.InsertAtEnd(one);
castRange.InsertAtEnd(maxLong);

compareMax = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, compareMax, dstBaseType, 16);
castResult =
m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, compareMax, maxLong, convertResult, dstBaseType, 16);

castRange.InsertAtEnd(compareMax);
}

// Because the results are in a SIMD register, we need to ToScalar() them out.
GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstBaseType, 16);

castRange.InsertAtEnd(castResult);
castRange.InsertAtEnd(toScalar);

Range().InsertAfter(cast, std::move(castRange));
Range().Remove(cast);

if (use.IsDummyUse())
{
toScalar->SetUnusedValue();
}
use.ReplaceWith(toScalar);

return toScalar->gtNext;
return toScalar;
}
#endif // FEATURE_HW_INTRINSICS && TARGET_X86

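For context, here is a minimal scalar C# sketch of the saturating behavior the TYP_LONG path above reproduces (illustrative only; the method name is made up and this snippet is not part of the change): NaN converts to zero, negative overflow saturates in the conversion instruction itself, and positive overflow is fixed up after conversion because long.MaxValue is not exactly representable in floating point.

```csharp
// Rough scalar model of the SIMD sequence built in DecomposeCast for floating -> long.
// Name and structure are hypothetical; shown only to clarify the intended semantics.
static long SaturatingConvertToInt64(double value)
{
    if (double.IsNaN(value))
    {
        return 0; // NaN is zeroed via the compare mask + conditional select
    }

    if (value >= 9223372036854775808.0) // 2^63
    {
        return long.MaxValue; // positive overflow is fixed up after the conversion
    }

    // Negative overflow already saturates to long.MinValue in the conversion instruction;
    // in-range values truncate toward zero.
    return value <= -9223372036854775808.0 ? long.MinValue : (long)value;
}
```

The unsigned (TYP_ULONG) path is simpler: negative and NaN inputs are clamped to zero with a single maxs[sd] before conversion, and positive overflow is saturated by the AVX-512 instruction itself.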
37 changes: 37 additions & 0 deletions src/coreclr/jit/flowgraph.cpp
@@ -1270,6 +1270,43 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType)
return true;
}

//-------------------------------------------------------------------------------------
// fgCastRequiresHelper: Check whether a given cast must be converted to a helper call.
//
// Arguments:
// fromType - The source type of the cast.
// toType - The target type of the cast.
// overflow - True if the cast has the GTF_OVERFLOW flag set.
//
// Return Value:
// True if the cast requires a helper call, otherwise false.
//
bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */)
{
if (varTypeIsFloating(fromType) && overflow)
{
assert(varTypeIsIntegral(toType));
return true;
}

#if !defined(TARGET_64BIT)
if ((varTypeIsFloating(fromType) && varTypeIsLong(toType)) ||
(varTypeIsLong(fromType) && varTypeIsFloating(toType)))
{
#if defined(TARGET_X86)
if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
{
return false;
}
#endif // TARGET_X86

return true;
}
#endif // !TARGET_64BIT

return false;
}

GenTree* Compiler::fgGetCritSectOfStaticMethod()
{
noway_assert(!compIsForInlining());
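A hedged example of how this predicate classifies casts on a 32-bit x86 target (variable names are invented for illustration; the actual decision also depends on the ISA available at JIT time):

```csharp
// Illustrative only: the cast shapes fgCastRequiresHelper is asked about on 32-bit targets.
double d = 123.456;
long l = 9876543210L;

long x = (long)d;     // floating -> long: helper call, unless x86 with AVX-512 (then inlined)
double y = (double)l; // long -> floating: same rule in the other direction

checked
{
    int z = (int)d;   // overflow-checking cast from floating point: always a helper call
}
```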
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistxarch.h
@@ -1230,6 +1230,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask,
HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, HW_Category_MemoryStore, HW_Flag_NoFlag)
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -494,6 +494,7 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id)
case NI_AVX_CompareScalar:
case NI_AVX512_Compare:
case NI_AVX512_CompareMask:
case NI_AVX512_CompareScalarMask:
case NI_AVX10v2_MinMaxScalar:
case NI_AVX10v2_MinMax:
{
24 changes: 2 additions & 22 deletions src/coreclr/jit/importer.cpp
@@ -8258,28 +8258,8 @@ void Compiler::impImportBlockCode(BasicBlock* block)
goto _CONV;

_CONV:
// only converts from FLOAT or DOUBLE to an integer type
// and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls

if (varTypeIsFloating(lclTyp))
{
callNode = varTypeIsLong(impStackTop().val) ||
uns // uint->dbl gets turned into uint->long->dbl
#ifdef TARGET_64BIT
// TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK?
// TYP_BYREF could be used as TYP_I_IMPL which is long.
// TODO-CQ: remove this when we lower casts long/ulong --> float/double
// and generate SSE2 code instead of going through helper calls.
|| impStackTop().val->TypeIs(TYP_BYREF)
#endif
;
}
else
{
callNode = varTypeIsFloating(impStackTop().val->TypeGet());
}

op1 = impPopStack().val;
op1 = impPopStack().val;
callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl);

impBashVarAddrsToI(op1);

2 changes: 1 addition & 1 deletion src/coreclr/jit/lower.cpp
@@ -501,7 +501,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
}

LowerCast(node);
break;
return nextNode;
}

case GT_BITCAST: