diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 3845678c08a542..ee4b544c7082f5 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6249,6 +6249,7 @@ class Compiler void fgConvertBBToThrowBB(BasicBlock* block); bool fgCastNeeded(GenTree* tree, var_types toType); + bool fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow = false); void fgLoopCallTest(BasicBlock* srcBB, BasicBlock* dstBB); void fgLoopCallMark(); diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 6425c393b3620d..79228482cf8279 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -138,6 +138,8 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) } #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + // On x86, long->floating casts are implemented in DecomposeCast. + // Those nodes, plus any nodes that produce a long, will be examined. if (!tree->TypeIs(TYP_LONG) && !(tree->OperIs(GT_CAST) && varTypeIsLong(tree->AsCast()->CastOp()) && varTypeIsFloating(tree))) #else @@ -159,6 +161,9 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) // HWIntrinsics can consume/produce a long directly, provided its source/target is memory. // Here we do a conservative check for specific cases where it is certain the load/store // can be contained. In those cases, we can skip decomposition. + // + // We also look for longs consumed directly by a long->floating cast. These can skip + // decomposition because the cast is implemented using HWIntrinsics. GenTree* user = use.User(); @@ -582,44 +587,187 @@ GenTree* DecomposeLongs::DecomposeCast(LIR::Use& use) } #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) - if (varTypeIsFloating(dstType)) + if (varTypeIsFloating(srcType) || varTypeIsFloating(dstType)) { // We will reach this path only if morph did not convert the cast to a helper call, // meaning we can perform the cast using SIMD instructions. - // The sequence this creates is simply: - // AVX512DQ.VL.ConvertToVector128Single(Vector128.CreateScalarUnsafe(LONG)).ToScalar() - - NamedIntrinsic intrinsicId = NI_Illegal; - GenTree* srcOp = cast->CastOp(); - var_types dstType = cast->CastToType(); - CorInfoType baseFloatingType = (dstType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; - CorInfoType baseIntegralType = cast->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; assert(!cast->gtOverflow()); assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512)); - intrinsicId = (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double; + GenTree* srcOp = cast->CastOp(); + GenTree* castResult = nullptr; + LIR::Range castRange = LIR::EmptyRange(); + CorInfoType srcBaseType = CORINFO_TYPE_UNDEF; + CorInfoType dstBaseType = CORINFO_TYPE_UNDEF; - GenTree* createScalar = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, baseIntegralType, 16); - GenTree* convert = - m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, createScalar, intrinsicId, baseIntegralType, 16); - GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(dstType, convert, baseFloatingType, 16); + if (varTypeIsFloating(srcType)) + { + srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; + dstBaseType = (dstType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; + } + else + { + srcBaseType = (srcType == TYP_ULONG) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; + dstBaseType = (dstType == TYP_FLOAT) ? 
CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; + } - Range().InsertAfter(cast, createScalar, convert, toScalar); - Range().Remove(cast); + // This creates the equivalent of the following C# code: + // var srcVec = Vector128.CreateScalarUnsafe(castOp); + + GenTree* srcVector = m_compiler->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, srcOp, srcBaseType, 16); + castRange.InsertAtEnd(srcVector); - if (createScalar->IsCnsVec()) + if (srcVector->IsCnsVec()) { Range().Remove(srcOp); } + if (varTypeIsFloating(dstType)) + { + // long->floating casts don't require any kind of fixup. We simply use the vector + // form of the instructions, because the scalar form is not supported on 32-bit. + + NamedIntrinsic intrinsicId = + (dstType == TYP_FLOAT) ? NI_AVX512_ConvertToVector128Single : NI_AVX512_ConvertToVector128Double; + + castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16); + } + else if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) + { + // Likewise, the AVX10.2 saturating floating->long instructions give the correct result, + // but we have to use the vector form. + + NamedIntrinsic intrinsicId = (dstType == TYP_ULONG) + ? NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation + : NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation; + + castResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, intrinsicId, srcBaseType, 16); + } + else if (dstType == TYP_ULONG) + { + // AVX-512 unsigned conversion instructions correctly saturate for positive overflow, so + // we only need to fix up negative or NaN values before conversion. + // + // maxs[sd] will take the value from the second operand if the first operand's value is + // NaN, which allows us to fix up both negative and NaN values with a single instruction. + // + // This creates the equivalent of the following C# code: + // var fixupVal = Sse.MaxScalar(srcVec, Vector128.Zero); + // castResult = Avx512DQ.VL.ConvertToVector128UInt64WithTruncation(fixupVal); + + GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16); + GenTree* fixupVal = m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, NI_X86Base_MaxScalar, + srcBaseType, 16); + + castRange.InsertAtEnd(zero); + castRange.InsertAtEnd(fixupVal); + + castResult = + m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal, + NI_AVX512_ConvertToVector128UInt64WithTruncation, srcBaseType, 16); + } + else + { + assert(dstType == TYP_LONG); + + // We will use the input value multiple times, so we replace it with a lclVar. + LIR::Use srcUse; + LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse); + srcUse.ReplaceWithLclVar(m_compiler); + srcVector = srcUse.Def(); + + // We fix up NaN values by masking in zero during conversion. Negative saturation is handled + // correctly by the conversion instructions. Positive saturation is handled after conversion, + // because MaxValue is not precisely representable in the floating format. 
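+            // (long.MaxValue, 2^63 - 1, rounds up to exactly 2^63 as a float or double, so the
+            // bound cannot be applied on the integer side; instead, inputs >= 2^63 are detected
+            // with a floating compare and the result is replaced with long.MaxValue.)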
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //      var nanMask = Avx.CompareScalar(srcVec, srcVec, FloatComparisonMode.OrderedNonSignaling);
+            //      var convert = Avx512DQ.VL.ConvertToVector128Int64WithTruncation(srcVec);
+            //      convertResult = Vector128.ConditionalSelect(nanMask, convert, Vector128.Zero);
+
+            GenTree* srcClone = m_compiler->gtClone(srcVector);
+            GenTree* compareMode =
+                m_compiler->gtNewIconNode(static_cast<int>(FloatComparisonMode::OrderedNonSignaling));
+            GenTree* nanMask = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcVector, srcClone, compareMode,
+                                                                    NI_AVX512_CompareScalarMask, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(nanMask);
+
+            srcClone = m_compiler->gtClone(srcVector);
+            GenTree* convertResult =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone,
+                                                     NI_AVX512_ConvertToVector128Int64WithTruncation, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(convertResult);
+
+            nanMask = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, nanMask, dstBaseType, 16);
+            GenTree* zero = m_compiler->gtNewZeroConNode(TYP_SIMD16);
+            convertResult = m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, nanMask, convertResult, zero, dstBaseType, 16);
+
+            castRange.InsertAtEnd(nanMask);
+            castRange.InsertAtEnd(zero);
+            castRange.InsertAtEnd(convertResult);
+
+            // Now we handle saturation of the result for positive overflow.
+            //
+            // This creates roughly the equivalent of the following C# code:
+            //      var compareMode = FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling;
+            //      var maxFloatingValue = Vector128.Create(9223372036854775808.0);
+            //      var compareMax = Avx.CompareScalar(srcVec, maxFloatingValue, compareMode);
+            //      var maxLong = Vector128<long>.AllBitsSet >>> 1;
+            //      castResult = Vector128.ConditionalSelect(compareMax, maxLong, convertResult);
+
+            compareMode = m_compiler->gtNewIconNode(
+                static_cast<int>(FloatComparisonMode::OrderedGreaterThanOrEqualNonSignaling));
+
+            GenTreeVecCon* maxFloatingValue = m_compiler->gtNewVconNode(TYP_SIMD16);
+            maxFloatingValue->EvaluateBroadcastInPlace(srcType, 9223372036854775808.0);
+
+            srcClone = m_compiler->gtClone(srcVector);
+            GenTree* compareMax =
+                m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, srcClone, maxFloatingValue, compareMode,
+                                                     NI_AVX512_CompareScalarMask, srcBaseType, 16);
+
+            castRange.InsertAtEnd(srcClone);
+            castRange.InsertAtEnd(maxFloatingValue);
+            castRange.InsertAtEnd(compareMode);
+            castRange.InsertAtEnd(compareMax);
+
+            GenTree* allOnes = m_compiler->gtNewAllBitsSetConNode(TYP_SIMD16);
+            GenTree* one = m_compiler->gtNewIconNode(1);
+            GenTree* maxLong = m_compiler->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, allOnes, one, dstBaseType, 16);
+
+            castRange.InsertAtEnd(allOnes);
+            castRange.InsertAtEnd(one);
+            castRange.InsertAtEnd(maxLong);
+
+            compareMax = m_compiler->gtNewSimdCvtMaskToVectorNode(TYP_SIMD16, compareMax, dstBaseType, 16);
+            castResult =
+                m_compiler->gtNewSimdCndSelNode(TYP_SIMD16, compareMax, maxLong, convertResult, dstBaseType, 16);
+
+            castRange.InsertAtEnd(compareMax);
+        }
+
+        // Because the results are in a SIMD register, we need to ToScalar() them out.
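+        //
+        // This creates the equivalent of the following C# code:
+        //      return castResult.ToScalar();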
+ GenTree* toScalar = m_compiler->gtNewSimdToScalarNode(genActualType(dstType), castResult, dstBaseType, 16); + + castRange.InsertAtEnd(castResult); + castRange.InsertAtEnd(toScalar); + + Range().InsertAfter(cast, std::move(castRange)); + Range().Remove(cast); + if (use.IsDummyUse()) { toScalar->SetUnusedValue(); } use.ReplaceWith(toScalar); - return toScalar->gtNext; + return toScalar; } #endif // FEATURE_HW_INTRINSICS && TARGET_X86 diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 1bff7f951db56e..62b48bfc0d52a5 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -1270,6 +1270,43 @@ bool Compiler::fgCastNeeded(GenTree* tree, var_types toType) return true; } +//------------------------------------------------------------------------------------- +// fgCastRequiresHelper: Check whether a given cast must be converted to a helper call. +// +// Arguments: +// fromType - The source type of the cast. +// toType - The target type of the cast. +// overflow - True if the cast has the GTF_OVERFLOW flag set. +// +// Return Value: +// True if the cast requires a helper call, otherwise false. +// +bool Compiler::fgCastRequiresHelper(var_types fromType, var_types toType, bool overflow /* false */) +{ + if (varTypeIsFloating(fromType) && overflow) + { + assert(varTypeIsIntegral(toType)); + return true; + } + +#if !defined(TARGET_64BIT) + if ((varTypeIsFloating(fromType) && varTypeIsLong(toType)) || + (varTypeIsLong(fromType) && varTypeIsFloating(toType))) + { +#if defined(TARGET_X86) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) + { + return false; + } +#endif // TARGET_X86 + + return true; + } +#endif // !TARGET_64BIT + + return false; +} + GenTree* Compiler::fgGetCritSectOfStaticMethod() { noway_assert(!compIsForInlining()); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 482035112b0706..f61e6f34bcbe85 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -1230,6 +1230,7 @@ HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqualMask, HARDWARE_INTRINSIC(AVX512, CompareNotLessThanMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqualMask, -1, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareOrderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(AVX512, CompareScalarMask, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpss, INS_vcmpsd}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompareUnorderedMask, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcmpps, INS_vcmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(AVX512, CompressMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, 
HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX512, CompressStoreMask, -1, 3, {INS_vpcompressb, INS_vpcompressb, INS_vpcompressw, INS_vpcompressw, INS_vpcompressd, INS_vpcompressd, INS_vpcompressq, INS_vpcompressq, INS_vcompressps, INS_vcompresspd}, HW_Category_MemoryStore, HW_Flag_NoFlag) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 0a326b51662b34..95c7636f5d938d 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -494,6 +494,7 @@ int HWIntrinsicInfo::lookupImmUpperBound(NamedIntrinsic id) case NI_AVX_CompareScalar: case NI_AVX512_Compare: case NI_AVX512_CompareMask: + case NI_AVX512_CompareScalarMask: case NI_AVX10v2_MinMaxScalar: case NI_AVX10v2_MinMax: { diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index c2b0b30f243b01..2a920d4c2d46bf 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -8258,28 +8258,8 @@ void Compiler::impImportBlockCode(BasicBlock* block) goto _CONV; _CONV: - // only converts from FLOAT or DOUBLE to an integer type - // and converts from ULONG (or LONG on ARM) to DOUBLE are morphed to calls - - if (varTypeIsFloating(lclTyp)) - { - callNode = varTypeIsLong(impStackTop().val) || - uns // uint->dbl gets turned into uint->long->dbl -#ifdef TARGET_64BIT - // TODO-ARM64-Bug?: This was AMD64; I enabled it for ARM64 also. OK? - // TYP_BYREF could be used as TYP_I_IMPL which is long. - // TODO-CQ: remove this when we lower casts long/ulong --> float/double - // and generate SSE2 code instead of going through helper calls. - || impStackTop().val->TypeIs(TYP_BYREF) -#endif - ; - } - else - { - callNode = varTypeIsFloating(impStackTop().val->TypeGet()); - } - - op1 = impPopStack().val; + op1 = impPopStack().val; + callNode = fgCastRequiresHelper(op1->TypeGet(), lclTyp, ovfl); impBashVarAddrsToI(op1); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6134dde57b21e9..adaf29a41d2d91 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -501,7 +501,7 @@ GenTree* Lowering::LowerNode(GenTree* node) } LowerCast(node); - break; + return nextNode; } case GT_BITCAST: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4dac70f9e6998a..764d409aca1217 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -826,7 +826,7 @@ void Lowering::LowerCast(GenTree* tree) GenTree* castOp = tree->AsCast()->CastOp(); var_types dstType = tree->CastToType(); - var_types srcType = castOp->TypeGet(); + var_types srcType = genActualType(castOp); // force the srcType to unsigned if GT_UNSIGNED flag is set if (tree->IsUnsigned()) @@ -843,12 +843,104 @@ void Lowering::LowerCast(GenTree* tree) // Long types should have been handled by helper call or in DecomposeLongs on x86. assert(!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit); } - else if (srcType == TYP_UINT) + +#ifdef TARGET_X86 + if ((srcType == TYP_UINT) && varTypeIsFloating(dstType) && + !comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - // uint->float casts should have an intermediate cast to long unless - // we have the EVEX unsigned conversion instructions available. - assert(dstType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly()); + // Pre-AVX-512, there was no conversion instruction for uint->floating, so we emulate it + // using signed int conversion. 
This is necessary only on 32-bit, because x64 simply casts + // the uint up to a signed long before conversion. + // + // This logic depends on the fact that conversion from int to double is lossless. When + // converting to float, we use a double intermediate, and convert to float only after the + // double result is fixed up. This ensures the floating result is rounded correctly. + + LABELEDDISPTREERANGE("LowerCast before", BlockRange(), tree); + + LIR::Range castRange = LIR::EmptyRange(); + CorInfoType dstBaseType = CORINFO_TYPE_DOUBLE; + + // We will use the input value twice, so replace it with a lclVar. + LIR::Use srcUse; + LIR::Use::MakeDummyUse(castRange, castOp, &srcUse); + srcUse.ReplaceWithLclVar(comp); + castOp = srcUse.Def(); + + // This creates the equivalent of the following C# code: + // var castResult = Sse2.ConvertScalarToVector128Double(Vector128.Zero, (int)castOp); + + GenTree* zero = comp->gtNewZeroConNode(TYP_SIMD16); + GenTree* castResult = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, zero, castOp, NI_X86Base_ConvertScalarToVector128Double, + CORINFO_TYPE_INT, 16); + + castRange.InsertAtEnd(zero); + castRange.InsertAtEnd(castResult); + + // We will use the conversion result multiple times, so replace it with a lclVar. + LIR::Use resUse; + LIR::Use::MakeDummyUse(castRange, castResult, &resUse); + resUse.ReplaceWithLclVar(comp); + castResult = resUse.Def(); + + // If the input had the MSB set, it will have converted as a negative, so we must wrap the + // result back around to positive by adding 2^32. `blendvpd` uses only the MSB of the mask + // element. + // + // This creates the equivalent of the following C# code: + // var addRes = Sse2.AddScalar(castResult, Vector128.CreateScalar(4294967296.0)); + // castResult = Sse41.BlendVariable(castResult, addRes, castResult); + + GenTreeVecCon* addCns = comp->gtNewVconNode(TYP_SIMD16); + addCns->gtSimdVal.f64[0] = 4294967296.0; + + GenTree* addRes = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult, addCns, NI_X86Base_AddScalar, dstBaseType, 16); + + castRange.InsertAtEnd(addCns); + castRange.InsertAtEnd(addRes); + + GenTree* resClone1 = comp->gtClone(castResult); + GenTree* resClone2 = comp->gtClone(castResult); + castResult = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, resClone1, addRes, resClone2, NI_X86Base_BlendVariable, + dstBaseType, 16); + castRange.InsertAtEnd(resClone1); + castRange.InsertAtEnd(resClone2); + castRange.InsertAtEnd(castResult); + + // Convert to float if necessary, then ToScalar() the result out. 
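+        //
+        // This creates the equivalent of the following C# code:
+        //      var single = Sse2.ConvertToVector128Single(castResult);  // only when dstType is float
+        //      return single.ToScalar();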
+ if (dstType == TYP_FLOAT) + { + castResult = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castResult, NI_X86Base_ConvertToVector128Single, + dstBaseType, 16); + dstBaseType = CORINFO_TYPE_FLOAT; + castRange.InsertAtEnd(castResult); + } + + GenTree* toScalar = comp->gtNewSimdToScalarNode(dstType, castResult, dstBaseType, 16); + castRange.InsertAtEnd(toScalar); + + LIR::ReadOnlyRange lowerRange(castRange.FirstNode(), castRange.LastNode()); + BlockRange().InsertBefore(tree, std::move(castRange)); + + LABELEDDISPTREERANGE("LowerCast after", BlockRange(), toScalar); + + LIR::Use castUse; + if (BlockRange().TryGetUse(tree, &castUse)) + { + castUse.ReplaceWith(toScalar); + } + else + { + toScalar->SetUnusedValue(); + } + + BlockRange().Remove(tree); + LowerRange(lowerRange); + return; } +#endif // TARGET_X86 #ifdef FEATURE_HW_INTRINSICS if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && @@ -10227,6 +10319,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX512_Shuffle: case NI_AVX512_SumAbsoluteDifferencesInBlock32: case NI_AVX512_CompareMask: + case NI_AVX512_CompareScalarMask: case NI_AES_CarrylessMultiply: case NI_AES_V256_CarrylessMultiply: case NI_AES_V512_CarrylessMultiply: diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index a4f09b64f6213a..da14c6b2c32031 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -264,8 +264,7 @@ GenTree* Compiler::fgMorphIntoHelperCall(GenTree* tree, int helper, bool morphAr // casts for all targets. // 2. Morphs casts not supported by the target directly into helpers. // These mostly have to do with casts from and to floating point -// types, especially checked ones. Refer to the implementation for -// what specific casts need to be handled - it is a complex matrix. +// types, especially checked ones. // 3. "Casts away" the GC-ness of a tree (for CAST(nint <- byref)) via // storing the GC tree to an inline non-GC temporary. // 3. "Pushes down" truncating long -> int casts for some operations: @@ -288,27 +287,11 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) GenTree* oper = tree->CastOp(); var_types srcType = genActualType(oper); var_types dstType = tree->CastToType(); - unsigned dstSize = genTypeSize(dstType); - // See if the cast has to be done in two steps. R -> I if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType)) { - if (srcType == TYP_FLOAT -#ifdef TARGET_64BIT - // 64-bit: src = float, dst is overflow conversion. - // This goes through helper and hence src needs to be converted to double. - && tree->gtOverflow() -#else - // 32-bit: src = float, dst = int64/uint64 or overflow conversion. - && (tree->gtOverflow() || varTypeIsLong(dstType)) -#endif // TARGET_64BIT - ) - { - oper = gtNewCastNode(TYP_DOUBLE, oper, false, TYP_DOUBLE); - } - // Do we need to do it in two steps R -> I -> smallType? - if (dstSize < genTypeSize(TYP_INT)) + if (varTypeIsSmall(dstType)) { oper = gtNewCastNodeL(TYP_INT, oper, /* fromUnsigned */ false, TYP_INT); oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); @@ -318,45 +301,51 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) // CAST_OVF(BYTE <- INT) != CAST_OVF(BYTE <- UINT). 
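+            // For example, the bit pattern 0xFFFFFF80 is -128 when interpreted as signed (in
+            // range for BYTE), but 4294967168 when interpreted as unsigned (out of range).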
assert(!tree->IsUnsigned()); } - else + else if (fgCastRequiresHelper(srcType, dstType, tree->gtOverflow())) { - if (!tree->gtOverflow()) + CorInfoHelpFunc helper = CORINFO_HELP_UNDEF; + + if (srcType == TYP_FLOAT) { -#ifdef TARGET_64BIT - return nullptr; -#else - if (!varTypeIsLong(dstType)) - { - return nullptr; - } + oper = gtNewCastNode(TYP_DOUBLE, oper, false, TYP_DOUBLE); + } + if (tree->gtOverflow()) + { switch (dstType) { + case TYP_INT: + helper = CORINFO_HELP_DBL2INT_OVF; + break; + case TYP_UINT: + helper = CORINFO_HELP_DBL2UINT_OVF; + break; case TYP_LONG: - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2LNG, oper); + helper = CORINFO_HELP_DBL2LNG_OVF; + break; case TYP_ULONG: - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2ULNG, oper); + helper = CORINFO_HELP_DBL2ULNG_OVF; + break; default: unreached(); } -#endif // TARGET_64BIT } else { switch (dstType) { - case TYP_INT: - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2INT_OVF, oper); - case TYP_UINT: - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2UINT_OVF, oper); case TYP_LONG: - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2LNG_OVF, oper); + helper = CORINFO_HELP_DBL2LNG; + break; case TYP_ULONG: - return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2ULNG_OVF, oper); + helper = CORINFO_HELP_DBL2ULNG; + break; default: unreached(); } } + + return fgMorphCastIntoHelper(tree, helper, oper); } } @@ -386,13 +375,12 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) tree->ClearUnsigned(); tree->AsCast()->CastOp() = oper; } -#endif //! TARGET_64BIT -#ifdef TARGET_ARM - // converts long/ulong --> float/double casts into helper calls. - else if (varTypeIsFloating(dstType) && varTypeIsLong(srcType)) + // Convert long/ulong --> float/double casts into helper calls if necessary. + else if (varTypeIsLong(srcType) && varTypeIsFloating(dstType) && fgCastRequiresHelper(srcType, dstType)) { CorInfoHelpFunc helper = CORINFO_HELP_UNDEF; + if (dstType == TYP_FLOAT) { helper = tree->IsUnsigned() ? CORINFO_HELP_ULNG2FLT : CORINFO_HELP_LNG2FLT; @@ -401,65 +389,25 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) { helper = tree->IsUnsigned() ? CORINFO_HELP_ULNG2DBL : CORINFO_HELP_LNG2DBL; } + return fgMorphCastIntoHelper(tree, helper, oper); } -#endif // TARGET_ARM +#endif // !TARGET_64BIT #ifdef TARGET_AMD64 // Do we have to do two step U4 -> R4/8 ? // If we don't have the EVEX unsigned conversion instructions available, // we will widen to long and use signed conversion: U4 -> Long -> R4/8. - // U8 -> R4/R8 is handled directly in codegen, so we ignore it here. - else if (tree->IsUnsigned() && varTypeIsFloating(dstType)) + else if (tree->IsUnsigned() && varTypeIsInt(srcType) && varTypeIsFloating(dstType) && + !compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - srcType = varTypeToUnsigned(srcType); - - if (srcType == TYP_UINT && !canUseEvexEncoding()) - { - oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG); - oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); - tree->ClearUnsigned(); - tree->CastOp() = oper; - } + oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG); + oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); + tree->ClearUnsigned(); + tree->CastOp() = oper; } #endif // TARGET_AMD64 -#ifdef TARGET_X86 -#ifdef FEATURE_HW_INTRINSICS - else if (varTypeIsLong(srcType) && varTypeIsFloating(dstType) && canUseEvexEncoding()) - { - // We can handle these casts directly using SIMD instructions. - // The transform to SIMD is done in DecomposeLongs. 
- return nullptr; - } -#endif // FEATURE_HW_INTRINSICS - - // Do we have to do two step U4/8 -> R4/8 ? - else if (tree->IsUnsigned() && varTypeIsFloating(dstType)) - { - srcType = varTypeToUnsigned(srcType); - - if (srcType == TYP_ULONG) - { - CorInfoHelpFunc helper = (dstType == TYP_FLOAT) ? CORINFO_HELP_ULNG2FLT : CORINFO_HELP_ULNG2DBL; - return fgMorphCastIntoHelper(tree, helper, oper); - } - else if (srcType == TYP_UINT && !canUseEvexEncoding()) - { - oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG); - oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); - tree->ClearUnsigned(); - - CorInfoHelpFunc helper = (dstType == TYP_FLOAT) ? CORINFO_HELP_LNG2FLT : CORINFO_HELP_LNG2DBL; - return fgMorphCastIntoHelper(tree, helper, oper); - } - } - else if (!tree->IsUnsigned() && (srcType == TYP_LONG) && varTypeIsFloating(dstType)) - { - CorInfoHelpFunc helper = (dstType == TYP_FLOAT) ? CORINFO_HELP_LNG2FLT : CORINFO_HELP_LNG2DBL; - return fgMorphCastIntoHelper(tree, helper, oper); - } -#endif // TARGET_X86 else if (varTypeIsGC(srcType) != varTypeIsGC(dstType)) { // We are casting away GC information. we would like to just
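
For reference, the following is a minimal scalar model of the saturating floating->long
semantics that the DecomposeCast TYP_LONG path above implements. The helper name is
hypothetical, for illustration only, and is not part of this change:

    static long DoubleToInt64Saturating(double d)
    {
        // NaN is masked to zero by the CompareScalarMask/ConditionalSelect pair.
        if (double.IsNaN(d))
            return 0;

        // Inputs >= 2^63 are fixed up to long.MaxValue after conversion, because
        // long.MaxValue itself is not exactly representable as a double.
        if (d >= 9223372036854775808.0)
            return long.MaxValue;

        // Negative overflow saturates to long.MinValue, matching the behavior of
        // the AVX-512 truncating conversion instruction.
        if (d <= -9223372036854775808.0)
            return long.MinValue;

        // In-range values truncate toward zero.
        return (long)d;
    }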