From 496e50f1f076f3bc12af1fb8876ddf83ba8ef52a Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Sun, 19 Jan 2025 11:43:27 -0800
Subject: [PATCH 1/3] improve x86 integral to floating cast codegen

---
 src/coreclr/jit/codegen.h                     |  4 -
 src/coreclr/jit/codegenxarch.cpp              | 58 +++++++------
 src/coreclr/jit/instr.cpp                     | 75 ++++++++--------
 src/coreclr/jit/lowerxarch.cpp                | 10 +--
 src/coreclr/jit/lsraxarch.cpp                 | 11 ++-
 src/coreclr/jit/morph.cpp                     | 85 +++++--------
 .../JitBlue/Runtime_106338/Runtime_106338.cs  |  2 +-
 7 files changed, 94 insertions(+), 151 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index d84ba8760dca00..d1d03f62898e2d 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1621,11 +1621,7 @@ class CodeGen final : public CodeGenInterface
     instruction ins_Copy(var_types dstType);
     instruction ins_Copy(regNumber srcReg, var_types dstType);
-#if defined(TARGET_XARCH)
-    instruction ins_FloatConv(var_types to, var_types from, emitAttr attr);
-#elif defined(TARGET_ARM)
     instruction ins_FloatConv(var_types to, var_types from);
-#endif
     instruction ins_MathOp(genTreeOps oper, var_types type);
 
     void instGen_Return(unsigned stkArgSize);
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index b3090f94e2c4f9..60019544e95694 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7219,10 +7219,9 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
     }
     else
     {
-        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType));
+        instruction ins = ins_FloatConv(dstType, srcType);
 
-        // integral to floating-point conversions all have RMW semantics if VEX support
-        // is not available
+        // floating-point conversions all have RMW semantics if VEX support is not available
         bool isRMW = !compiler->canUseVexEncoding();
         inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
@@ -7299,29 +7298,18 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
     noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
 
-    // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions
-    // here since they should have been lowered appropriately.
-    noway_assert(srcType != TYP_UINT);
-    assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) || compiler->canUseEvexEncodingDebugOnly());
+    // Also we don't expect to see uint32 -> float/double here unless the EVEX unsigned conversion
+    // instructions are available, since they should have been lowered appropriately.
+    assert((srcType != TYP_UINT) || compiler->canUseEvexEncodingDebugOnly());
 
-    if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) && compiler->canUseEvexEncoding())
-    {
-        assert(compiler->canUseEvexEncodingDebugOnly());
-        genConsumeOperands(treeNode->AsOp());
-        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
-        GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
-        genProduceReg(treeNode);
-        return;
-    }
-
-    // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used
+    // To convert an integral type to floating point, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
-    // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop,
+    // upper bytes unmodified. If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop,
If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop, // the partial write could introduce a false dependency and could cause a stall // if there are further uses of xmmReg. We have such a case occurring with a // customer reported version of SpectralNorm benchmark, resulting in 2x perf // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before - // cvtsi2ss/sd instruction. + // cvt[u]si2ss/sd instruction. genConsumeOperands(treeNode->AsOp()); GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(), @@ -7329,21 +7317,27 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // Note that here we need to specify srcType that will determine // the size of source reg/mem operand and rex.w prefix. - instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType)); + instruction ins = ins_FloatConv(dstType, srcType); // integral to floating-point conversions all have RMW semantics if VEX support // is not available const bool isRMW = !compiler->canUseVexEncoding(); - // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction - // will interpret ULONG value as LONG. Hence we need to adjust the - // result if sign-bit of srcType is set. - if (srcType == TYP_ULONG) + if (varTypeIsUnsigned(srcType) && compiler->canUseEvexEncoding()) + { + GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + } + else if (srcType == TYP_ULONG) { - assert(dstType == TYP_DOUBLE); assert(op1->isUsedFromReg()); + // We will have selected an unsigned conversion instruction above, but we + // can't use it here because we have no EVEX support. Switch to the signed + // equivalent, and add code to fix up the result. + + ins = ins_FloatConv(dstType, genActualType(srcType)); + // The following LONG->DOUBLE cast machinery is based on clang's implementation // with "-ffp-model=strict" flag: // @@ -7374,7 +7368,15 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) BasicBlock* label = genCreateTempLabel(); inst_JMP(EJ_jns, label); - GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg); + if (dstType == TYP_FLOAT) + { + GetEmitter()->emitIns_R_R(INS_addss, EA_4BYTE, targetReg, targetReg); + } + else + { + assert(dstType == TYP_DOUBLE); + GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg); + } genDefineTempLabel(label); } else @@ -7447,7 +7449,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // Note that we need to specify dstType here so that it will determine // the size of destination integer register and also the rex.w prefix. genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType)); + instruction ins = ins_FloatConv(dstType, srcType); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); genProduceReg(treeNode); } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 775cbd69237810..2b0f34ebb49bb1 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -2414,49 +2414,57 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) // Arguments: // to - Destination type. // from - Source type. -// attr - Input size. // // Returns: // The correct conversion instruction to use based on src and dst types. 
 //
-instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
+instruction CodeGen::ins_FloatConv(var_types to, var_types from)
 {
-    // AVX: Supports following conversions
-    //   srcType = int16/int64                 castToType = float
-    // AVX512: Supports following conversions
-    //   srcType = ulong                       castToType = double/float
-
     switch (from)
     {
-        // int/long -> float/double use the same instruction but type size would be different.
         case TYP_INT:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_cvtsi2ss32;
+                case TYP_DOUBLE:
+                    return INS_cvtsi2sd32;
+                default:
+                    unreached();
+            }
+            break;
+
         case TYP_LONG:
             switch (to)
             {
                 case TYP_FLOAT:
-                {
-                    if (EA_SIZE(attr) == EA_4BYTE)
-                    {
-                        return INS_cvtsi2ss32;
-                    }
-                    else if (EA_SIZE(attr) == EA_8BYTE)
-                    {
-                        return INS_cvtsi2ss64;
-                    }
+                    return INS_cvtsi2ss64;
+                case TYP_DOUBLE:
+                    return INS_cvtsi2sd64;
+                default:
                     unreached();
-                }
+            }
+            break;
+
+        case TYP_UINT:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_vcvtusi2ss32;
                 case TYP_DOUBLE:
-                {
-                    if (EA_SIZE(attr) == EA_4BYTE)
-                    {
-                        return INS_cvtsi2sd32;
-                    }
-                    else if (EA_SIZE(attr) == EA_8BYTE)
-                    {
-                        return INS_cvtsi2sd64;
-                    }
+                    return INS_vcvtusi2sd32;
+                default:
                     unreached();
-                }
+            }
+            break;
+
+        case TYP_ULONG:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_vcvtusi2ss64;
+                case TYP_DOUBLE:
+                    return INS_vcvtusi2sd64;
                 default:
                     unreached();
             }
@@ -2502,17 +2510,6 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
             }
             break;
 
-        case TYP_ULONG:
-            switch (to)
-            {
-                case TYP_DOUBLE:
-                    return INS_vcvtusi2sd64;
-                case TYP_FLOAT:
-                    return INS_vcvtusi2ss64;
-                default:
-                    unreached();
-            }
-
         default:
             unreached();
     }
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index c80d19e9fdd4f2..1778695a6ad529 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -843,26 +843,20 @@ GenTree* Lowering::LowerCast(GenTree* tree)
         srcType = varTypeToUnsigned(srcType);
     }
 
-    // We should never see the following casts as they are expected to be lowered
-    // appropriately or converted into helper calls by front-end.
+    // We should not see the following casts unless directly supported by hardware,
+    // as they are expected to be lowered appropriately or converted into helper calls by front-end.
     // srcType = float/double                   castToType = * and overflow detecting cast
     // Reason: must be converted to a helper call
     // srcType = float/double,                  castToType = ulong
     // Reason: must be converted to a helper call
     // srcType = uint                           castToType = float/double
     // Reason: uint -> float/double = uint -> long -> float/double
-    // srcType = ulong                          castToType = float
-    // Reason: ulong -> float = ulong -> double -> float
     if (varTypeIsFloating(srcType))
     {
         noway_assert(!tree->gtOverflow());
         assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
     }
     else if (srcType == TYP_UINT)
-    {
-        noway_assert(!varTypeIsFloating(castToType));
-    }
-    else if (srcType == TYP_ULONG)
     {
         assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
     }
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 5a1bd13cddd09d..07c0705941d5fc 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -2805,7 +2805,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
     }
     else
     {
-        // Currently dstCount = 2 is only used for DivRem, which has special constriants and handled above
+        // Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
        assert((dstCount == 0) ||
                ((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
     }
 
@@ -2828,14 +2828,13 @@ int LinearScan::BuildCast(GenTreeCast* cast)
 {
     GenTree* src = cast->gtGetOp1();
 
-    const var_types srcType  = genActualType(src->TypeGet());
+    const var_types srcType  = src->TypeGet();
     const var_types castType = cast->gtCastType;
 
-    if ((srcType == TYP_LONG) && (castType == TYP_DOUBLE) &&
-        !compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+    if (cast->IsUnsigned() && varTypeIsLong(srcType) && varTypeIsFloating(castType) && !compiler->canUseEvexEncoding())
     {
-        // We need two extra temp regs for LONG->DOUBLE cast
-        // if we don't have AVX512F available.
+        // We need two extra temp regs for ULONG->DOUBLE/FLOAT cast
+        // if we don't have EVEX unsigned conversions available.
         buildInternalIntRegisterDefForNode(cast);
         buildInternalIntRegisterDefForNode(cast);
     }
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 8b2269bbd677de..7e38d7b78f8db6 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -290,38 +290,6 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
     var_types dstType = tree->CastToType();
     unsigned  dstSize = genTypeSize(dstType);
 
-#if defined(TARGET_AMD64)
-    // If AVX512 is present, we have intrinsic available to convert
-    // ulong directly to float. Hence, we need to combine the 2 nodes
-    // GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT) into a single
-    // node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT). At this point, we already
-    // have the 2 GT_CAST nodes in the tree and we are combining them below.
-    if (oper->OperIs(GT_CAST))
-    {
-        GenTreeCast* innerCast = oper->AsCast();
-
-        if (innerCast->IsUnsigned())
-        {
-            GenTree*  innerOper    = innerCast->CastOp();
-            var_types innerSrcType = genActualType(innerOper);
-            var_types innerDstType = innerCast->CastToType();
-            unsigned  innerDstSize = genTypeSize(innerDstType);
-            innerSrcType           = varTypeToUnsigned(innerSrcType);
-
-            // Check if we are going from ulong->double->float
-            if ((innerSrcType == TYP_ULONG) && (innerDstType == TYP_DOUBLE) && (dstType == TYP_FLOAT))
-            {
-                if (canUseEvexEncoding())
-                {
-                    // One optimized (combined) cast here
-                    tree = gtNewCastNode(TYP_FLOAT, innerOper, true, TYP_FLOAT);
-                    return fgMorphTree(tree);
-                }
-            }
-        }
-    }
-#endif // TARGET_AMD64
-
     // See if the cast has to be done in two steps.  R -> I
     if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType))
     {
@@ -442,16 +410,23 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         }
 #endif //! TARGET_64BIT
 
-#ifdef TARGET_ARMARCH
-    // AArch, unlike x86/amd64, has instructions that can cast directly from
-    // all integers (except for longs on AArch32 of course) to floats.
+#if defined(TARGET_ARMARCH) || defined(TARGET_XARCH)
     // Because there is no IL instruction conv.r4.un, uint/ulong -> float
     // casts are always imported as CAST(float <- CAST(double <- uint/ulong)).
-    // We can eliminate the redundant intermediate cast as an optimization.
+    // We can usually eliminate the redundant intermediate cast as an optimization.
+    // AArch and xarch+EVEX have instructions that can cast directly from
+    // all integers (except for longs on 32-bit of course) to floats.
+    // On x64, we also have the option of widening uint -> long and
+    // using the signed conversion instructions, and ulong -> float/double
+    // is handled directly in codegen, so we can allow all casts,
+    // regardless of hardware support.
     else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST)
-#ifdef TARGET_ARM
-             && !varTypeIsLong(oper->AsCast()->CastOp())
-#endif
+#ifndef TARGET_64BIT
+             && !varTypeIsLong(oper->AsCast()->CastOp())
+#endif // TARGET_64BIT
+#ifdef TARGET_X86
+             && canUseEvexEncoding()
+#endif // TARGET_X86
     )
     {
         oper->gtType = TYP_FLOAT;
@@ -459,7 +434,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
 
         return fgMorphTree(oper);
     }
-#endif // TARGET_ARMARCH
+#endif // TARGET_ARMARCH || TARGET_XARCH
 
 #ifdef TARGET_ARM
     // converts long/ulong --> float/double casts into helper calls.
@@ -485,35 +460,15 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
 #endif // TARGET_ARM
 
 #ifdef TARGET_AMD64
-    // Do we have to do two step U4/8 -> R4/8 ?
-    // Codegen supports the following conversion as one-step operation
-    //   a) Long -> R4/R8
-    //   b) U8 -> R8
-    //
-    // The following conversions are performed as two-step operations using above.
-    //   U4 -> R4/8 = U4 -> Long -> R4/8
-    //   U8 -> R4   = U8 -> R8 -> R4
+    // Do we have to do two step U4 -> R4/8 ?
+    // If we don't have the EVEX unsigned conversion instructions available,
+    // we will widen to long and use signed conversion: U4 -> Long -> R4/8.
+    // U8 -> R4/R8 is handled directly in codegen, so we ignore it here.
     else if (tree->IsUnsigned() && varTypeIsFloating(dstType))
     {
         srcType = varTypeToUnsigned(srcType);
 
-        if (srcType == TYP_ULONG && !canUseEvexEncoding())
-        {
-            if (dstType == TYP_FLOAT)
-            {
-                // Codegen can handle U8 -> R8 conversion.
-                // U8 -> R4 = U8 -> R8 -> R4
-                // - change the dsttype to double
-                // - insert a cast from double to float
-                // - recurse into the resulting tree
-                tree->CastToType() = TYP_DOUBLE;
-                tree->gtType       = TYP_DOUBLE;
-                tree               = gtNewCastNode(TYP_FLOAT, tree, false, TYP_FLOAT);
-
-                return fgMorphTree(tree);
-            }
-        }
-        else if (srcType == TYP_UINT)
+        if (srcType == TYP_UINT && !canUseEvexEncoding())
         {
             oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG);
             oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
@@ -533,7 +488,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         {
             return fgMorphCastIntoHelper(tree, CORINFO_HELP_ULNG2DBL, oper);
         }
-        else if (srcType == TYP_UINT)
+        else if (srcType == TYP_UINT && !canUseEvexEncoding())
         {
             oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG);
             oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs
index 5cc1b75d42fda1..ed1b5cd56ace7a 100644
--- a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs
@@ -22,7 +22,7 @@ public static void TestEntryPoint()
         float vr11 = 4294967295U | vr10;
         uint result = BitConverter.SingleToUInt32Bits(vr11);
 
-        if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || ((RuntimeInformation.ProcessArchitecture == Architecture.X64) && Avx512F.IsSupported))
+        if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || (RuntimeInformation.ProcessArchitecture == Architecture.X64))
         {
             // Expected to cast ulong -> float directly
             Assert.Equal(1600094603U, result);

From 77b15f0d603de4f74f607237968ed22e933a906c Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Wed, 22 Jan 2025 16:39:12 -0800
Subject: [PATCH 2/3] more cleanup

---
 src/coreclr/jit/codegenxarch.cpp | 30 +++++++++++-------------------
 src/coreclr/jit/lowerxarch.cpp   | 25 +++++++------------------
 src/coreclr/jit/lsraxarch.cpp    |  2 +-
 src/coreclr/jit/morph.cpp        | 13 ++++++-------
 4 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 60019544e95694..55dbcd58eb866d 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7247,7 +7247,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 {
     // int type --> float/double conversions are always non-overflow ones
-    assert(treeNode->OperGet() == GT_CAST);
+    assert(treeNode->OperIs(GT_CAST));
     assert(!treeNode->gtOverflow());
 
     regNumber targetReg = treeNode->GetRegNum();
@@ -7278,7 +7278,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     // operation.
     if (srcType == TYP_BYREF)
     {
-        noway_assert(op1->OperGet() == GT_LCL_ADDR);
+        noway_assert(op1->OperIs(GT_LCL_ADDR));
         srcType = TYP_I_IMPL;
     }
 
@@ -7298,10 +7298,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
     noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
 
-    // Also we don't expect to see uint32 -> float/double here unless the EVEX unsigned conversion
-    // instructions are available, since they should have been lowered appropriately.
-    assert((srcType != TYP_UINT) || compiler->canUseEvexEncodingDebugOnly());
-
     // To convert an integral type to floating point, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
     // upper bytes unmodified. If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop,
@@ -7312,23 +7308,13 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     // cvt[u]si2ss/sd instruction.
 
     genConsumeOperands(treeNode->AsOp());
-    GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(),
-                                     treeNode->GetRegNum(), INS_OPTS_NONE);
+    GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg, INS_OPTS_NONE);
 
     // Note that here we need to specify srcType that will determine
     // the size of source reg/mem operand and rex.w prefix.
     instruction ins = ins_FloatConv(dstType, srcType);
 
-    // integral to floating-point conversions all have RMW semantics if VEX support
-    // is not available
-
-    const bool isRMW = !compiler->canUseVexEncoding();
-
-    if (varTypeIsUnsigned(srcType) && compiler->canUseEvexEncoding())
-    {
-        GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
-    }
-    else if (srcType == TYP_ULONG)
+    if (srcType == TYP_ULONG && !compiler->canUseEvexEncoding())
     {
         assert(op1->isUsedFromReg());
 
@@ -7381,6 +7367,12 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     }
     else
     {
+        assert(varTypeIsIntOrI(srcType) || compiler->canUseEvexEncodingDebugOnly());
+
+        // integral to floating-point conversions all have RMW semantics if VEX support
+        // is not available
+
+        const bool isRMW = !compiler->canUseVexEncoding();
         inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
     }
     genProduceReg(treeNode);
@@ -7406,7 +7398,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
 {
     // we don't expect to see overflow detecting float/double --> int type conversions here
     // as they should have been converted into helper calls by front-end.
-    assert(treeNode->OperGet() == GT_CAST);
+    assert(treeNode->OperIs(GT_CAST));
     assert(!treeNode->gtOverflow());
 
     regNumber targetReg = treeNode->GetRegNum();
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 1778695a6ad529..50f9d641bcf745 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -800,10 +800,9 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
 *  GT_CAST(int16,  float/double)   =   GT_CAST(GT_CAST(int16, int32), float/double)
 *  GT_CAST(uint16, float/double)   =   GT_CAST(GT_CAST(uint16, int32), float/double)
 *
-* SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64
+* Unless the EVEX conversion instructions are available, casts from Uint32
 * are morphed as follows by front-end and hence should not be seen here.
 *  GT_CAST(uint32, float/double)   =   GT_CAST(GT_CAST(uint32, long), float/double)
-*  GT_CAST(uint64, float)          =   GT_CAST(GT_CAST(uint64, double), float)
 *
 *
 * Similarly casts from float/double to a smaller int type are transformed as follows:
 *  GT_CAST(float/double, byte)     =   GT_CAST(GT_CAST(float/double, int32), byte)
 *  GT_CAST(float/double, sbyte)    =   GT_CAST(GT_CAST(float/double, int32), sbyte)
 *  GT_CAST(float/double, int16)    =   GT_CAST(GT_CAST(double/double, int32), int16)
 *  GT_CAST(float/double, uint16)   =   GT_CAST(GT_CAST(double/double, int32), uint16)
 *
-* SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit
-* integer. The above transformations help us to leverage those instructions.
-*
 * Note that for the following conversions we still depend on helper calls and
 * don't expect to see them here.
-*  i) GT_CAST(float/double, uint64)
+*  i) GT_CAST(float/double, uint64) when EVEX is not available
 * ii) GT_CAST(float/double, int type with overflow detection)
-*
-* TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
-* There are hardly any occurrences of this conversion operation in platform
-* assemblies or in CQ perf benchmarks (1 occurrence in corelib, microsoft.jscript,
-* 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics
-* system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
-* doing this optimization is a win, should consider generating in-lined code.
 */
 GenTree* Lowering::LowerCast(GenTree* tree)
 {
-    assert(tree->OperGet() == GT_CAST);
+    assert(tree->OperIs(GT_CAST));
 
     GenTree*  castOp     = tree->AsCast()->CastOp();
     var_types castToType = tree->CastToType();
@@ -838,7 +827,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
     var_types tmpType    = TYP_UNDEF;
 
     // force the srcType to unsigned if GT_UNSIGNED flag is set
-    if (tree->gtFlags & GTF_UNSIGNED)
+    if (tree->IsUnsigned())
     {
         srcType = varTypeToUnsigned(srcType);
     }
@@ -854,11 +843,11 @@ GenTree* Lowering::LowerCast(GenTree* tree)
     if (varTypeIsFloating(srcType))
     {
         noway_assert(!tree->gtOverflow());
-        assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
+        assert(castToType != TYP_ULONG || comp->canUseEvexEncodingDebugOnly());
     }
     else if (srcType == TYP_UINT)
     {
-        assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
+        assert(castToType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly());
     }
 
 #if defined(TARGET_AMD64)
@@ -978,7 +967,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
             BlockRange().InsertAfter(newCast, newTree);
             LowerNode(newTree);
 
-            // usage 2 --> use thecompared mask with input value and max value to blend
+            // usage 2 --> use the compared mask with input value and max value to blend
             GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
             BlockRange().InsertAfter(newTree, control);
             GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree,
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 07c0705941d5fc..98129f9016cc10 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -3089,7 +3089,7 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/)
 
         if (sizeOfSIMDVector >= 32)
         {
-            assert((sizeOfSIMDVector == 32) || ((sizeOfSIMDVector == 64) && compiler->canUseEvexEncoding()));
+            assert((sizeOfSIMDVector == 32) || ((sizeOfSIMDVector == 64) && compiler->canUseEvexEncodingDebugOnly()));
             compiler->GetEmitter()->SetContains256bitOrMoreAVX(true);
         }
     }
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 7e38d7b78f8db6..33614d332c9165 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -418,12 +418,11 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
     // all integers (except for longs on 32-bit of course) to floats.
     // On x64, we also have the option of widening uint -> long and
     // using the signed conversion instructions, and ulong -> float/double
-    // is handled directly in codegen, so we can allow all casts,
-    // regardless of hardware support.
+    // is handled directly in codegen, so we can allow all casts.
     else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST)
 #ifndef TARGET_64BIT
             && !varTypeIsLong(oper->AsCast()->CastOp())
-#endif // TARGET_64BIT
+#endif // !TARGET_64BIT
 #ifdef TARGET_X86
             && canUseEvexEncoding()
 #endif // TARGET_X86
     )
@@ -496,7 +495,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
             return fgMorphCastIntoHelper(tree, CORINFO_HELP_LNG2DBL, oper);
         }
     }
-    else if (((tree->gtFlags & GTF_UNSIGNED) == 0) && (srcType == TYP_LONG) && varTypeIsFloating(dstType))
+    else if (!tree->IsUnsigned() && (srcType == TYP_LONG) && varTypeIsFloating(dstType))
     {
         oper = fgMorphCastIntoHelper(tree, CORINFO_HELP_LNG2DBL, oper);
 
         // Since long to float conversion is expensive and there is no direct helper call
         // we just use the one that converts to a TYP_DOUBLE
         // and then add a cast to TYP_FLOAT
         //
-        if ((dstType == TYP_FLOAT) && (oper->OperGet() == GT_CALL))
+        if ((dstType == TYP_FLOAT) && oper->OperIs(GT_CALL))
         {
             // Fix the return type to be TYP_DOUBLE
             //
@@ -563,7 +562,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         // the result of an AND is bound by its smaller operand, it may be
         // possible to prove that the cast won't overflow, which will in turn
         // allow the cast's operand to be transformed.
-        if (tree->gtOverflow() && (oper->OperGet() == GT_AND))
+        if (tree->gtOverflow() && oper->OperIs(GT_AND))
         {
             GenTree* andOp2 = oper->AsOp()->gtOp2;
 
@@ -571,7 +570,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
             // than 2^{31} for a cast to int.
             int maxWidth = (dstType == TYP_UINT) ? 32 : 31;
 
-            if ((andOp2->OperGet() == GT_CNS_NATIVELONG) && ((andOp2->AsIntConCommon()->LngValue() >> maxWidth) == 0))
+            if (andOp2->OperIs(GT_CNS_NATIVELONG) && ((andOp2->AsIntConCommon()->LngValue() >> maxWidth) == 0))
             {
                 tree->ClearOverflow();
                 tree->SetAllEffectsFlags(oper);

From 4724cf601a481add0e40acdec3911c39b9b967b7 Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Mon, 27 Jan 2025 11:53:21 -0800
Subject: [PATCH 3/3] more cleanup

---
 src/coreclr/jit/codegenxarch.cpp | 50 +++++++++++---------------------
 1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 9e76594818c954..5291ac6e131bc5 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7264,11 +7264,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     var_types srcType = op1->TypeGet();
     assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
 
-#if !defined(TARGET_64BIT)
-    // We expect morph to replace long to float/double casts with helper calls
-    noway_assert(!varTypeIsLong(srcType));
-#endif // !defined(TARGET_64BIT)
-
     // Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we
     // ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except
     // for GT_LCL_ADDR that represent stack addresses and can be considered as TYP_I_IMPL. In all other
@@ -7281,21 +7276,14 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
         srcType = TYP_I_IMPL;
     }
 
-    // force the srcType to unsigned if GT_UNSIGNED flag is set
-    if (treeNode->gtFlags & GTF_UNSIGNED)
-    {
-        srcType = varTypeToUnsigned(srcType);
-    }
-
-    noway_assert(!varTypeIsGC(srcType));
-
-    // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long).
+    // At this point, srcType should only be int or long.
     // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect
     // either the front-end or lowering phase to have generated two levels of cast.
     // The first one is for widening smaller int type to int32 and the second one is
     // to the float/double.
-    emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
-    noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
+    // On 32-bit, we expect morph to replace long to float/double casts with helper calls,
+    // so we should only see int here.
+    noway_assert(varTypeIsIntOrI(srcType));
 
     // To convert an integral type to floating point, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
@@ -7309,19 +7297,21 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 
     genConsumeOperands(treeNode->AsOp());
     GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg, INS_OPTS_NONE);
 
-    // Note that here we need to specify srcType that will determine
-    // the size of source reg/mem operand and rex.w prefix.
-    instruction ins = ins_FloatConv(dstType, srcType);
+    // force the srcType to unsigned if GT_UNSIGNED flag is set
+    if (treeNode->IsUnsigned())
+    {
+        srcType = varTypeToUnsigned(srcType);
+    }
 
     if (srcType == TYP_ULONG && !compiler->canUseEvexEncoding())
     {
         assert(op1->isUsedFromReg());
 
-        // We will have selected an unsigned conversion instruction above, but we
-        // can't use it here because we have no EVEX support. Switch to the signed
-        // equivalent, and add code to fix up the result.
+        // If we don't have the EVEX unsigned conversion instructions, use the signed
+        // long -> float/double conversion instruction instead and fix up the result.
 
-        ins = ins_FloatConv(dstType, genActualType(srcType));
+        instruction convIns = ins_FloatConv(dstType, TYP_LONG);
+        instruction addIns  = (dstType == TYP_FLOAT) ? INS_addss : INS_addsd;
 
         // The following LONG->DOUBLE cast machinery is based on clang's implementation
         // with "-ffp-model=strict" flag:
         //
@@ -7349,25 +7339,19 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
         GetEmitter()->emitIns_R_R(INS_or, EA_8BYTE, tmpReg2, tmpReg1);
         GetEmitter()->emitIns_R_R(INS_test, EA_8BYTE, argReg, argReg);
         GetEmitter()->emitIns_R_R(INS_cmovns, EA_8BYTE, tmpReg2, argReg);
-        GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, tmpReg2);
+        GetEmitter()->emitIns_R_R(convIns, EA_8BYTE, targetReg, tmpReg2);
 
         BasicBlock* label = genCreateTempLabel();
         inst_JMP(EJ_jns, label);
-        if (dstType == TYP_FLOAT)
-        {
-            GetEmitter()->emitIns_R_R(INS_addss, EA_4BYTE, targetReg, targetReg);
-        }
-        else
-        {
-            assert(dstType == TYP_DOUBLE);
-            GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg);
-        }
+        GetEmitter()->emitIns_R_R(addIns, EA_ATTR(genTypeSize(dstType)), targetReg, targetReg);
         genDefineTempLabel(label);
     }
     else
     {
         assert(varTypeIsIntOrI(srcType) || compiler->canUseEvexEncodingDebugOnly());
 
+        instruction ins = ins_FloatConv(dstType, srcType);
+
         // integral to floating-point conversions all have RMW semantics if VEX support
         // is not available
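
Editor's note (appended for review; not part of the patches above): the no-EVEX
ULONG -> DOUBLE/FLOAT fallback that patch 3 emits (shr/and/or, test/cmovns,
cvtsi2sd/ss, jns, addsd/ss) is easier to follow in scalar form. Below is a minimal
standalone C++ sketch of that round-to-odd technique, under the assumption that only
the signed conversion is available. The helper name UInt64ToDouble is hypothetical
and exists only for illustration.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical scalar model of the JIT's no-EVEX ulong -> double fixup.
    static double UInt64ToDouble(uint64_t v)
    {
        if ((int64_t)v >= 0)
        {
            // Sign bit clear: the signed conversion already rounds correctly.
            // This is the cmovns path; the jns then skips the add.
            return (double)(int64_t)v;
        }

        // Sign bit set: halve the value, OR-ing the dropped low bit back into
        // bit 0 ("round to odd") so it still acts as a sticky bit for the
        // conversion's rounding, convert with the signed instruction, then
        // double the result (the final addsd/addss, which is exact).
        uint64_t half = (v >> 1) | (v & 1);
        double   d    = (double)(int64_t)half;
        return d + d;
    }

    int main()
    {
        // 2^64 - 1 converts to 1.8446744073709552e+19
        printf("%.17g\n", UInt64ToDouble(0xFFFFFFFFFFFFFFFFull));
        return 0;
    }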