From 496e50f1f076f3bc12af1fb8876ddf83ba8ef52a Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Sun, 19 Jan 2025 11:43:27 -0800
Subject: [PATCH 1/3] improve x86 integral to floating cast codegen

---
 src/coreclr/jit/codegen.h                     |  4 -
 src/coreclr/jit/codegenxarch.cpp              | 58 +++++++------
 src/coreclr/jit/instr.cpp                     | 75 ++++++++--------
 src/coreclr/jit/lowerxarch.cpp                | 10 +--
 src/coreclr/jit/lsraxarch.cpp                 | 11 ++-
 src/coreclr/jit/morph.cpp                     | 85 +++++--------
 .../JitBlue/Runtime_106338/Runtime_106338.cs  |  2 +-
 7 files changed, 94 insertions(+), 151 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index d84ba8760dca00..d1d03f62898e2d 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1621,11 +1621,7 @@ class CodeGen final : public CodeGenInterface
     instruction ins_Copy(var_types dstType);
     instruction ins_Copy(regNumber srcReg, var_types dstType);
-#if defined(TARGET_XARCH)
-    instruction ins_FloatConv(var_types to, var_types from, emitAttr attr);
-#elif defined(TARGET_ARM)
     instruction ins_FloatConv(var_types to, var_types from);
-#endif
     instruction ins_MathOp(genTreeOps oper, var_types type);
 
     void instGen_Return(unsigned stkArgSize);
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index b3090f94e2c4f9..60019544e95694 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7219,10 +7219,9 @@ void CodeGen::genFloatToFloatCast(GenTree* treeNode)
     }
     else
     {
-        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(dstType));
+        instruction ins = ins_FloatConv(dstType, srcType);
 
-        // integral to floating-point conversions all have RMW semantics if VEX support
-        // is not available
+        // floating-point conversions all have RMW semantics if VEX support is not available
         bool isRMW = !compiler->canUseVexEncoding();
         inst_RV_RV_TT(ins, emitTypeSize(dstType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
@@ -7299,29 +7298,18 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
     noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
 
-    // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions
-    // here since they should have been lowered appropriately.
-    noway_assert(srcType != TYP_UINT);
-    assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) || compiler->canUseEvexEncodingDebugOnly());
+    // Also we don't expect to see uint32 -> float/double here unless the EVEX unsigned conversion
+    // instructions are available, since they should have been lowered appropriately.
+    assert((srcType != TYP_UINT) || compiler->canUseEvexEncodingDebugOnly());
 
-    if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) && compiler->canUseEvexEncoding())
-    {
-        assert(compiler->canUseEvexEncodingDebugOnly());
-        genConsumeOperands(treeNode->AsOp());
-        instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType));
-        GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
-        genProduceReg(treeNode);
-        return;
-    }
-
-    // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used
+    // To convert an integral type to floating point, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
-    // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop,
+    // upper bytes unmodified. If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop,
If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop, // the partial write could introduce a false dependency and could cause a stall // if there are further uses of xmmReg. We have such a case occurring with a // customer reported version of SpectralNorm benchmark, resulting in 2x perf // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before - // cvtsi2ss/sd instruction. + // cvt[u]si2ss/sd instruction. genConsumeOperands(treeNode->AsOp()); GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(), @@ -7329,21 +7317,27 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // Note that here we need to specify srcType that will determine // the size of source reg/mem operand and rex.w prefix. - instruction ins = ins_FloatConv(dstType, TYP_INT, emitTypeSize(srcType)); + instruction ins = ins_FloatConv(dstType, srcType); // integral to floating-point conversions all have RMW semantics if VEX support // is not available const bool isRMW = !compiler->canUseVexEncoding(); - // Handle the case of srcType = TYP_ULONG. SSE2 conversion instruction - // will interpret ULONG value as LONG. Hence we need to adjust the - // result if sign-bit of srcType is set. - if (srcType == TYP_ULONG) + if (varTypeIsUnsigned(srcType) && compiler->canUseEvexEncoding()) + { + GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + } + else if (srcType == TYP_ULONG) { - assert(dstType == TYP_DOUBLE); assert(op1->isUsedFromReg()); + // We will have selected an unsigned conversion instruction above, but we + // can't use it here because we have no EVEX support. Switch to the signed + // equivalent, and add code to fix up the result. + + ins = ins_FloatConv(dstType, genActualType(srcType)); + // The following LONG->DOUBLE cast machinery is based on clang's implementation // with "-ffp-model=strict" flag: // @@ -7374,7 +7368,15 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) BasicBlock* label = genCreateTempLabel(); inst_JMP(EJ_jns, label); - GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg); + if (dstType == TYP_FLOAT) + { + GetEmitter()->emitIns_R_R(INS_addss, EA_4BYTE, targetReg, targetReg); + } + else + { + assert(dstType == TYP_DOUBLE); + GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg); + } genDefineTempLabel(label); } else @@ -7447,7 +7449,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // Note that we need to specify dstType here so that it will determine // the size of destination integer register and also the rex.w prefix. genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType)); + instruction ins = ins_FloatConv(dstType, srcType); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); genProduceReg(treeNode); } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 775cbd69237810..2b0f34ebb49bb1 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -2414,49 +2414,57 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) // Arguments: // to - Destination type. // from - Source type. -// attr - Input size. // // Returns: // The correct conversion instruction to use based on src and dst types. 
 //
-instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
+instruction CodeGen::ins_FloatConv(var_types to, var_types from)
 {
-    // AVX: Supports following conversions
-    //   srcType = int16/int64                 castToType = float
-    // AVX512: Supports following conversions
-    //   srcType = ulong                       castToType = double/float
-
     switch (from)
     {
-        // int/long -> float/double use the same instruction but type size would be different.
         case TYP_INT:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_cvtsi2ss32;
+                case TYP_DOUBLE:
+                    return INS_cvtsi2sd32;
+                default:
+                    unreached();
+            }
+            break;
+
         case TYP_LONG:
             switch (to)
             {
                 case TYP_FLOAT:
-                {
-                    if (EA_SIZE(attr) == EA_4BYTE)
-                    {
-                        return INS_cvtsi2ss32;
-                    }
-                    else if (EA_SIZE(attr) == EA_8BYTE)
-                    {
-                        return INS_cvtsi2ss64;
-                    }
+                    return INS_cvtsi2ss64;
+                case TYP_DOUBLE:
+                    return INS_cvtsi2sd64;
+                default:
                     unreached();
-                }
+            }
+            break;
+
+        case TYP_UINT:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_vcvtusi2ss32;
                 case TYP_DOUBLE:
-                {
-                    if (EA_SIZE(attr) == EA_4BYTE)
-                    {
-                        return INS_cvtsi2sd32;
-                    }
-                    else if (EA_SIZE(attr) == EA_8BYTE)
-                    {
-                        return INS_cvtsi2sd64;
-                    }
+                    return INS_vcvtusi2sd32;
+                default:
                     unreached();
-                }
+            }
+            break;
+
+        case TYP_ULONG:
+            switch (to)
+            {
+                case TYP_FLOAT:
+                    return INS_vcvtusi2ss64;
+                case TYP_DOUBLE:
+                    return INS_vcvtusi2sd64;
                 default:
                     unreached();
             }
@@ -2502,17 +2510,6 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr)
             }
             break;
 
-        case TYP_ULONG:
-            switch (to)
-            {
-                case TYP_DOUBLE:
-                    return INS_vcvtusi2sd64;
-                case TYP_FLOAT:
-                    return INS_vcvtusi2ss64;
-                default:
-                    unreached();
-            }
-
         default:
             unreached();
     }
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index c80d19e9fdd4f2..1778695a6ad529 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -843,26 +843,20 @@ GenTree* Lowering::LowerCast(GenTree* tree)
         srcType = varTypeToUnsigned(srcType);
     }
 
-    // We should never see the following casts as they are expected to be lowered
-    // appropriately or converted into helper calls by front-end.
+    // We should not see the following casts unless directly supported by hardware,
+    // as they are expected to be lowered appropriately or converted into helper calls by front-end.
     // srcType = float/double                   castToType = * and overflow detecting cast
     // Reason: must be converted to a helper call
     // srcType = float/double,                  castToType = ulong
     // Reason: must be converted to a helper call
     // srcType = uint                           castToType = float/double
     // Reason: uint -> float/double = uint -> long -> float/double
-    // srcType = ulong                          castToType = float
-    // Reason: ulong -> float = ulong -> double -> float
     if (varTypeIsFloating(srcType))
     {
         noway_assert(!tree->gtOverflow());
         assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
     }
     else if (srcType == TYP_UINT)
-    {
-        noway_assert(!varTypeIsFloating(castToType));
-    }
-    else if (srcType == TYP_ULONG)
     {
         assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
     }
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 5a1bd13cddd09d..07c0705941d5fc 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -2805,7 +2805,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
     }
     else
     {
-        // Currently dstCount = 2 is only used for DivRem, which has special constriants and handled above
+        // Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
        assert((dstCount == 0) ||
                ((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
     }
 
@@ -2828,14 +2828,13 @@ int LinearScan::BuildCast(GenTreeCast* cast)
 {
     GenTree* src = cast->gtGetOp1();
 
-    const var_types srcType  = genActualType(src->TypeGet());
+    const var_types srcType  = src->TypeGet();
     const var_types castType = cast->gtCastType;
 
-    if ((srcType == TYP_LONG) && (castType == TYP_DOUBLE) &&
-        !compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+    if (cast->IsUnsigned() && varTypeIsLong(srcType) && varTypeIsFloating(castType) && !compiler->canUseEvexEncoding())
     {
-        // We need two extra temp regs for LONG->DOUBLE cast
-        // if we don't have AVX512F available.
+        // We need two extra temp regs for ULONG->DOUBLE/FLOAT cast
+        // if we don't have EVEX unsigned conversions available.
         buildInternalIntRegisterDefForNode(cast);
         buildInternalIntRegisterDefForNode(cast);
     }
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 8b2269bbd677de..7e38d7b78f8db6 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -290,38 +290,6 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
     var_types dstType = tree->CastToType();
     unsigned  dstSize = genTypeSize(dstType);
 
-#if defined(TARGET_AMD64)
-    // If AVX512 is present, we have intrinsic available to convert
-    // ulong directly to float. Hence, we need to combine the 2 nodes
-    // GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT) into a single
-    // node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT). At this point, we already
-    // have the 2 GT_CAST nodes in the tree and we are combining them below.
-    if (oper->OperIs(GT_CAST))
-    {
-        GenTreeCast* innerCast = oper->AsCast();
-
-        if (innerCast->IsUnsigned())
-        {
-            GenTree*  innerOper    = innerCast->CastOp();
-            var_types innerSrcType = genActualType(innerOper);
-            var_types innerDstType = innerCast->CastToType();
-            unsigned  innerDstSize = genTypeSize(innerDstType);
-            innerSrcType           = varTypeToUnsigned(innerSrcType);
-
-            // Check if we are going from ulong->double->float
-            if ((innerSrcType == TYP_ULONG) && (innerDstType == TYP_DOUBLE) && (dstType == TYP_FLOAT))
-            {
-                if (canUseEvexEncoding())
-                {
-                    // One optimized (combined) cast here
-                    tree = gtNewCastNode(TYP_FLOAT, innerOper, true, TYP_FLOAT);
-                    return fgMorphTree(tree);
-                }
-            }
-        }
-    }
-#endif // TARGET_AMD64
-
     // See if the cast has to be done in two steps.  R -> I
     if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType))
     {
@@ -442,16 +410,23 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         }
 #endif //! TARGET_64BIT
 
-#ifdef TARGET_ARMARCH
-    // AArch, unlike x86/amd64, has instructions that can cast directly from
-    // all integers (except for longs on AArch32 of course) to floats.
+#if defined(TARGET_ARMARCH) || defined(TARGET_XARCH)
     // Because there is no IL instruction conv.r4.un, uint/ulong -> float
     // casts are always imported as CAST(float <- CAST(double <- uint/ulong)).
-    // We can eliminate the redundant intermediate cast as an optimization.
+    // We can usually eliminate the redundant intermediate cast as an optimization.
+    // AArch and xarch+EVEX have instructions that can cast directly from
+    // all integers (except for longs on 32-bit of course) to floats.
+    // On x64, we also have the option of widening uint -> long and
+    // using the signed conversion instructions, and ulong -> float/double
+    // is handled directly in codegen, so we can allow all casts,
+    // regardless of hardware support.
     else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST)
-#ifdef TARGET_ARM
-             && !varTypeIsLong(oper->AsCast()->CastOp())
-#endif
+#ifndef TARGET_64BIT
+             && !varTypeIsLong(oper->AsCast()->CastOp())
+#endif // TARGET_64BIT
+#ifdef TARGET_X86
+             && canUseEvexEncoding()
+#endif // TARGET_X86
     )
     {
         oper->gtType = TYP_FLOAT;
@@ -459,7 +434,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
 
         return fgMorphTree(oper);
     }
-#endif // TARGET_ARMARCH
+#endif // TARGET_ARMARCH || TARGET_XARCH
 
 #ifdef TARGET_ARM
     // converts long/ulong --> float/double casts into helper calls.
@@ -485,35 +460,15 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
 #endif // TARGET_ARM
 
 #ifdef TARGET_AMD64
-    // Do we have to do two step U4/8 -> R4/8 ?
-    // Codegen supports the following conversion as one-step operation
-    //   a) Long -> R4/R8
-    //   b) U8 -> R8
-    //
-    // The following conversions are performed as two-step operations using above.
-    //   U4 -> R4/8 = U4 -> Long -> R4/8
-    //   U8 -> R4   = U8 -> R8 -> R4
+    // Do we have to do two step U4 -> R4/8 ?
+    // If we don't have the EVEX unsigned conversion instructions available,
+    // we will widen to long and use signed conversion: U4 -> Long -> R4/8.
+    // U8 -> R4/R8 is handled directly in codegen, so we ignore it here.
     else if (tree->IsUnsigned() && varTypeIsFloating(dstType))
     {
         srcType = varTypeToUnsigned(srcType);
 
-        if (srcType == TYP_ULONG && !canUseEvexEncoding())
-        {
-            if (dstType == TYP_FLOAT)
-            {
-                // Codegen can handle U8 -> R8 conversion.
-                // U8 -> R4 = U8 -> R8 -> R4
-                // - change the dsttype to double
-                // - insert a cast from double to float
-                // - recurse into the resulting tree
-                tree->CastToType() = TYP_DOUBLE;
-                tree->gtType       = TYP_DOUBLE;
-                tree               = gtNewCastNode(TYP_FLOAT, tree, false, TYP_FLOAT);
-
-                return fgMorphTree(tree);
-            }
-        }
-        else if (srcType == TYP_UINT)
+        if (srcType == TYP_UINT && !canUseEvexEncoding())
         {
             oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG);
             oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
@@ -533,7 +488,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         {
             return fgMorphCastIntoHelper(tree, CORINFO_HELP_ULNG2DBL, oper);
         }
-        else if (srcType == TYP_UINT)
+        else if (srcType == TYP_UINT && !canUseEvexEncoding())
         {
             oper = gtNewCastNode(TYP_LONG, oper, true, TYP_LONG);
             oper->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT));
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs
index 5cc1b75d42fda1..ed1b5cd56ace7a 100644
--- a/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_106338/Runtime_106338.cs
@@ -22,7 +22,7 @@ public static void TestEntryPoint()
         float vr11 = 4294967295U | vr10;
         uint result = BitConverter.SingleToUInt32Bits(vr11);
 
-        if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || ((RuntimeInformation.ProcessArchitecture == Architecture.X64) && Avx512F.IsSupported))
+        if ((RuntimeInformation.ProcessArchitecture == Architecture.Arm64) || (RuntimeInformation.ProcessArchitecture == Architecture.X64))
         {
             // Expected to cast ulong -> float directly
             Assert.Equal(1600094603U, result);

From 77b15f0d603de4f74f607237968ed22e933a906c Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Wed, 22 Jan 2025 16:39:12 -0800
Subject: [PATCH 2/3] more cleanup

---
 src/coreclr/jit/codegenxarch.cpp | 30 +++++++++++-------------------
 src/coreclr/jit/lowerxarch.cpp   | 25 +++++++------------------
 src/coreclr/jit/lsraxarch.cpp    |  2 +-
 src/coreclr/jit/morph.cpp        | 13 ++++++-------
 4 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 60019544e95694..55dbcd58eb866d 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7247,7 +7247,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 {
     // int type --> float/double conversions are always non-overflow ones
-    assert(treeNode->OperGet() == GT_CAST);
+    assert(treeNode->OperIs(GT_CAST));
     assert(!treeNode->gtOverflow());
 
     regNumber targetReg = treeNode->GetRegNum();
@@ -7278,7 +7278,7 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     // operation.
     if (srcType == TYP_BYREF)
     {
-        noway_assert(op1->OperGet() == GT_LCL_ADDR);
+        noway_assert(op1->OperIs(GT_LCL_ADDR));
         srcType = TYP_I_IMPL;
     }
 
@@ -7298,10 +7298,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
     noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
 
-    // Also we don't expect to see uint32 -> float/double here unless the EVEX unsigned conversion
-    // instructions are available, since they should have been lowered appropriately.
-    assert((srcType != TYP_UINT) || compiler->canUseEvexEncodingDebugOnly());
-
     // To convert an integral type to floating point, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
     // upper bytes unmodified. If "cvt[u]si2ss/sd xmmReg, r32/r64" occurs inside a loop,
@@ -7312,23 +7308,13 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     // cvt[u]si2ss/sd instruction.
 
     genConsumeOperands(treeNode->AsOp());
-    GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, treeNode->GetRegNum(), treeNode->GetRegNum(),
-                                     treeNode->GetRegNum(), INS_OPTS_NONE);
+    GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg, INS_OPTS_NONE);
 
     // Note that here we need to specify srcType that will determine
     // the size of source reg/mem operand and rex.w prefix.
     instruction ins = ins_FloatConv(dstType, srcType);
 
-    // integral to floating-point conversions all have RMW semantics if VEX support
-    // is not available
-
-    const bool isRMW = !compiler->canUseVexEncoding();
-
-    if (varTypeIsUnsigned(srcType) && compiler->canUseEvexEncoding())
-    {
-        GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1);
-    }
-    else if (srcType == TYP_ULONG)
+    if (srcType == TYP_ULONG && !compiler->canUseEvexEncoding())
     {
         assert(op1->isUsedFromReg());
 
@@ -7381,6 +7367,12 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     }
     else
     {
+        assert(varTypeIsIntOrI(srcType) || compiler->canUseEvexEncodingDebugOnly());
+
+        // integral to floating-point conversions all have RMW semantics if VEX support
+        // is not available
+
+        const bool isRMW = !compiler->canUseVexEncoding();
         inst_RV_RV_TT(ins, emitTypeSize(srcType), targetReg, targetReg, op1, isRMW, INS_OPTS_NONE);
     }
     genProduceReg(treeNode);
@@ -7406,7 +7398,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
 {
     // we don't expect to see overflow detecting float/double --> int type conversions here
     // as they should have been converted into helper calls by front-end.
-    assert(treeNode->OperGet() == GT_CAST);
+    assert(treeNode->OperIs(GT_CAST));
     assert(!treeNode->gtOverflow());
 
     regNumber targetReg = treeNode->GetRegNum();
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 1778695a6ad529..50f9d641bcf745 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -800,10 +800,9 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
 *  GT_CAST(int16,  float/double)   =   GT_CAST(GT_CAST(int16, int32), float/double)
 *  GT_CAST(uint16, float/double)   =   GT_CAST(GT_CAST(uint16, int32), float/double)
 *
-* SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64
+* Unless the EVEX conversion instructions are available, casts from Uint32
 * are morphed as follows by front-end and hence should not be seen here.
 *  GT_CAST(uint32, float/double)   =   GT_CAST(GT_CAST(uint32, long), float/double)
-*  GT_CAST(uint64, float)          =   GT_CAST(GT_CAST(uint64, double), float)
 *
 *
 * Similarly casts from float/double to a smaller int type are transformed as follows:
 *  GT_CAST(float/double, byte)     =   GT_CAST(GT_CAST(float/double, int32), byte)
 *  GT_CAST(float/double, sbyte)    =   GT_CAST(GT_CAST(float/double, int32), sbyte)
 *  GT_CAST(float/double, int16)    =   GT_CAST(GT_CAST(double/double, int32), int16)
 *  GT_CAST(float/double, uint16)   =   GT_CAST(GT_CAST(double/double, int32), uint16)
 *
-* SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit
-* integer. The above transformations help us to leverage those instructions.
-*
 * Note that for the following conversions we still depend on helper calls and
 * don't expect to see them here.
-*  i) GT_CAST(float/double, uint64)
+*  i) GT_CAST(float/double, uint64) when EVEX is not available
 * ii) GT_CAST(float/double, int type with overflow detection)
-*
-* TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above.
-* There are hardly any occurrences of this conversion operation in platform
-* assemblies or in CQ perf benchmarks (1 occurrence in corelib, microsoft.jscript,
-* 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics
-* system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that
-* doing this optimization is a win, should consider generating in-lined code.
 */
 GenTree* Lowering::LowerCast(GenTree* tree)
 {
-    assert(tree->OperGet() == GT_CAST);
+    assert(tree->OperIs(GT_CAST));
 
     GenTree*  castOp     = tree->AsCast()->CastOp();
     var_types castToType = tree->CastToType();
@@ -838,7 +827,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
     var_types tmpType    = TYP_UNDEF;
 
     // force the srcType to unsigned if GT_UNSIGNED flag is set
-    if (tree->gtFlags & GTF_UNSIGNED)
+    if (tree->IsUnsigned())
     {
         srcType = varTypeToUnsigned(srcType);
     }
@@ -854,11 +843,11 @@ GenTree* Lowering::LowerCast(GenTree* tree)
     if (varTypeIsFloating(srcType))
     {
         noway_assert(!tree->gtOverflow());
-        assert(castToType != TYP_ULONG || comp->canUseEvexEncoding());
+        assert(castToType != TYP_ULONG || comp->canUseEvexEncodingDebugOnly());
     }
     else if (srcType == TYP_UINT)
     {
-        assert(castToType != TYP_FLOAT || comp->canUseEvexEncoding());
+        assert(castToType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly());
     }
 
 #if defined(TARGET_AMD64)
@@ -978,7 +967,7 @@ GenTree* Lowering::LowerCast(GenTree* tree)
             BlockRange().InsertAfter(newCast, newTree);
             LowerNode(newTree);
 
-            // usage 2 --> use thecompared mask with input value and max value to blend
+            // usage 2 --> use the compared mask with input value and max value to blend
             GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
             BlockRange().InsertAfter(newTree, control);
             GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree,
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 07c0705941d5fc..98129f9016cc10 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -3089,7 +3089,7 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/)
 
         if (sizeOfSIMDVector >= 32)
         {
-            assert((sizeOfSIMDVector == 32) || ((sizeOfSIMDVector == 64) && compiler->canUseEvexEncoding()));
+            assert((sizeOfSIMDVector == 32) || ((sizeOfSIMDVector == 64) && compiler->canUseEvexEncodingDebugOnly()));
             compiler->GetEmitter()->SetContains256bitOrMoreAVX(true);
         }
     }
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 7e38d7b78f8db6..33614d332c9165 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -418,12 +418,11 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
     // all integers (except for longs on 32-bit of course) to floats.
     // On x64, we also have the option of widening uint -> long and
     // using the signed conversion instructions, and ulong -> float/double
-    // is handled directly in codegen, so we can allow all casts,
-    // regardless of hardware support.
+    // is handled directly in codegen, so we can allow all casts.
     else if ((dstType == TYP_FLOAT) && (srcType == TYP_DOUBLE) && oper->OperIs(GT_CAST)
 #ifndef TARGET_64BIT
             && !varTypeIsLong(oper->AsCast()->CastOp())
-#endif // TARGET_64BIT
+#endif // !TARGET_64BIT
 #ifdef TARGET_X86
             && canUseEvexEncoding()
 #endif // TARGET_X86
     )
@@ -496,7 +495,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
             return fgMorphCastIntoHelper(tree, CORINFO_HELP_LNG2DBL, oper);
         }
     }
-    else if (((tree->gtFlags & GTF_UNSIGNED) == 0) && (srcType == TYP_LONG) && varTypeIsFloating(dstType))
+    else if (!tree->IsUnsigned() && (srcType == TYP_LONG) && varTypeIsFloating(dstType))
     {
         oper = fgMorphCastIntoHelper(tree, CORINFO_HELP_LNG2DBL, oper);
 
         // Since long to float conversion is expensive and there is no direct helper call
         // we just use the one that converts to a TYP_DOUBLE
         // and then add a cast to TYP_FLOAT
         //
-        if ((dstType == TYP_FLOAT) && (oper->OperGet() == GT_CALL))
+        if ((dstType == TYP_FLOAT) && oper->OperIs(GT_CALL))
         {
             // Fix the return type to be TYP_DOUBLE
             //
@@ -563,7 +562,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
         // the result of an AND is bound by its smaller operand, it may be
         // possible to prove that the cast won't overflow, which will in turn
         // allow the cast's operand to be transformed.
-        if (tree->gtOverflow() && (oper->OperGet() == GT_AND))
+        if (tree->gtOverflow() && oper->OperIs(GT_AND))
         {
             GenTree* andOp2 = oper->AsOp()->gtOp2;
 
@@ -571,7 +570,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
             // than 2^{31} for a cast to int.
             int maxWidth = (dstType == TYP_UINT) ? 32 : 31;
 
-            if ((andOp2->OperGet() == GT_CNS_NATIVELONG) && ((andOp2->AsIntConCommon()->LngValue() >> maxWidth) == 0))
+            if (andOp2->OperIs(GT_CNS_NATIVELONG) && ((andOp2->AsIntConCommon()->LngValue() >> maxWidth) == 0))
             {
                 tree->ClearOverflow();
                 tree->SetAllEffectsFlags(oper);

From 4724cf601a481add0e40acdec3911c39b9b967b7 Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Mon, 27 Jan 2025 11:53:21 -0800
Subject: [PATCH 3/3] more cleanup

---
 src/coreclr/jit/codegenxarch.cpp | 50 +++++++++++---------------------
 1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 9e76594818c954..5291ac6e131bc5 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7264,11 +7264,6 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
     var_types srcType = op1->TypeGet();
     assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType));
 
-#if !defined(TARGET_64BIT)
-    // We expect morph to replace long to float/double casts with helper calls
-    noway_assert(!varTypeIsLong(srcType));
-#endif // !defined(TARGET_64BIT)
-
     // Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we
     // ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except
     // for GT_LCL_ADDR that represent stack addresses and can be considered as TYP_I_IMPL. In all other
@@ -7281,21 +7276,14 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
         srcType = TYP_I_IMPL;
     }
 
-    // force the srcType to unsigned if GT_UNSIGNED flag is set
-    if (treeNode->gtFlags & GTF_UNSIGNED)
-    {
-        srcType = varTypeToUnsigned(srcType);
-    }
-
-    noway_assert(!varTypeIsGC(srcType));
-
-    // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long).
+    // At this point, srcType should only be int or long.
     // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect
     // either the front-end or lowering phase to have generated two levels of cast.
     // The first one is for widening smaller int type to int32 and the second one is
     // to the float/double.
-    emitAttr srcSize = EA_ATTR(genTypeSize(srcType));
-    noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG))));
+    // On 32-bit, we expect morph to replace long to float/double casts with helper calls,
+    // so we should only see int here.
+    noway_assert(varTypeIsIntOrI(srcType));
 
     // To convert an integral type to floating point, the cvt[u]si2ss/sd instruction is used
     // which does a partial write to lower 4/8 bytes of xmm register keeping the other
@@ -7309,19 +7297,21 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
 
     genConsumeOperands(treeNode->AsOp());
     GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg, INS_OPTS_NONE);
 
-    // Note that here we need to specify srcType that will determine
-    // the size of source reg/mem operand and rex.w prefix.
-    instruction ins = ins_FloatConv(dstType, srcType);
+    // force the srcType to unsigned if GT_UNSIGNED flag is set
+    if (treeNode->IsUnsigned())
+    {
+        srcType = varTypeToUnsigned(srcType);
+    }
 
     if (srcType == TYP_ULONG && !compiler->canUseEvexEncoding())
     {
         assert(op1->isUsedFromReg());
 
-        // We will have selected an unsigned conversion instruction above, but we
-        // can't use it here because we have no EVEX support. Switch to the signed
-        // equivalent, and add code to fix up the result.
+        // If we don't have the EVEX unsigned conversion instructions, use the signed
+        // long -> float/double conversion instruction instead and fix up the result.
 
-        ins = ins_FloatConv(dstType, genActualType(srcType));
+        instruction convIns = ins_FloatConv(dstType, TYP_LONG);
+        instruction addIns  = (dstType == TYP_FLOAT) ? INS_addss : INS_addsd;
 
         // The following LONG->DOUBLE cast machinery is based on clang's implementation
         // with "-ffp-model=strict" flag:
         //
@@ -7349,25 +7339,19 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode)
         GetEmitter()->emitIns_R_R(INS_or, EA_8BYTE, tmpReg2, tmpReg1);
         GetEmitter()->emitIns_R_R(INS_test, EA_8BYTE, argReg, argReg);
         GetEmitter()->emitIns_R_R(INS_cmovns, EA_8BYTE, tmpReg2, argReg);
-        GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, tmpReg2);
+        GetEmitter()->emitIns_R_R(convIns, EA_8BYTE, targetReg, tmpReg2);
 
         BasicBlock* label = genCreateTempLabel();
         inst_JMP(EJ_jns, label);
-        if (dstType == TYP_FLOAT)
-        {
-            GetEmitter()->emitIns_R_R(INS_addss, EA_4BYTE, targetReg, targetReg);
-        }
-        else
-        {
-            assert(dstType == TYP_DOUBLE);
-            GetEmitter()->emitIns_R_R(INS_addsd, EA_8BYTE, targetReg, targetReg);
-        }
+        GetEmitter()->emitIns_R_R(addIns, EA_ATTR(genTypeSize(dstType)), targetReg, targetReg);
         genDefineTempLabel(label);
     }
     else
     {
         assert(varTypeIsIntOrI(srcType) || compiler->canUseEvexEncodingDebugOnly());
 
+        instruction ins = ins_FloatConv(dstType, srcType);
+
         // integral to floating-point conversions all have RMW semantics if VEX support
         // is not available
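
Editor's note (appended for review; not part of the patches above): the no-EVEX
ULONG -> DOUBLE/FLOAT fallback that patch 3 emits (shr/and/or, test/cmovns,
cvtsi2sd/ss, jns, addsd/ss) is easier to follow in scalar form. Below is a minimal
standalone C++ sketch of that round-to-odd technique, under the assumption that only
the signed conversion is available. The helper name UInt64ToDouble is hypothetical
and exists only for illustration.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical scalar model of the JIT's no-EVEX ulong -> double fixup.
    static double UInt64ToDouble(uint64_t v)
    {
        if ((int64_t)v >= 0)
        {
            // Sign bit clear: the signed conversion already rounds correctly.
            // This is the cmovns path; the jns then skips the add.
            return (double)(int64_t)v;
        }

        // Sign bit set: halve the value, OR-ing the dropped low bit back into
        // bit 0 ("round to odd") so it still acts as a sticky bit for the
        // conversion's rounding, convert with the signed instruction, then
        // double the result (the final addsd/addss, which is exact).
        uint64_t half = (v >> 1) | (v & 1);
        double   d    = (double)(int64_t)half;
        return d + d;
    }

    int main()
    {
        // 2^64 - 1 converts to 1.8446744073709552e+19
        printf("%.17g\n", UInt64ToDouble(0xFFFFFFFFFFFFFFFFull));
        return 0;
    }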