diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 86e89956002121..23c79087d4f8c6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -7395,7 +7395,19 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions // here since they should have been lowered appropriately. noway_assert(srcType != TYP_UINT); - noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); + assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) || + compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + + if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) && + compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + genConsumeOperands(treeNode->AsOp()); + instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType)); + GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + genProduceReg(treeNode); + return; + } // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used // which does a partial write to lower 4/8 bytes of xmm register keeping the other @@ -7509,7 +7521,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // We shouldn't be seeing uint64 here as it should have been converted // into a helper call by either front-end or lowering phase. - noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); + assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); // If the dstType is TYP_UINT, we have 32-bits to encode the // float number. Any of 33rd or above bits can be the sign bit. 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 3755002297c1f6..3d6aa46842621c 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1487,7 +1487,6 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const case INS_vcvtsd2usi: case INS_vcvtss2usi: case INS_vcvttsd2usi: - case INS_vcvttss2usi: { if (attr == EA_8BYTE) { @@ -2692,7 +2691,8 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) case INS_vcvtsd2usi: case INS_vcvtss2usi: case INS_vcvttsd2usi: - case INS_vcvttss2usi: + case INS_vcvttss2usi32: + case INS_vcvttss2usi64: { // These SSE instructions write to a general purpose integer register. return false; @@ -11479,12 +11479,18 @@ void emitter::emitDispIns( case INS_vcvtsd2usi: case INS_vcvtss2usi: case INS_vcvttsd2usi: - case INS_vcvttss2usi: { printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE)); break; } + case INS_vcvttss2usi32: + case INS_vcvttss2usi64: + { + printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_4BYTE)); + break; + } + #ifdef TARGET_AMD64 case INS_movsxd: { @@ -18743,23 +18749,32 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cvtsi2sd64: case INS_cvtsi2ss64: case INS_vcvtsd2usi: - case INS_vcvttsd2usi: - case INS_vcvtusi2sd32: - case INS_vcvtusi2sd64: case INS_vcvtusi2ss32: case INS_vcvtusi2ss64: + case INS_vcvttsd2usi: + case INS_vcvttss2usi32: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_7C; break; + case INS_vcvtusi2sd64: + case INS_vcvtusi2sd32: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency += PERFSCORE_LATENCY_5C; + break; + case INS_cvttss2si: case INS_cvtss2si: case INS_vcvtss2usi: - case INS_vcvttss2usi: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += opSize == EA_8BYTE ? 
PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C; break; + case INS_vcvttss2usi64: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency += PERFSCORE_LATENCY_8C; + break; + case INS_cvtss2sd: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_5C; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 097dafbc0d67f3..1d60e326384aa1 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -852,7 +852,7 @@ HARDWARE_INTRINSIC(AVX512F, CompareUnordered, HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi32, INS_vcvttsd2usi}, HW_Category_SIMDScalar, 
HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Byte, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128ByteWithSaturation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Int16, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqw, INS_vpmovqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1009,7 +1009,7 @@ HARDWARE_INTRINSIC(AVX512F_VL, TernaryLogic, HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi64, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 8b3dd97964756c..0205217a3fcbb3 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -2304,7 +2304,10 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) // instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) { - // AVX: For now we support only conversion from Int/Long -> float + // AVX: Supports following conversions + // srcType = int16/int64 castToType = float + // AVX512: Supports following conversions + // srcType = ulong castToType = double/float switch (from) { @@ -2374,6 +2377,17 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) } break; + case TYP_ULONG: + switch (to) + { + case TYP_DOUBLE: + return INS_vcvtusi2sd64; + case TYP_FLOAT: + return INS_vcvtusi2ss64; + default: + unreached(); + } + default: unreached(); } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index bd905bb8db71d3..1891e5fc6da097 100644 
--- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -641,7 +641,8 @@ INST3(vcvtss2usi, "cvtss2usi", IUM_WR, BAD_CODE, BAD_ INST3(vcvttpd2udq, "cvttpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned DWORDs INST3(vcvttps2udq, "cvttps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned DWORDs INST3(vcvttsd2usi, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned DWORD/QWORD -INST3(vcvttss2usi, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD +INST3(vcvttss2usi32, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD +INST3(vcvttss2usi64, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned QWORD INST3(vcvtudq2pd, "cvtudq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to doubles INST3(vcvtudq2ps, "cvtudq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to singles INST3(vcvtusi2sd32, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to double diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 1096f473680763..c7328a7d4c0023 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ 
b/src/coreclr/jit/lowerxarch.cpp @@ -811,7 +811,7 @@ void Lowering::LowerCast(GenTree* tree) } else if (srcType == TYP_ULONG) { - noway_assert(castToType != TYP_FLOAT); + assert(castToType != TYP_FLOAT || comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); } // Case of src is a small type and dst is a floating point type. diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index d2a286508408ef..3babf06e541618 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -293,6 +293,38 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) var_types dstType = tree->CastToType(); unsigned dstSize = genTypeSize(dstType); +#if defined(TARGET_AMD64) + // If AVX512 is present, we have intrinsic available to convert + // ulong directly to float. Hence, we need to combine the 2 nodes + // GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT) into a single + // node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT). At this point, we already + // have the 2 GT_CAST nodes in the tree and we are combining them below. + if (oper->OperIs(GT_CAST)) + { + GenTreeCast* innerCast = oper->AsCast(); + + if (innerCast->IsUnsigned()) + { + GenTree* innerOper = innerCast->CastOp(); + var_types innerSrcType = genActualType(innerOper); + var_types innerDstType = innerCast->CastToType(); + unsigned innerDstSize = genTypeSize(innerDstType); + innerSrcType = varTypeToUnsigned(innerSrcType); + + // Check if we are going from ulong->double->float + if ((innerSrcType == TYP_ULONG) && (innerDstType == TYP_DOUBLE) && (dstType == TYP_FLOAT)) + { + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + // One optimized (combined) cast here + tree = gtNewCastNode(TYP_FLOAT, innerOper, true, TYP_FLOAT); + return fgMorphTree(tree); + } + } + } + } +#endif // TARGET_AMD64 + // See if the cast has to be done in two steps. 
R -> I if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType)) { @@ -449,7 +481,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) { srcType = varTypeToUnsigned(srcType); - if (srcType == TYP_ULONG) + if (srcType == TYP_ULONG && !compOpportunisticallyDependsOn(InstructionSet_AVX512F)) { if (dstType == TYP_FLOAT) {