From 23540797a4feb3af61956193b38319c60571e423 Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Thu, 23 Jan 2025 20:31:38 -0800
Subject: [PATCH 1/2] enable containment for rorx/sarx/shlx/shrx

---
 src/coreclr/jit/codegenxarch.cpp | 70 +++++++++++++--------------
 src/coreclr/jit/emitxarch.cpp    | 81 +++++++++++++++++++-------------
 src/coreclr/jit/instrsxarch.h    |  2 -
 src/coreclr/jit/lowerxarch.cpp   | 27 +++++++++--
 src/coreclr/jit/lsraxarch.cpp    | 15 +++---
 5 files changed, 115 insertions(+), 80 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index b3090f94e2c4f9..b2d12bc8a6c96c 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -4774,17 +4774,16 @@ instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type)
 //    tree - the bit shift node (that specifies the type of bit shift to perform).
 //
 // Assumptions:
-//    a) All GenTrees are register allocated.
-//    b) The shift-by-amount in tree->AsOp()->gtOp2 is either a contained constant or
-//       it's a register-allocated expression. If it is in a register that is
-//       not RCX, it will be moved to RCX (so RCX better not be in use!).
+//    The shift-by-amount in tree->AsOp()->gtOp2 is either a contained constant or it's a
+//    register-allocated expression. If BMI2 instructions are not used and op2 is in a register
+//    that is not RCX, it will be moved to RCX (so RCX better not be in use!).
 //
 void CodeGen::genCodeForShift(GenTree* tree)
 {
     // Only the non-RMW case here.
     assert(tree->OperIsShiftOrRotate());
-    assert(tree->AsOp()->gtOp1->isUsedFromReg());
     assert(tree->GetRegNum() != REG_NA);
+    assert(tree->AsOp()->gtOp1->isUsedFromReg() || compiler->compIsaSupportedDebugOnly(InstructionSet_BMI2));
 
     genConsumeOperands(tree->AsOp());
 
@@ -4795,12 +4794,13 @@ void CodeGen::genCodeForShift(GenTree* tree)
     regNumber operandReg = operand->GetRegNum();
 
     GenTree* shiftBy = tree->gtGetOp2();
+    emitAttr size = emitTypeSize(tree);
 
     if (shiftBy->isContainedIntOrIImmed())
     {
-        emitAttr size = emitTypeSize(tree);
+        assert(tree->OperIsRotate() || (operandReg != REG_NA));
 
-        bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags();
+        bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtSetFlags();
 
         // Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg"
         if (mightOptimizeLsh && shiftBy->IsIntegralConst(1))
@@ -4814,14 +4814,14 @@ void CodeGen::genCodeForShift(GenTree* tree)
                 GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0);
             }
         }
-        // Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will
-        // remove a 'mov'.
+        // Optimize "X<<2" to "lea [reg*4]"
+        // We only do this when the dst and src registers are different since it will remove a 'mov'.
         else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg)
         {
             GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0);
         }
-        // Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will
-        // remove a 'mov'.
+        // Optimize "X<<3" to "lea [reg*8]"
+        // We only do this when the dst and src registers are different since it will remove a 'mov'.
         else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg)
         {
             GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0);
@@ -4830,56 +4830,58 @@ void CodeGen::genCodeForShift(GenTree* tree)
         {
             int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
 
-#if defined(TARGET_64BIT)
-            // Try to emit rorx if BMI2 is available instead of mov+rol
-            // it makes sense only for 64bit integers
-            if ((genActualType(targetType) == TYP_LONG) && (tree->GetRegNum() != operandReg) &&
-                compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && tree->OperIs(GT_ROL, GT_ROR) &&
-                (shiftByValue > 0) && (shiftByValue < 64))
+            if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
             {
-                const int value = tree->OperIs(GT_ROL) ? (64 - shiftByValue) : shiftByValue;
-                GetEmitter()->emitIns_R_R_I(INS_rorx, size, tree->GetRegNum(), operandReg, value);
-                genProduceReg(tree);
-                return;
+                // If we have a contained source operand, we must emit rorx.
+                // We may also use rorx for 64bit values when a mov would otherwise be required,
+                // because rorx is smaller than mov+rol/ror when the REX prefix is included.
+
+                if ((operandReg == REG_NA) || (varTypeIsLong(targetType) && (tree->GetRegNum() != operandReg)))
+                {
+                    // There is no 'rolx', so for rol, we use rorx with the shift value corrected.
+                    if (tree->OperIs(GT_ROL))
+                    {
+                        shiftByValue &= (size * 8 - 1);
+                        shiftByValue = (size * 8 - shiftByValue);
+                    }
+
+                    inst_RV_TT_IV(INS_rorx, size, tree->GetRegNum(), operand, shiftByValue, INS_OPTS_NONE);
+                    genProduceReg(tree);
+                    return;
+                }
             }
-#endif
-            // First, move the operand to the destination register and
-            // later on perform the shift in-place.
+
+            // We were unable to optimize this to a single instruction, so emit mov+shift; the mov is skipped when the registers match.
             // (LSRA will try to avoid this situation through preferencing.)
             inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true);
             inst_RV_SH(ins, size, tree->GetRegNum(), shiftByValue);
         }
     }
-#if defined(TARGET_64BIT)
     else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
     {
-        // Try to emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr.
+        // With BMI2 available, emit shlx, sarx, shrx instead of mov+shl, mov+sar, mov+shr.
         switch (tree->OperGet())
         {
             case GT_LSH:
                 ins = INS_shlx;
                 break;
-
            case GT_RSH:
                 ins = INS_sarx;
                 break;
-
            case GT_RSZ:
                 ins = INS_shrx;
                 break;
-
            default:
                 unreached();
         }
 
-        regNumber shiftByReg = shiftBy->GetRegNum();
-        emitAttr size = emitTypeSize(tree);
-        // The order of operandReg and shiftByReg are swapped to follow shlx, sarx and shrx encoding spec.
-        GetEmitter()->emitIns_R_R_R(ins, size, tree->GetRegNum(), shiftByReg, operandReg);
+        // The order of operand and shiftBy is swapped to follow the shlx, sarx and shrx encoding spec.
+        inst_RV_RV_TT(ins, size, tree->GetRegNum(), shiftBy->GetRegNum(), operand, /*isRMW*/ false, INS_OPTS_NONE);
     }
-#endif
     else
     {
+        assert(operandReg != REG_NA);
+
         // We must have the number of bits to shift stored in ECX, since we constrained this node to
         // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
         // register destination requirement.
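The rol handling above is worth seeing in isolation: there is no 'rolx' encoding, so a left rotate by a constant is emitted as rorx with the count inverted. The sketch below mirrors the two shiftByValue lines in genCodeForShift; it is an illustration, not part of the patch, and RolToRorxImm/widthInBits are made-up names (widthInBits stands in for the patch's size * 8).

    // Compute the rorx immediate that implements "rol reg, n" for a given operand width.
    // rol by n is equivalent to ror by (width - n), after masking n to the operand width.
    int RolToRorxImm(int shiftByValue, int widthInBits)
    {
        shiftByValue &= (widthInBits - 1); // mask the count, as the hardware would
        return widthInBits - shiftByValue; // rol n == ror (width - n)
    }

For example, RolToRorxImm(5, 32) == 27, so a 32-bit "rol eax, 5" can instead be emitted as "rorx eax, src, 27", which also leaves the flags untouched.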
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 77f8787a3e2457..477aa56d13db2f 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1692,11 +1692,7 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
     switch (ins)
     {
         case INS_cvtss2si:
-        case INS_cvttss2si32:
-        case INS_cvttss2si64:
         case INS_cvtsd2si:
-        case INS_cvttsd2si32:
-        case INS_cvttsd2si64:
         case INS_movd:
         case INS_movnti:
         case INS_andn:
@@ -1709,11 +1705,9 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
         case INS_pdep:
         case INS_pext:
         case INS_rorx:
-#if defined(TARGET_AMD64)
         case INS_sarx:
         case INS_shlx:
         case INS_shrx:
-#endif // TARGET_AMD64
         case INS_vcvtsd2usi:
         case INS_vcvtss2usi:
         {
@@ -2378,32 +2372,25 @@ emitter::code_t emitter::emitExtractVexPrefix(instruction ins, code_t& code) con
         case INS_rorx:
         case INS_pdep:
         case INS_mulx:
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
         case INS_shrx:
-#endif
         {
             vexPrefix |= 0x03;
             break;
         }
 
         case INS_pext:
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
         case INS_sarx:
-#endif
         {
             vexPrefix |= 0x02;
             break;
         }
 
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
+
         case INS_shlx:
         {
             vexPrefix |= 0x01;
             break;
         }
-#endif
+
         default:
         {
             vexPrefix |= 0x00;
@@ -3099,11 +3086,9 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
         case INS_pextrw:
         case INS_pextrw_sse41:
         case INS_rorx:
-#ifdef TARGET_AMD64
         case INS_shlx:
         case INS_sarx:
         case INS_shrx:
-#endif
         case INS_vcvtsd2usi:
         case INS_vcvtss2usi:
         case INS_vcvttsd2usi32:
@@ -3352,10 +3337,8 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
     switch (ins)
     {
         case INS_crc32:
-#if defined(TARGET_AMD64)
         case INS_sarx:
         case INS_shrx:
-#endif // TARGET_AMD64
         {
             // When the prefix is 0x0F38 or 0x0F3A, we must use the 3-byte encoding
             // These are special cases where the pp-bit is 0xF2 or 0xF3 and not 0x66
@@ -7708,7 +7691,7 @@ void emitter::emitIns_R_R_C_R(instruction ins,
 }
 
 //------------------------------------------------------------------------
-// emitIns_R_R_R_S: emits the code for a instruction that takes a register operand, a variable index +
+// emitIns_R_R_S_R: emits the code for an instruction that takes a register operand, a variable index +
 // offset, another register operand, and that returns a value in register
 //
 // Arguments:
@@ -11854,6 +11837,19 @@ void emitter::emitDispIns(
         case IF_RRW_RRD_ARD:
         case IF_RWR_RWR_ARD:
         {
+            if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
+            {
+                // These instructions have their operands swapped to simplify the emitter implementation.
+                // They will appear here as IF_RWR_RRD_ARD but should actually
+                // display as if they were IF_RWR_ARD_RRD.
+
+                printf("%s", emitRegName(id->idReg1(), attr));
+                printf(", %s", sstr);
+                emitDispAddrMode(id);
+                printf(", %s", emitRegName(id->idReg2(), attr));
+                break;
+            }
+
             printf("%s", emitRegName(id->idReg1(), attr));
             emitDispEmbMasking(id);
             printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr);
@@ -12151,6 +12147,20 @@ void emitter::emitDispIns(
         case IF_RRW_RRD_SRD:
         case IF_RWR_RWR_SRD:
         {
+            if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
+            {
+                // These instructions have their operands swapped to simplify the emitter implementation.
+                // They will appear here as IF_RWR_RRD_SRD but should actually
+                // display as if they were IF_RWR_SRD_RRD.
+
+            printf("%s", emitRegName(id->idReg1(), attr));
+            printf(", %s", sstr);
+            emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(),
+                             id->idDebugOnlyInfo()->idVarRefOffs, asmfm);
+            printf(", %s", emitRegName(id->idReg2(), attr));
+            break;
+        }
+
             printf("%s", emitRegName(id->idReg1(), attr));
             emitDispEmbMasking(id);
             printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr);
@@ -12355,17 +12365,12 @@ void emitter::emitDispIns(
             regNumber reg2 = id->idReg2();
             regNumber reg3 = id->idReg3();
 
-            if (ins == INS_bextr || ins == INS_bzhi
-#ifdef TARGET_AMD64
-                || ins == INS_shrx || ins == INS_shlx || ins == INS_sarx
-#endif
-                )
+            if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
             {
-                // BMI bextr,bzhi, shrx, shlx and sarx encode the reg2 in VEX.vvvv and reg3 in modRM,
-                // which is different from most of other instructions
-                regNumber tmp = reg2;
-                reg2 = reg3;
-                reg3 = tmp;
+                // These instructions have their operands swapped to simplify the emitter implementation.
+                // They encode reg3 in VEX.vvvv and reg2 in modRM, which is the opposite of most instructions.
+                // We swap them back here so they will display in the correct order.
+                std::swap(reg2, reg3);
             }
 
             emitAttr attr3 = attr;
@@ -12699,6 +12704,20 @@ void emitter::emitDispIns(
         case IF_RRW_RRD_MRD:
         case IF_RWR_RWR_MRD:
         {
+            if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
+            {
+                // These instructions have their operands swapped to simplify the emitter implementation.
+                // They will appear here as IF_RWR_RRD_MRD but should actually
+                // display as if they were IF_RWR_MRD_RRD.
+
+                printf("%s", emitRegName(id->idReg1(), attr));
+                printf(", %s", sstr);
+                offs = emitGetInsDsp(id);
+                emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC);
+                printf(", %s", emitRegName(id->idReg2(), attr));
+                break;
+            }
+
             printf("%s", emitRegName(id->idReg1(), attr));
             emitDispEmbMasking(id);
             printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr);
@@ -20541,7 +20560,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
         }
 
-#ifdef TARGET_AMD64
         case INS_shlx:
         case INS_sarx:
         case INS_shrx:
@@ -20550,7 +20568,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         {
             result.insThroughput = PERFSCORE_THROUGHPUT_2X;
             break;
         }
-#endif
 
         case INS_vpmovb2m:
         case INS_vpmovw2m:
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index 3c6285405a0747..41944f3a42e5ce 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -600,11 +600,9 @@ INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE,
 INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit
 INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract
 INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, REX_WX | Encoding_VEX)
-#ifdef TARGET_AMD64
 INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags
 INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags
 INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags
-#endif
 
 INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index c80d19e9fdd4f2..28ea4ead1bf14a 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -8493,21 +8493,40 @@ void Lowering::ContainCheckDivOrMod(GenTreeOp* node)
 void Lowering::ContainCheckShiftRotate(GenTreeOp* node)
 {
     assert(node->OperIsShiftOrRotate());
+
+    GenTree* source  = node->gtOp1;
+    GenTree* shiftBy = node->gtOp2;
+
 #ifdef TARGET_X86
-    GenTree* source = node->gtOp1;
     if (node->OperIsShiftLong())
     {
-        assert(source->OperGet() == GT_LONG);
+        assert(source->OperIs(GT_LONG));
         MakeSrcContained(node, source);
     }
-#endif
+#endif // TARGET_X86
 
-    GenTree* shiftBy = node->gtOp2;
     if (IsContainableImmed(node, shiftBy) && (shiftBy->AsIntConCommon()->IconValue() <= 255) &&
         (shiftBy->AsIntConCommon()->IconValue() >= 0))
     {
         MakeSrcContained(node, shiftBy);
     }
+
+    bool canContainSource = !source->isContained() && (genTypeSize(source) >= genTypeSize(node));
+
+    // BMI2 rotate and shift instructions can take a memory operand and do not set flags.
+    // rorx takes an imm8 for the rotate amount; shlx/shrx/sarx take an r32/64 for the shift amount.
+    if (canContainSource && !node->gtSetFlags() && (shiftBy->isContained() != node->OperIsShift()) &&
+        comp->compOpportunisticallyDependsOn(InstructionSet_BMI2))
+    {
+        if (IsContainableMemoryOp(source) && IsSafeToContainMem(node, source))
+        {
+            MakeSrcContained(node, source);
+        }
+        else if (IsSafeToMarkRegOptional(node, source))
+        {
+            MakeSrcRegOptional(node, source);
+        }
+    }
 }
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 5a1bd13cddd09d..cfab7d2935020d 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1056,18 +1056,16 @@ int LinearScan::BuildShiftRotate(GenTree* tree)
     {
         assert(shiftBy->OperIsConst());
     }
-#if defined(TARGET_64BIT)
-    else if (tree->OperIsShift() && !tree->isContained() &&
+    else if (!tree->isContained() && (tree->OperIsShift() || source->isContained()) &&
             compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
     {
-        // shlx (as opposed to mov+shl) instructions handles all register forms, but it does not handle contained form
-        // for memory operand. Likewise for sarx and shrx.
+        // We don't have any specific register requirements here, so skip the logic that
+        // reserves RCX or preferences the source reg.
         srcCount += BuildOperandUses(source, srcCandidates);
         srcCount += BuildOperandUses(shiftBy, srcCandidates);
         BuildDef(tree, dstCandidates);
         return srcCount;
     }
-#endif
     else
     {
         srcCandidates = availableIntRegs & ~SRBM_RCX;
@@ -1088,9 +1086,9 @@ int LinearScan::BuildShiftRotate(GenTree* tree)
 #ifdef TARGET_X86
     // The first operand of a GT_LSH_HI and GT_RSH_LO oper is a GT_LONG so that
     // we can have a three operand form.
-    if (tree->OperGet() == GT_LSH_HI || tree->OperGet() == GT_RSH_LO)
+    if (tree->OperIs(GT_LSH_HI) || tree->OperIs(GT_RSH_LO))
     {
-        assert((source->OperGet() == GT_LONG) && source->isContained());
+        assert(source->OperIs(GT_LONG) && source->isContained());
 
         GenTree* sourceLo = source->gtGetOp1();
         GenTree* sourceHi = source->gtGetOp2();
@@ -1100,7 +1098,7 @@ int LinearScan::BuildShiftRotate(GenTree* tree)
 
         if (!tree->isContained())
         {
-            if (tree->OperGet() == GT_LSH_HI)
+            if (tree->OperIs(GT_LSH_HI))
             {
                 setDelayFree(sourceLoUse);
             }
@@ -1121,6 +1119,7 @@ int LinearScan::BuildShiftRotate(GenTree* tree)
     {
         srcCount += BuildOperandUses(source, srcCandidates);
     }
+
     if (!tree->isContained())
     {
         if (!shiftBy->isContained())

From e1fdc0a8ebb0acf499f62b1f196816554914771b Mon Sep 17 00:00:00 2001
From: Clinton Ingram
Date: Mon, 17 Mar 2025 20:20:59 -0700
Subject: [PATCH 2/2] update emitExtractEvexPrefix

---
 src/coreclr/jit/emitxarch.cpp | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index b5a7a5871c9c68..ff4d19fee3fa46 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2868,32 +2868,25 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
         case INS_rorx:
         case INS_pdep:
         case INS_mulx:
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
         case INS_shrx:
-#endif
         {
             evexPrefix |= (0x03 << 8);
             break;
         }
 
         case INS_pext:
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
         case INS_sarx:
-#endif
         {
             evexPrefix |= (0x02 << 8);
             break;
         }
 
-// TODO: Unblock when enabled for x86
-#ifdef TARGET_AMD64
+
         case INS_shlx:
         {
             evexPrefix |= (0x01 << 8);
             break;
        }
-#endif
+
         default:
         {
             break;
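A note on the new containment condition in ContainCheckShiftRotate (patch 1): the test shiftBy->isContained() != node->OperIsShift() encodes the operand-form constraints of the BMI2 instructions, since rorx takes its rotate amount as an immediate while shlx/sarx/shrx take their shift amount in a register. A hypothetical restatement of that check (the helper name and parameters are illustrative, not from the patch):

    // True when the shift-by operand is in the form the BMI2 instruction requires:
    // shifts (shlx/sarx/shrx) need the count in a register, i.e. not contained;
    // rotates (rorx) need an immediate count, i.e. contained.
    bool ShiftCountSuitsBmi2(bool isShift, bool shiftByIsContained)
    {
        return shiftByIsContained != isShift;
    }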