Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 33 additions & 32 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4808,17 +4808,16 @@ instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type)
// tree - the bit shift node (that specifies the type of bit shift to perform).
//
// Assumptions:
// a) All GenTrees are register allocated.
// b) The shift-by-amount in tree->AsOp()->gtOp2 is either a contained constant or
// it's a register-allocated expression. If it is in a register that is
// not RCX, it will be moved to RCX (so RCX better not be in use!).
// The shift-by-amount in tree->AsOp()->gtOp2 is either a contained constant or it's a
// register-allocated expression. If not using BMI2 instructions and op2 is in a register
// that is not RCX, it will be moved to RCX (so RCX better not be in use!).
//
void CodeGen::genCodeForShift(GenTree* tree)
{
// Only the non-RMW case here.
assert(tree->OperIsShiftOrRotate());
assert(tree->AsOp()->gtOp1->isUsedFromReg());
assert(tree->GetRegNum() != REG_NA);
assert(tree->AsOp()->gtOp1->isUsedFromReg() || compiler->compIsaSupportedDebugOnly(InstructionSet_BMI2));

genConsumeOperands(tree->AsOp());

Expand All @@ -4829,12 +4828,13 @@ void CodeGen::genCodeForShift(GenTree* tree)
regNumber operandReg = operand->GetRegNum();

GenTree* shiftBy = tree->gtGetOp2();
emitAttr size = emitTypeSize(tree);

if (shiftBy->isContainedIntOrIImmed())
{
emitAttr size = emitTypeSize(tree);
assert(tree->OperIsRotate() || (operandReg != REG_NA));

bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags();
bool mightOptimizeLsh = tree->OperIs(GT_LSH) && !tree->gtSetFlags();

// Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg"
if (mightOptimizeLsh && shiftBy->IsIntegralConst(1))
Expand All @@ -4848,14 +4848,14 @@ void CodeGen::genCodeForShift(GenTree* tree)
GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0);
}
}
// Optimize "X<<2" to "lea [reg*4]" - we only do this when the dst and src registers are different since it will
// remove a 'mov'.
// Optimize "X<<2" to "lea [reg*4]"
// We only do this when the dst and src registers are different since it will remove a 'mov'.
else if (mightOptimizeLsh && shiftBy->IsIntegralConst(2) && tree->GetRegNum() != operandReg)
{
GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 4, 0);
}
// Optimize "X<<3" to "lea [reg*8]" - we only do this when the dst and src registers are different since it will
// remove a 'mov'.
// Optimize "X<<3" to "lea [reg*8]"
// We only do this when the dst and src registers are different since it will remove a 'mov'.
else if (mightOptimizeLsh && shiftBy->IsIntegralConst(3) && tree->GetRegNum() != operandReg)
{
GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), REG_NA, operandReg, 8, 0);
Expand All @@ -4864,53 +4864,54 @@ void CodeGen::genCodeForShift(GenTree* tree)
{
int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();

#if defined(TARGET_64BIT)
// Try to emit rorx if BMI2 is available instead of mov+rol
// it makes sense only for 64bit integers
if ((genActualType(targetType) == TYP_LONG) && (tree->GetRegNum() != operandReg) &&
compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && tree->OperIs(GT_ROL, GT_ROR) &&
(shiftByValue > 0) && (shiftByValue < 64))
if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
{
const int value = tree->OperIs(GT_ROL) ? (64 - shiftByValue) : shiftByValue;
GetEmitter()->emitIns_R_R_I(INS_rorx, size, tree->GetRegNum(), operandReg, value);
genProduceReg(tree);
return;
// If we have a contained source operand, we must emit rorx.
// We may also use rorx for 64bit values when a mov would otherwise be required,
// because rorx is smaller than mov+rol/ror when REX prefix is included.

if ((operandReg == REG_NA) || ((varTypeIsLong(targetType) && (tree->GetRegNum() != operandReg))))
{
// There is no 'rolx', so for rol, we use rorx with the shift value adjusted.
if (tree->OperIs(GT_ROL))
{
shiftByValue &= (size * BITS_PER_BYTE - 1);
shiftByValue = (size * BITS_PER_BYTE - shiftByValue);
}

inst_RV_TT_IV(INS_rorx, size, tree->GetRegNum(), operand, shiftByValue, INS_OPTS_NONE);
genProduceReg(tree);
return;
}
}
#endif

ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
GetEmitter()->emitIns_BASE_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue);
genProduceReg(tree);
return;
}
}
#if defined(TARGET_64BIT)
else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2))
{
// Try to emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr.
// Emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr.
switch (tree->OperGet())
{
case GT_LSH:
ins = INS_shlx;
break;

case GT_RSH:
ins = INS_sarx;
break;

case GT_RSZ:
ins = INS_shrx;
break;

default:
unreached();
}

regNumber shiftByReg = shiftBy->GetRegNum();
emitAttr size = emitTypeSize(tree);
// The order of operandReg and shiftByReg are swapped to follow shlx, sarx and shrx encoding spec.
GetEmitter()->emitIns_R_R_R(ins, size, tree->GetRegNum(), shiftByReg, operandReg);
// The order of operand and shiftBy are swapped to follow shlx, sarx and shrx encoding spec.
inst_RV_RV_TT(ins, size, tree->GetRegNum(), shiftBy->GetRegNum(), operand, /*isRMW*/ false, INS_OPTS_NONE);
}
#endif
else
{
// We must have the number of bits to shift stored in ECX, since we constrained this node to
Expand Down
92 changes: 51 additions & 41 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2268,11 +2268,7 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
switch (ins)
{
case INS_cvtss2si:
case INS_cvttss2si32:
case INS_cvttss2si64:
case INS_cvtsd2si:
case INS_cvttsd2si32:
case INS_cvttsd2si64:
case INS_movd:
case INS_movnti:
case INS_andn:
Expand All @@ -2285,11 +2281,9 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
case INS_pdep:
case INS_pext:
case INS_rorx:
#if defined(TARGET_AMD64)
case INS_sarx:
case INS_shlx:
case INS_shrx:
#endif // TARGET_AMD64
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
{
Expand Down Expand Up @@ -2874,32 +2868,25 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
case INS_rorx:
case INS_pdep:
case INS_mulx:
// TODO: Unblock when enabled for x86
#ifdef TARGET_AMD64
case INS_shrx:
#endif
{
evexPrefix |= (0x03 << 8);
break;
}

case INS_pext:
// TODO: Unblock when enabled for x86
#ifdef TARGET_AMD64
case INS_sarx:
#endif
{
evexPrefix |= (0x02 << 8);
break;
}
// TODO: Unblock when enabled for x86
#ifdef TARGET_AMD64

case INS_shlx:
{
evexPrefix |= (0x01 << 8);
break;
}
#endif

default:
{
break;
Expand Down Expand Up @@ -3088,32 +3075,25 @@ emitter::code_t emitter::emitExtractVexPrefix(instruction ins, code_t& code) con
case INS_rorx:
case INS_pdep:
case INS_mulx:
// TODO: Unblock when enabled for x86
#ifdef TARGET_AMD64
case INS_shrx:
#endif
{
vexPrefix |= 0x03;
break;
}

case INS_pext:
// TODO: Unblock when enabled for x86
#ifdef TARGET_AMD64
case INS_sarx:
#endif
{
vexPrefix |= 0x02;
break;
}
// TODO: Unblock when enabled for x86
#ifdef TARGET_AMD64

case INS_shlx:
{
vexPrefix |= 0x01;
break;
}
#endif

default:
{
vexPrefix |= 0x00;
Expand Down Expand Up @@ -3831,11 +3811,9 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
case INS_pextrw:
case INS_pextrw_sse41:
case INS_rorx:
#ifdef TARGET_AMD64
case INS_shlx:
case INS_sarx:
case INS_shrx:
#endif
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi32:
Expand Down Expand Up @@ -4084,10 +4062,8 @@ unsigned emitter::emitGetVexPrefixSize(instrDesc* id) const
switch (ins)
{
case INS_crc32:
#if defined(TARGET_AMD64)
case INS_sarx:
case INS_shrx:
#endif // TARGET_AMD64
{
// When the prefix is 0x0F38 or 0x0F3A, we must use the 3-byte encoding
// These are special cases where the pp-bit is 0xF2 or 0xF3 and not 0x66
Expand Down Expand Up @@ -8629,7 +8605,7 @@ void emitter::emitIns_R_R_C_R(instruction ins,
}

//------------------------------------------------------------------------
// emitIns_R_R_R_S: emits the code for an instruction that takes a register operand, a variable index +
// emitIns_R_R_S_R: emits the code for an instruction that takes a register operand, a variable index +
// offset, another register operand, and that returns a value in a register
//
// Arguments:
Expand Down Expand Up @@ -12897,6 +12873,19 @@ void emitter::emitDispIns(
case IF_RRW_RRD_ARD:
case IF_RWR_RWR_ARD:
{
if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
{
// These instructions have their operands swapped to simplify the emitter implementation.
// They will appear here as IF_RWR_RRD_ARD but should actually
// display as if they were IF_RWR_ARD_RRD.

printf("%s", emitRegName(id->idReg1(), attr));
printf(", %s", sstr);
emitDispAddrMode(id);
printf(", %s", emitRegName(id->idReg2(), attr));
break;
}

printf("%s", emitRegName(id->idReg1(), attr));
emitDispEmbMasking(id);
printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr);
Expand Down Expand Up @@ -13194,6 +13183,20 @@ void emitter::emitDispIns(
case IF_RRW_RRD_SRD:
case IF_RWR_RWR_SRD:
{
if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
{
// These instructions have their operands swapped to simplify the emitter implementation.
// They will appear here as IF_RWR_RRD_SRD but should actually
// display as if they were IF_RWR_SRD_RRD.

printf("%s", emitRegName(id->idReg1(), attr));
printf(", %s", sstr);
emitDispFrameRef(id->idAddr()->iiaLclVar.lvaVarNum(), id->idAddr()->iiaLclVar.lvaOffset(),
id->idDebugOnlyInfo()->idVarRefOffs, asmfm);
printf(", %s", emitRegName(id->idReg2(), attr));
break;
}

printf("%s", emitRegName(id->idReg1(), attr));
emitDispEmbMasking(id);
printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr);
Expand Down Expand Up @@ -13420,17 +13423,12 @@ void emitter::emitDispIns(
regNumber reg2 = id->idReg2();
regNumber reg3 = id->idReg3();

if (ins == INS_bextr || ins == INS_bzhi
#ifdef TARGET_AMD64
|| ins == INS_shrx || ins == INS_shlx || ins == INS_sarx
#endif
)
if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
{
// BMI bextr,bzhi, shrx, shlx and sarx encode the reg2 in VEX.vvvv and reg3 in modRM,
// which is different from most of other instructions
regNumber tmp = reg2;
reg2 = reg3;
reg3 = tmp;
// These instructions have their operands swapped to simplify the emitter implementation.
// They encode reg3 in VEX.vvvv and reg2 in modRM, which is opposite most instructions.
// We swap them back here so they will display in the correct order.
std::swap(reg2, reg3);
}

emitAttr attr3 = attr;
Expand Down Expand Up @@ -13777,6 +13775,20 @@ void emitter::emitDispIns(
case IF_RRW_RRD_MRD:
case IF_RWR_RWR_MRD:
{
if ((ins == INS_bextr) || (ins == INS_bzhi) || (ins == INS_sarx) || (ins == INS_shlx) || (ins == INS_shrx))
{
// These instructions have their operands swapped to simplify the emitter implementation.
// They will appear here as IF_RWR_RRD_MRD but should actually
// display as if they were IF_RWR_MRD_RRD.

printf("%s", emitRegName(id->idReg1(), attr));
printf(", %s", sstr);
offs = emitGetInsDsp(id);
emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC);
printf(", %s", emitRegName(id->idReg2(), attr));
break;
}

printf("%s", emitRegName(id->idReg1(), attr));
emitDispEmbMasking(id);
printf(", %s, %s", emitRegName(id->idReg2(), attr), sstr);
Expand Down Expand Up @@ -21903,7 +21915,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
}

#ifdef TARGET_AMD64
case INS_shlx:
case INS_sarx:
case INS_shrx:
Expand All @@ -21912,7 +21923,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
break;
}
#endif

case INS_vpmovb2m:
case INS_vpmovw2m:
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/instrsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -607,11 +607,9 @@ INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE,
INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit
INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract
INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX)
#ifdef TARGET_AMD64
INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags
INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags
INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags
#endif

INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)

Expand Down
Loading
Loading