diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index a06ba959917d8a..7c798357ee939e 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -649,6 +649,7 @@ class CodeGen final : public CodeGenInterface #if defined(TARGET_AMD64) void genAmd64EmitterUnitTestsSse2(); void genAmd64EmitterUnitTestsApx(); + void genAmd64EmitterUnitTestsCCMP(); #endif #endif // defined(DEBUG) @@ -911,6 +912,9 @@ class CodeGen final : public CodeGenInterface #ifdef TARGET_ARM64 void genCodeForCCMP(GenTreeCCMP* ccmp); #endif +#ifdef TARGET_AMD64 + void genCodeForCCMP(GenTreeCCMP* ccmp); +#endif // TARGET_AMD64 void genCodeForSelect(GenTreeOp* select); void genIntrinsic(GenTreeIntrinsic* treeNode); void genPutArgStk(GenTreePutArgStk* treeNode); @@ -1655,6 +1659,8 @@ class CodeGen final : public CodeGenInterface static insOpts ShiftOpToInsOpts(genTreeOps op); #elif defined(TARGET_XARCH) static instruction JumpKindToCmov(emitJumpKind condition); + static instruction JumpKindToCcmp(emitJumpKind condition); + static insOpts OptsFromCFlags(insCflags flags); #endif #if !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64) diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 641267f6860581..72d834b4f9d989 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -2702,6 +2702,10 @@ void CodeGen::genEmitterUnitTests() { genAmd64EmitterUnitTestsApx(); } + if (unitTestSectionAll || (strstr(unitTestSection, "ccmp") != nullptr)) + { + genAmd64EmitterUnitTestsCCMP(); + } #elif defined(TARGET_ARM64) if (unitTestSectionAll || (strstr(unitTestSection, "general") != nullptr)) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index bd80187eea6a44..9c5f912a2dd614 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -433,12 +433,13 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, else { // For section constant, the immediate will be relocatable - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags)); + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, + INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags)); } } else { - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm DEBUGARG(targetHandle) DEBUGARG(gtFlags)); + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags)); } } regSet.verifyRegUsed(reg); @@ -769,12 +770,20 @@ void CodeGen::genCodeForNegNot(GenTree* tree) { GenTree* operand = tree->gtGetOp1(); assert(operand->isUsedFromReg()); - regNumber operandReg = genConsumeReg(operand); + regNumber operandReg = genConsumeReg(operand); + instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); + if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (targetReg != operandReg)) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(operand), targetReg, operandReg, INS_OPTS_EVEX_nd); + } + else + { + inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); - instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_RV(ins, targetReg, targetType); + instruction ins = genGetInsForOper(tree->OperGet(), targetType); + inst_RV(ins, targetReg, targetType); + } } genProduceReg(tree); @@ -1189,12 +1198,49 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) // reg3 = reg3 op reg2 else { - var_types op1Type = op1->TypeGet(); - inst_Mov(op1Type, targetReg, 
op1reg, /* canSkip */ false); - regSet.verifyRegUsed(targetReg); - gcInfo.gcMarkRegPtrVal(targetReg, op1Type); - dst = treeNode; - src = op2; + if (JitConfig.JitEnableApxNDD() && emit->IsApxNDDEncodableInstruction(ins) && !varTypeIsFloating(treeNode)) + { + // TODO-xarch-apx: + // APX can provide optimal code gen in this case using NDD feature: + // reg3 = op1 op op2 without extra mov + + // see if it can be optimized by inc/dec + if (oper == GT_ADD && op2->isContainedIntOrIImmed() && !treeNode->gtOverflowEx()) + { + if (op2->IsIntegralConst(1)) + { + emit->emitIns_R_R(INS_inc, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd); + genProduceReg(treeNode); + return; + } + else if (op2->IsIntegralConst(-1)) + { + emit->emitIns_R_R(INS_dec, emitTypeSize(treeNode), targetReg, op1reg, INS_OPTS_EVEX_nd); + genProduceReg(treeNode); + return; + } + } + + assert(op1reg != targetReg); + assert(op2reg != targetReg); + emit->emitInsBinary(ins, emitTypeSize(treeNode), op1, op2, targetReg); + if (treeNode->gtOverflowEx()) + { + assert(oper == GT_ADD || oper == GT_SUB); + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); + return; + } + else + { + var_types op1Type = op1->TypeGet(); + inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false); + regSet.verifyRegUsed(targetReg); + gcInfo.gcMarkRegPtrVal(targetReg, op1Type); + dst = treeNode; + src = op2; + } } // try to use an inc or dec @@ -1213,6 +1259,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) return; } } + regNumber r = emit->emitInsBinary(ins, emitTypeSize(treeNode), dst, src); noway_assert(r == targetReg); @@ -1326,6 +1373,25 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode) } assert(regOp->isUsedFromReg()); + if (JitConfig.JitEnableApxNDD() && emit->IsApxNDDEncodableInstruction(ins) && + regOp->GetRegNum() != mulTargetReg) + { + // use NDD form to optimize this form: + // mov targetReg, regOp + // imul targetReg, rmOp + // to imul targetReg, regOp rmOp. + emit->emitInsBinary(ins, size, regOp, rmOp, mulTargetReg); + if (requiresOverflowCheck) + { + // Overflow checking is only used for non-floating point types + noway_assert(!varTypeIsFloating(treeNode)); + + genCheckOverflow(treeNode); + } + genProduceReg(treeNode); + return; + } + // Setup targetReg when neither of the source operands was a matching register inst_Mov(targetType, mulTargetReg, regOp->GetRegNum(), /* canSkip */ true); @@ -1579,6 +1645,46 @@ instruction CodeGen::JumpKindToCmov(emitJumpKind condition) return s_table[condition]; } +//------------------------------------------------------------------------ +// JumpKindToCcmp: +// Convert an emitJumpKind to the corresponding ccmp instruction. +// +// Arguments: +// condition - the condition +// +// Returns: +// A ccmp instruction. 
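+// Note: conditions without a ccmp counterpart (EJ_jmp, EJ_jp, EJ_jnp) map to INS_none in the table below.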
+// +instruction CodeGen::JumpKindToCcmp(emitJumpKind condition) +{ + static constexpr instruction s_table[EJ_COUNT] = { + INS_none, INS_none, INS_ccmpo, INS_ccmpno, INS_ccmpb, INS_ccmpae, INS_ccmpe, INS_ccmpne, INS_ccmpbe, + INS_ccmpa, INS_ccmps, INS_ccmpns, INS_none, INS_none, INS_ccmpl, INS_ccmpge, INS_ccmple, INS_ccmpg, + }; + + static_assert_no_msg(s_table[EJ_NONE] == INS_none); + static_assert_no_msg(s_table[EJ_jmp] == INS_none); + static_assert_no_msg(s_table[EJ_jo] == INS_ccmpo); + static_assert_no_msg(s_table[EJ_jno] == INS_ccmpno); + static_assert_no_msg(s_table[EJ_jb] == INS_ccmpb); + static_assert_no_msg(s_table[EJ_jae] == INS_ccmpae); + static_assert_no_msg(s_table[EJ_je] == INS_ccmpe); + static_assert_no_msg(s_table[EJ_jne] == INS_ccmpne); + static_assert_no_msg(s_table[EJ_jbe] == INS_ccmpbe); + static_assert_no_msg(s_table[EJ_ja] == INS_ccmpa); + static_assert_no_msg(s_table[EJ_js] == INS_ccmps); + static_assert_no_msg(s_table[EJ_jns] == INS_ccmpns); + static_assert_no_msg(s_table[EJ_jp] == INS_none); + static_assert_no_msg(s_table[EJ_jnp] == INS_none); + static_assert_no_msg(s_table[EJ_jl] == INS_ccmpl); + static_assert_no_msg(s_table[EJ_jge] == INS_ccmpge); + static_assert_no_msg(s_table[EJ_jle] == INS_ccmple); + static_assert_no_msg(s_table[EJ_jg] == INS_ccmpg); + + assert((condition >= EJ_NONE) && (condition < EJ_COUNT)); + return s_table[condition]; +} + //------------------------------------------------------------------------ // genCodeForCompare: Produce code for a GT_SELECT/GT_SELECTCC node. // @@ -2266,6 +2372,12 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) // Do nothing; these nodes are simply markers for debug info. break; +#if defined(TARGET_AMD64) + case GT_CCMP: + genCodeForCCMP(treeNode->AsCCMP()); + break; +#endif + default: { #ifdef DEBUG @@ -4438,23 +4550,23 @@ void CodeGen::genCodeForLockAdd(GenTreeOp* node) if (imm == 1) { // inc [addr] - GetEmitter()->emitIns_AR(INS_inc, size, addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR(INS_inc_no_evex, size, addr->GetRegNum(), 0); } else if (imm == -1) { // dec [addr] - GetEmitter()->emitIns_AR(INS_dec, size, addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR(INS_dec_no_evex, size, addr->GetRegNum(), 0); } else { // add [addr], imm - GetEmitter()->emitIns_I_AR(INS_add, size, imm, addr->GetRegNum(), 0); + GetEmitter()->emitIns_I_AR(INS_add_no_evex, size, imm, addr->GetRegNum(), 0); } } else { // add [addr], data - GetEmitter()->emitIns_AR_R(INS_add, size, data->GetRegNum(), addr->GetRegNum(), 0); + GetEmitter()->emitIns_AR_R(INS_add_no_evex, size, data->GetRegNum(), addr->GetRegNum(), 0); } } @@ -4481,7 +4593,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* node) if (node->OperIs(GT_XORR, GT_XAND)) { - const instruction ins = node->OperIs(GT_XORR) ? INS_or : INS_and; + const instruction ins = node->OperIs(GT_XORR) ? INS_or_no_evex : INS_and_no_evex; if (node->IsUnusedValue()) { @@ -4873,6 +4985,25 @@ void CodeGen::genCodeForShift(GenTree* tree) genProduceReg(tree); return; } + + if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && + (tree->GetRegNum() != operandReg)) + { + ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue); + // If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov. + // this case might be rarely hit. 
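+        // A sketch of the intended transformation (assuming the NDD operand order used by emitIns_R_R_I below):
+        //     mov targetReg, operandReg ; shl targetReg, cns
+        // becomes the single instruction
+        //     shl targetReg, operandReg, cns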
+ if (shiftByValue == 1) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd); + } + else + { + GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue, + INS_OPTS_EVEX_nd); + } + genProduceReg(tree); + return; + } #endif // First, move the operand to the destination register and // later on perform the shift in-place. @@ -4919,6 +5050,16 @@ void CodeGen::genCodeForShift(GenTree* tree) // The operand to be shifted must not be in ECX noway_assert(operandReg != REG_RCX); + if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && + (tree->GetRegNum() != operandReg)) + { + // If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov. + // this case might be rarely hit. + GetEmitter()->emitIns_R_R(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, INS_OPTS_EVEX_nd); + genProduceReg(tree); + return; + } + inst_Mov(targetType, tree->GetRegNum(), operandReg, /* canSkip */ true); inst_RV(ins, tree->GetRegNum(), targetType); } @@ -8968,8 +9109,141 @@ void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regSet.verifyRegistersUsed(killMask); } +#ifdef TARGET_AMD64 + +insOpts CodeGen::OptsFromCFlags(insCflags flags) +{ + unsigned opts = 0x0; + if (flags & INS_FLAGS_CF) + opts |= INS_OPTS_EVEX_dfv_cf; + if (flags & INS_FLAGS_ZF) + opts |= INS_OPTS_EVEX_dfv_zf; + if (flags & INS_FLAGS_SF) + opts |= INS_OPTS_EVEX_dfv_sf; + if (flags & INS_FLAGS_OF) + opts |= INS_OPTS_EVEX_dfv_of; + return (insOpts)opts; +} + +void CodeGen::genCodeForCCMP(GenTreeCCMP* ccmp) +{ + emitter* emit = GetEmitter(); + assert(emit->UsePromotedEVEXEncoding()); + + genConsumeOperands(ccmp); + GenTree* op1 = ccmp->gtGetOp1(); + GenTree* op2 = ccmp->gtGetOp2(); + var_types op1Type = genActualType(op1->TypeGet()); + var_types op2Type = genActualType(op2->TypeGet()); + emitAttr cmpSize = emitActualTypeSize(op1Type); + regNumber srcReg1 = op1->GetRegNum(); + + // No float support or swapping op1 and op2 to generate cmp reg, imm. + assert(!varTypeIsFloating(op2Type)); + assert(!op1->isContainedIntOrIImmed()); + + // For the ccmp flags, invert the condition of the compare. + // For the condition, use the previous compare. + const GenConditionDesc& condDesc = GenConditionDesc::Get(ccmp->gtCondition); + instruction ccmpIns = JumpKindToCcmp(condDesc.jumpKind1); + insOpts opts = OptsFromCFlags(ccmp->gtFlagsVal); + + if (op2->isContainedIntOrIImmed()) + { + GenTreeIntConCommon* intConst = op2->AsIntConCommon(); + emit->emitIns_R_I(ccmpIns, cmpSize, srcReg1, (int)intConst->IconValue(), opts); + } + else + { + regNumber srcReg2 = op2->GetRegNum(); + emit->emitIns_R_R(ccmpIns, cmpSize, srcReg1, srcReg2, opts); + } +} +#endif // TARGET_AMD64 + #if defined(DEBUG) && defined(TARGET_AMD64) +/***************************************************************************** + * Unit tests for the CCMP instructions. 
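+ * Exercised via the "ccmp" emitter unit-test section; see genEmitterUnitTests in codegenlinear.cpp.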
+ */ + +void CodeGen::genAmd64EmitterUnitTestsCCMP() +{ + emitter* theEmitter = GetEmitter(); + genDefineTempLabel(genCreateTempLabel()); + + // ============ + // Test RR form + // ============ + + // Test all sizes + theEmitter->emitIns_R_R(INS_ccmpe, EA_4BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_R(INS_ccmpe, EA_8BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_R(INS_ccmpe, EA_2BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_R(INS_ccmpe, EA_1BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf); + + // Test all CC codes + for (uint32_t ins = INS_FIRST_CCMP_INSTRUCTION + 1; ins < INS_LAST_CCMP_INSTRUCTION; ins++) + { + theEmitter->emitIns_R_R((instruction)ins, EA_4BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf); + } + + // Test all dfv + for (int i = 0; i < 16; i++) + { + theEmitter->emitIns_R_R(INS_ccmpe, EA_4BYTE, REG_RAX, REG_RCX, (insOpts)(i << INS_OPTS_EVEX_dfv_byte_offset)); + } + + // ============ + // Test RS form + // ============ + + // Test all sizes + theEmitter->emitIns_R_S(INS_ccmpe, EA_4BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_S(INS_ccmpe, EA_8BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_S(INS_ccmpe, EA_2BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_S(INS_ccmpe, EA_1BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf); + + // Test all CC codes + for (uint32_t ins = INS_FIRST_CCMP_INSTRUCTION + 1; ins < INS_LAST_CCMP_INSTRUCTION; ins++) + { + theEmitter->emitIns_R_S((instruction)ins, EA_4BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf); + } + + // Test all dfv + for (int i = 0; i < 16; i++) + { + theEmitter->emitIns_R_S(INS_ccmpe, EA_4BYTE, REG_RAX, 0, 0, (insOpts)(i << INS_OPTS_EVEX_dfv_byte_offset)); + } + + // ============ + // Test RI form (test small and large sizes and constants) + // ============ + + theEmitter->emitIns_R_I(INS_ccmpe, EA_4BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_I(INS_ccmpe, EA_4BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf); + + theEmitter->emitIns_R_I(INS_ccmpe, EA_8BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_I(INS_ccmpe, EA_8BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf); + + theEmitter->emitIns_R_I(INS_ccmpe, EA_2BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_I(INS_ccmpe, EA_2BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf); + + theEmitter->emitIns_R_I(INS_ccmpe, EA_1BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_I(INS_ccmpe, EA_1BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf); + + // ============ + // Test RC form + // ============ + + CORINFO_FIELD_HANDLE hnd = theEmitter->emitFltOrDblConst(1.0f, EA_4BYTE); + theEmitter->emitIns_R_C(INS_ccmpe, EA_4BYTE, REG_RAX, hnd, 0, INS_OPTS_EVEX_dfv_cf); + theEmitter->emitIns_R_C(INS_ccmpe, EA_4BYTE, REG_RAX, hnd, 4, INS_OPTS_EVEX_dfv_cf); +} + /***************************************************************************** * Unit tests for the SSE2 instructions.
*/ @@ -9270,6 +9544,87 @@ void CodeGen::genAmd64EmitterUnitTestsApx() theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); + + // APX-EVEX + + theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_shl_1, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_I(INS_shl_N, EA_2BYTE, REG_R11, REG_ECX, 7, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_inc, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R(INS_dec, EA_2BYTE, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_cmovo, EA_4BYTE, REG_R12, REG_R11, REG_EAX, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R_R(INS_imul, EA_4BYTE, REG_R12, REG_R11, REG_ECX, INS_OPTS_EVEX_nd); + theEmitter->emitIns_R_R_S(INS_imul, EA_4BYTE, REG_R12, REG_R11, 0, 1, INS_OPTS_EVEX_nd); + + theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_shl_1, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_I(INS_shl_N, EA_2BYTE, REG_R11, 7, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_imul, EA_4BYTE, 
REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_imul, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_I(INS_imul_15, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R(INS_imulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_tzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_lzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_popcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_S(INS_tzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_lzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_popcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, + (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd)); + + theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf); + + theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf); + theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1); } #endif // defined(DEBUG) && defined(TARGET_AMD64) @@ -11314,7 +11669,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) if (barrierKind == BARRIER_FULL) { instGen(INS_lock); - GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0); + GetEmitter()->emitIns_I_AR(INS_or_no_evex, EA_4BYTE, 0, REG_SPBASE, 0); } } diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index a3d854e17eff0f..d3b58a53f3ff41 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2298,7 +2298,11 @@ void Compiler::compSetProcessor() } if (canUseApxEncoding()) { + // TODO-Xarch-apx: + // At this stage, since no machine will pass the CPUID check for APX, we need a special stress mode that + // enables REX2 on incompatible platform, `DoJitStressRex2Encoding` is expected to be removed eventually. codeGen->GetEmitter()->SetUseRex2Encoding(true); + codeGen->GetEmitter()->SetUsePromotedEVEXEncoding(true); } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 5160d287cb3119..2ada22e7741463 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7091,6 +7091,8 @@ class Compiler bool optSwitchConvert(BasicBlock* firstBlock, int testsCount, ssize_t* testValues, weight_t falseLikelihood, GenTree* nodeToTest); bool optSwitchDetectAndConvert(BasicBlock* firstBlock); + bool optSwitchDetectLikely(BasicBlock* firstBlock); + PhaseStatus optInvertLoops(); // Invert loops so they're entered at top and tested at bottom. 
PhaseStatus optOptimizeFlow(); // Simplify flow graph and do tail duplication PhaseStatus optOptimizeLayout(); // Optimize the BasicBlock layout of the method @@ -9999,6 +10001,23 @@ class Compiler #ifdef DEBUG return JitConfig.JitStressEvexEncoding() || JitConfig.JitStressRex2Encoding(); #endif // DEBUG + return false; + } + + //------------------------------------------------------------------------ + // DoJitStressPromotedEvexEncoding- Answer the question: Do we force promoted EVEX encoding. + // + // Returns: + // `true` if user requests promoted EVEX encoding. + // + bool DoJitStressPromotedEvexEncoding() const + { +#ifdef DEBUG + if (JitConfig.JitStressPromotedEvexEncoding()) + { + return true; + } +#endif // DEBUG return false; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index dc0f977b608622..00a391e6dc5655 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -471,6 +471,7 @@ class emitter SetUseVEXEncoding(false); SetUseEvexEncoding(false); SetUseRex2Encoding(false); + SetUsePromotedEVEXEncoding(false); #endif // TARGET_XARCH emitDataSecCur = nullptr; @@ -793,7 +794,15 @@ class emitter // For normal and embedded broadcast intrinsics, EVEX.L'L has the same semantic, vector length. // For embedded rounding, EVEX.L'L semantic changes to indicate the rounding mode. // Multiple bits in _idEvexbContext are used to inform emitter to specially handle the EVEX.L'L bits. - unsigned _idEvexbContext : 2; + unsigned _idCustom5 : 1; + unsigned _idCustom6 : 1; + +#define _idEvexbContext (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \ + */ +#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */ +#define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */ +#define _idEvexDFV (_idCustom4 << 3) | (_idCustom3 << 2) | (_idCustom2 << 1) | _idCustom1 + #endif // TARGET_XARCH #ifdef TARGET_ARM64 @@ -1009,6 +1018,7 @@ class emitter regNumber _idReg3 : REGNUM_BITS; regNumber _idReg4 : REGNUM_BITS; }; + #elif defined(TARGET_LOONGARCH64) struct { @@ -1657,38 +1667,17 @@ class emitter #ifdef TARGET_XARCH bool idIsEvexbContextSet() const { - return _idEvexbContext != 0; + return idGetEvexbContext() != 0; } void idSetEvexbContext(insOpts instOptions) { assert(!idIsEvexbContextSet()); + assert(idGetEvexbContext() == 0); + unsigned value = static_cast(instOptions & INS_OPTS_EVEX_b_MASK); - switch (instOptions & INS_OPTS_EVEX_b_MASK) - { - case INS_OPTS_EVEX_eb_er_rd: - { - _idEvexbContext = 1; - break; - } - - case INS_OPTS_EVEX_er_ru: - { - _idEvexbContext = 2; - break; - } - - case INS_OPTS_EVEX_er_rz: - { - _idEvexbContext = 3; - break; - } - - default: - { - unreached(); - } - } + _idCustom5 = ((value >> 0) & 1); + _idCustom6 = ((value >> 1) & 1); } unsigned idGetEvexbContext() const @@ -1728,6 +1717,43 @@ class emitter assert(!idIsEvexZContextSet()); _idEvexZContext = 1; } + + bool idIsEvexNdContextSet() const + { + return _idEvexNdContext != 0; + } + + void idSetEvexNdContext() + { + assert(!idIsEvexNdContextSet()); + _idEvexNdContext = 1; + } + + bool idIsEvexNfContextSet() const + { + return _idEvexNfContext != 0; + } + + void idSetEvexNfContext() + { + assert(!idIsEvexNfContextSet()); + _idEvexNfContext = 1; + } + + unsigned idGetEvexDFV() const + { + return _idEvexDFV; + } + + void idSetEvexDFV(insOpts instOptions) + { + unsigned value = static_cast((instOptions & INS_OPTS_EVEX_dfv_MASK) >> 8); + + _idCustom1 = 
((value >> 0) & 1); + _idCustom2 = ((value >> 1) & 1); + _idCustom3 = ((value >> 2) & 1); + _idCustom4 = ((value >> 3) & 1); + } #endif #ifdef TARGET_ARMARCH @@ -2531,7 +2557,12 @@ class emitter CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); #endif // FEATURE_MASKED_HW_INTRINSICS #endif // FEATURE_SIMD + +#if defined(TARGET_XARCH) + regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg = REG_NA); +#else regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src); +#endif regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2); void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem); void emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem); diff --git a/src/coreclr/jit/emitfmtsxarch.h b/src/coreclr/jit/emitfmtsxarch.h index f893fce8d07eea..a94a7c1b3e7d5b 100644 --- a/src/coreclr/jit/emitfmtsxarch.h +++ b/src/coreclr/jit/emitfmtsxarch.h @@ -140,6 +140,7 @@ IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w IF_DEF(RRD_RRD_CNS, IS_R1_RD|IS_R2_RD, SCNS) // read reg1, read reg2, const IF_DEF(RWR_RRD_CNS, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, const IF_DEF(RRW_RRD_CNS, IS_R1_RW|IS_R2_RD, SCNS) // r/w reg1, read reg2, const +IF_DEF(RWR_RRD_SHF, IS_R1_WR|IS_R2_RD, SCNS) // write reg1, read reg2, shift IF_DEF(RRD_RRD_RRD, IS_R1_RD|IS_R2_RD|IS_R3_RD, NONE) // read reg1, read reg2, read reg3 IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg1, read reg2, read reg3 diff --git a/src/coreclr/jit/emitjmps.h b/src/coreclr/jit/emitjmps.h index 6c9861c91a1b17..3b74567b8557d1 100644 --- a/src/coreclr/jit/emitjmps.h +++ b/src/coreclr/jit/emitjmps.h @@ -8,6 +8,8 @@ #if defined(TARGET_XARCH) +// TODO-XArch-APX: should the condition codes for ccmp be defined here?
+ // jump reverse instruction JMP_SMALL(jmp , jmp , jmp ) JMP_SMALL(jo , jno , jo ) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 70f54f021c9375..4fa2eb4ad455f7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -85,6 +85,11 @@ bool emitter::IsAvx512OnlyInstruction(instruction ins) return (ins >= INS_FIRST_AVX512_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); } +bool emitter::IsApxOnlyInstruction(instruction ins) +{ + return (ins >= INS_FIRST_APX_INSTRUCTION) && (ins <= INS_LAST_APX_INSTRUCTION); +} + bool emitter::IsFMAInstruction(instruction ins) { return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION); @@ -236,6 +241,18 @@ bool emitter::HasRex2Encoding(instruction ins) const return (flags & Encoding_REX2) != 0; } +bool emitter::HasApxNdd(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_Has_NDD) != 0; +} + +bool emitter::HasApxNf(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & INS_Flags_Has_NF) != 0; +} + bool emitter::IsVexEncodableInstruction(instruction ins) const { if (!UseVEXEncoding()) @@ -260,19 +277,7 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const { return false; } - - switch (ins) - { - case INS_pclmulqdq: - { - return emitComp->compOpportunisticallyDependsOn(InstructionSet_PCLMULQDQ_V256); - } - - default: - { - return HasEvexEncoding(ins); - } - } + return HasEvexEncoding(ins); } //------------------------------------------------------------------------ @@ -286,6 +291,8 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const // bool emitter::IsRex2EncodableInstruction(instruction ins) const { + // TODO-Xarch-apx: we have special stress mode for REX2 on non-compatible machine, that will + // force UseRex2Encoding return true regardless of the CPUID results. if (!UseRex2Encoding()) { return false; @@ -293,6 +300,116 @@ bool emitter::IsRex2EncodableInstruction(instruction ins) const return HasRex2Encoding(ins); } +//------------------------------------------------------------------------ +// IsApxNDDEncodableInstruction: Answer the question- does this instruction have apx ndd form. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins has apx ndd form. +// +bool emitter::IsApxNDDEncodableInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNdd(ins); +} + +//------------------------------------------------------------------------ +// IsApxNFEncodableInstruction: Answer the question - does this instruction have Evex.nf supported +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins is Evex.nf supported. +// +bool emitter::IsApxNFEncodableInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + return HasApxNf(ins); +} + +//------------------------------------------------------------------------ +// IsApxExtendedEvexInstruction: Answer the question - does this instruction have apx extended evex form. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins has apx extended evex form. 
+// +bool emitter::IsApxExtendedEvexInstruction(instruction ins) const +{ + if (!UsePromotedEVEXEncoding()) + { + return false; + } + + if (HasApxNdd(ins) || HasApxNf(ins)) + { + return true; + } + + if (IsApxOnlyInstruction(ins)) + { + return true; + } + + return false; +} + +//------------------------------------------------------------------------ +// IsShiftInstruction: Answer the question- is this instruction a shift instruction. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins is a shift instruction. +// +bool emitter::IsShiftInstruction(instruction ins) const +{ + switch (ins) + { + case INS_rcl_1: + case INS_rcr_1: + case INS_rol_1: + case INS_ror_1: + case INS_shl_1: + case INS_shr_1: + case INS_sar_1: + + case INS_rcl: + case INS_rcr: + case INS_rol: + case INS_ror: + case INS_shl: + case INS_shr: + case INS_sar: + + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + return true; + + default: + return false; + } +} + //------------------------------------------------------------------------ // IsLegacyMap1: Answer the question- Is this instruction on legacy-map-1 // @@ -316,15 +433,15 @@ bool emitter::IsLegacyMap1(code_t code) const // 2-byte return true; } - if ((code & 0xFFFF0000) == 0x000F0000) + if ((code & 0xFF0000) == 0x0F0000) { // 3-byte return true; } - if ((code & 0xFF00FF00) == 0x0F000000) + if ((code & 0xFF000000) == 0x0F000000) { // 4-byte, need to check if PP is a prefix. BYTE prefix = (BYTE)((code & 0xFF0000) >> 16); return ((prefix == 0xF2) || (prefix == 0xF3) || (prefix == 0x66)); } @@ -647,6 +764,53 @@ bool emitter::IsRexW1EvexInstruction(instruction ins) return false; } +inline bool emitter::IsCCMP(instruction ins) +{ + return (ins > INS_FIRST_CCMP_INSTRUCTION && ins < INS_LAST_CCMP_INSTRUCTION); +} + +inline insCC emitter::GetCCFromCCMP(instruction ins) +{ + assert(IsCCMP(ins)); + switch (ins) + { + case INS_ccmpo: + return INS_CC_O; + case INS_ccmpno: + return INS_CC_NO; + case INS_ccmpb: + return INS_CC_B; + case INS_ccmpae: + return INS_CC_AE; + case INS_ccmpe: + return INS_CC_E; + case INS_ccmpne: + return INS_CC_NE; + case INS_ccmpbe: + return INS_CC_BE; + case INS_ccmpa: + return INS_CC_A; + case INS_ccmps: + return INS_CC_S; + case INS_ccmpns: + return INS_CC_NS; + case INS_ccmpt: + return INS_CC_TRUE; + case INS_ccmpf: + return INS_CC_FALSE; + case INS_ccmpl: + return INS_CC_L; + case INS_ccmpge: + return INS_CC_GE; + case INS_ccmple: + return INS_CC_LE; + case INS_ccmpg: + return INS_CC_G; + default: + unreached(); + } +} + #ifdef TARGET_64BIT //------------------------------------------------------------------------ // AreUpperBitsZero: check if some previously emitted @@ -1343,6 +1507,13 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return true; } + if (id->idIsEvexNfContextSet() && IsBMIInstruction(ins)) + { + // Only a few BMI instructions shall be promoted to APX-EVEX due to the NF feature. + // TODO-XArch-APX: convert this check into a Has* form like the ones above. + return true; + } + #if defined(DEBUG) if (emitComp->DoJitStressEvexEncoding()) { @@ -1354,6 +1525,12 @@ // check above so we need to still return false here to preserve semantics.
return !HasKMaskRegisterDest(ins); } + + if (IsApxExtendedEvexInstruction(ins) && emitComp->DoJitStressPromotedEvexEncoding()) + { + // This path will be hit when we stress APX-EVEX and encode VEX instructions with extended EVEX. + return (IsBMIInstruction(ins) && HasApxNf(ins)); + } #endif // DEBUG if ((ins == INS_pslldq) || (ins == INS_psrldq)) @@ -1381,7 +1558,7 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const // TODO-xarch-apx: // At this stage, we are only using REX2 in the case that non-simd integer instructions // with EGPRs being used in its operands, it could be either direct register uses, or // memory addressing operands, i.e. index and base. instruction ins = id->idIns(); if (!IsRex2EncodableInstruction(ins)) { @@ -1408,6 +1585,54 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const return false; } +//------------------------------------------------------------------------ +// TakesApxExtendedEvexPrefix: Checks if the instruction should be legacy-promoted-evex encoded. +// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// true if this instruction requires a legacy-promoted-evex prefix. +// +bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const +{ + // TODO-XArch-APX: come back and fix the remaining cases + instruction ins = id->idIns(); + if (!IsApxExtendedEvexInstruction(ins)) + { + return false; + } + + if (IsAvx512OrPriorInstruction(ins)) + { + // This check should reject any instruction not from legacy map-0 or 1. + return false; + } + + if (id->idIsEvexNdContextSet()) + { + return true; + } + + if (id->idIsEvexNfContextSet()) + { + return true; + } + +#if defined(DEBUG) + if (emitComp->DoJitStressPromotedEvexEncoding()) + { + return true; + } +#endif // DEBUG + if (IsApxOnlyInstruction(ins)) + { + return true; + } + + return false; +} + // Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section // 2.6. // Add base EVEX prefix without setting W, R, X, or B bits @@ -1442,6 +1667,10 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const #define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL #define ZBIT_IN_BYTE_EVEX_PREFIX 0x0000008000000000ULL +#define MAP4_IN_BYTE_EVEX_PREFIX 0x4000000000000ULL +#define NDBIT_IN_BYTE_EVEX_PREFIX 0x1000000000ULL +#define NFBIT_IN_BYTE_EVEX_PREFIX 0x400000000ULL +#define EXTENDED_EVEX_PP_BITS 0x10000000000ULL //------------------------------------------------------------------------ // AddEvexPrefix: Add default EVEX prefix with only LL' bits set. // @@ -1456,7 +1685,18 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAttr attr) { // Only AVX512 instructions require EVEX prefix - assert(IsEvexEncodableInstruction(id->idIns())); + // After APX, some instructions in the legacy or VEX space will be promoted to EVEX. + instruction ins = id->idIns(); + assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + + if (instrIsExtendedReg3opImul(ins)) + { + // The only case where imul (0x68) needs an EVEX prefix is when the EVEX.NF feature is enabled. + // The imul (0x68) opcode comes with a ModR/M.REG byte to indicate implicit register use; + // when it uses extended registers (>= REG_R8), it comes with a built-in REX prefix, + // so remove that first and add the counterpart in EVEX.
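+        // (The masking below presumably keeps only the low 4 opcode bytes, dropping the built-in REX bits held above bit 31.)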
+ code &= 0xFFFFFFFF; + } // Shouldn't have already added EVEX prefix assert(!hasEvexPrefix(code)); @@ -1465,6 +1705,56 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt code |= DEFAULT_BYTE_EVEX_PREFIX; + if (IsApxExtendedEvexInstruction(ins)) + { + if (!HasEvexEncoding(ins)) + { + // Legacy-promoted instructions are not labeled with Encoding_EVEX. + code |= MAP4_IN_BYTE_EVEX_PREFIX; + } + + // TODO-XArch-apx: + // verify if it is actually safe to reuse the Evex.nd with Evex.b on instrDesc. + if (id->idIsEvexNdContextSet()) + { + code |= NDBIT_IN_BYTE_EVEX_PREFIX; + } + + if (id->idIsEvexNfContextSet()) + { + code |= NFBIT_IN_BYTE_EVEX_PREFIX; + } + + if (attr == EA_2BYTE) + { + code |= EXTENDED_EVEX_PP_BITS; + } + + if (instrIsExtendedReg3opImul(ins)) + { + // EVEX.R3 + // TODO-XArch-APX: + // A few side notes: based on how the JIT defines IMUL, we may need to extend + // the definition to `IMUL_31` to cover EGPRs. It can be defined in a similar + // way to the opcodes that come with a built-in REX2 prefix, converting to + // EVEX when needed with some helper functions. + code &= 0xFF7FFFFFFFFFFFFFULL; + } +#ifdef TARGET_AMD64 + if (IsCCMP(ins)) + { + code &= 0xFFFF87F0FFFFFFFF; + code |= ((size_t)id->idGetEvexDFV()) << 43; + code |= ((size_t)GetCCFromCCMP(ins)) << 32; + } +#endif + + return code; + } + + // No APX-promoted instructions should reach the code below. + assert(!IsApxExtendedEvexInstruction(ins)); + if (attr == EA_32BYTE) { // Set EVEX.L'L bits to 01 in case of instructions that operate on 256-bits. @@ -1559,6 +1849,12 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt default: { +#ifdef TARGET_AMD64 + if (IsCCMP(id->idIns())) // Special case for conditional ins such as CCMP, CCMOV + { + break; + } +#endif unsigned aaaContext = id->idGetEvexAaaContext(); if (aaaContext != 0) @@ -2008,6 +2304,14 @@ emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code) } } #ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + // If the instruction is not VEX/EVEX encodable, and has an EVEX prefix, + // then it is legacy promoted EVEX. + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + return emitter::code_t(code | 0x0000800000000000ULL); + } else if (hasRex2Prefix(code)) { return emitter::code_t(code | 0x000800000000ULL); @@ -2046,13 +2350,18 @@ emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code) return code & 0xFF7FFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + // R-bit is added in bit-inverted form. + return code & 0xFF7FFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50400000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4400000000ULL; } @@ -2082,13 +2391,18 @@ emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code) return code & 0xFFBFFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + assert(IsApxExtendedEvexInstruction(ins)); + // X-bit is added in bit-inverted form.
+ return code & 0xFFBFFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50200000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4200000000ULL; } @@ -2118,13 +2432,17 @@ emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code) return code & 0xFFDFFFFFFFFFFFULL; } } -#ifdef TARGET_AMD64 + else if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + // B-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFFFULL; + } else if (TakesRex2Prefix(id)) { assert(IsRex2EncodableInstruction(ins)); return code |= 0xD50100000000ULL; // REX2.B3 } -#endif // TARGET_AMD64 return code | 0x4100000000ULL; } @@ -2207,7 +2525,7 @@ bool isPrefix(BYTE b) // emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) const { - assert(IsEvexEncodableInstruction(ins)); + assert(IsEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); code_t evexPrefix = (code >> 32) & 0xFFFFFFFF; code &= 0x00000000FFFFFFFFLL; @@ -2233,6 +2551,14 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x66: { // None of the existing BMI instructions should be EVEX encoded. + // After APX, BMI instructions can be EVEX encoded with the NF feature. + if (IsBMIInstruction(ins)) + { + // If a BMI instruction reaches this part, it should be APX-EVEX. + // Although the opcodes of all the BMI instructions are defined with 0x66, + // it should not be treated as a prefix here, so skip this check. + break; + } assert(!IsBMIInstruction(ins)); evexPrefix |= (0x01 << 8); break; } @@ -2298,6 +2624,12 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co case 0x0F: { + if (((evexPrefix >> 16) & 0x07) == 0x04) + { + // MAP index equal to 4 indicates this instruction is a promoted legacy instruction. + // The MAP ID has already been set when the EVEX prefix was added. + break; + } evexPrefix |= (0x01 << 16); break; } @@ -2758,6 +3090,11 @@ unsigned emitter::emitGetRexPrefixSize(instrDesc* id, instruction ins) return 0; } + if (TakesApxExtendedEvexPrefix(id)) + { + return 0; + } + if (TakesRex2Prefix(id)) { return 0; } @@ -2868,10 +3205,20 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const adjustedSize++; } #ifdef TARGET_AMD64 - else if (IsRex2EncodableInstruction(ins)) + else if (IsRex2EncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)) { unsigned prefixAdjustedSize = 0; - if (TakesRex2Prefix(id)) + if (TakesApxExtendedEvexPrefix(id)) + { + prefixAdjustedSize = 4; + // If the opcode will be prefixed by EVEX, then all the map-1-legacy instructions can remove the escape + // prefix if (IsLegacyMap1(code)) + { + prefixAdjustedSize -= 1; + } + } + else if (TakesRex2Prefix(id)) { prefixAdjustedSize = 2; // If the opcode will be prefixed by REX2, then all the map-1-legacy instructions can remove the escape // prefix if (IsLegacyMap1(code)) { prefixAdjustedSize -= 1; } } - adjustedSize = prefixAdjustedSize; - emitAttr attr = id->idOpSize(); - - if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx) && !TakesApxExtendedEvexPrefix(id)) { // Most 16-bit operand instructions will need a 0x66 prefix.
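+            // (Promoted EVEX encodings are excluded above since they carry the 16-bit operand size in EVEX.pp; see EXTENDED_EVEX_PP_BITS.)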
- adjustedSize++; + prefixAdjustedSize++; + } + + adjustedSize = prefixAdjustedSize; } #endif // TARGET_AMD64 else @@ -2942,6 +3288,14 @@ unsigned emitter::emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexP if (includeRexPrefixSize && hasRexPrefix(code)) { + if (instrIsExtendedReg3opImul(id->idIns()) && TakesApxExtendedEvexPrefix(id)) + { + // There is a special case when calculating the size of IMUL with APX-EVEX: + // IMUL_08 and beyond have a built-in REX prefix in their opcode, so they hit + // this branch, but when IMUL is encoded with APX-EVEX, the size of the REX + // prefix is folded into the EVEX prefix size, which is calculated outside. + return 0; + } return 1; } @@ -3583,7 +3937,7 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX + *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? *code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3623,7 +3977,7 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi } if (false /*reg >= REG_R16 && reg <= REG_R31*/) { - // seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. + // Separate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. assert(TakesRex2Prefix(id)); *code |= 0x004000000000ULL; // REX2.R4 } @@ -3632,7 +3986,7 @@ { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX + *code = (hasRex2Prefix(*code) || hasEvexPrefix(*code)) ? *code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3652,7 +4006,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber instruction ins = id->idIns(); assert(reg < REG_STK); - assert(IsVexOrEvexEncodableInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); assert(hasVexOrEvexPrefix(code)); // Get 4-bit register encoding @@ -3699,6 +4053,25 @@ return code ^ regBits; } } + else + { + assert(TakesApxExtendedEvexPrefix(id)); + assert(hasEvexPrefix(code)); +#if defined(TARGET_AMD64) + // TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. + // Rather see these paths cleaned up. + regBits = HighAwareRegEncoding(reg); + + if (false /*reg >= REG_R16 && reg <= REG_R31*/) + { + // Have to set the EVEX V' bit + code = AddEvexVPrimePrefix(code); + } +#endif + // Shift count = 5-bytes of opcode + 0-2 bits for EVEX + regBits <<= 43; + return code ^ regBits; + } return code ^ regBits; } @@ -3734,7 +4107,7 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod } if (false /*reg >= REG_R16 && reg <= REG_R31*/) { - // seperate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. + // Separate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. assert(TakesRex2Prefix(id)); *code |= 0x002000000000ULL; // REX2.X4 } @@ -4130,7 +4503,9 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) if ((code & 0xFF00) != 0) { - sz += IsAvx512OrPriorInstruction(ins) ?
emitInsSize(id, code, includeRexPrefixSize) : 5; + sz += (IsAvx512OrPriorInstruction(ins) || TakesApxExtendedEvexPrefix(id)) + ? emitInsSize(id, code, includeRexPrefixSize) + : 5; } else { @@ -4258,7 +4633,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX. - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -4302,7 +4677,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -4469,7 +4844,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -5414,17 +5789,37 @@ void emitter::emitInsStoreLcl(instruction ins, emitAttr attr, GenTreeLclVarCommo // attr - the instruction operand size // dst - the destination and first source operand // src - the second source operand +// targetReg - target register of this binary node (only used for APX-NDD form) // // Assumptions: // i) caller of this routine needs to call genConsumeReg() // ii) caller of this routine needs to call genProduceReg() -regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src) +regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src, regNumber targetReg) { // We can only have one memory operand and only src can be a constant operand // However, the handling for a given operand type (mem, cns, or other) is fairly // consistent regardless of whether they are src or dst. As such, we will find // the type of each operand and only check them against src/dst where relevant. + bool useNDD = UsePromotedEVEXEncoding() && (targetReg != REG_NA); +#if !defined(TARGET_AMD64) + // APX does not support 32-bit system. + assert(!useNDD); +#else + if (useNDD) + { + assert(IsApxNDDEncodableInstruction(ins)); + // targetReg has to be an actual register if using NDD. + assert(targetReg < REG_STK); + // make sure target register is not either of the src registers. + assert(dst->isUsedFromReg()); + regNumber dstreg = dst->GetRegNum(); + regNumber srcreg = src->isUsedFromReg() ? src->GetRegNum() : REG_NA; + assert(targetReg != dstreg); + assert(targetReg != srcreg); + } +#endif + GenTree* memOp = nullptr; GenTree* cnsOp = nullptr; GenTree* otherOp = nullptr; @@ -5436,6 +5831,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(dst->isUsedFromMemory() || (dst->GetRegNum() == REG_NA) || instrIs3opImul(ins)); assert(!src->isUsedFromMemory()); + // APX code cannot hit this path. + assert(!useNDD); + memOp = dst; if (src->isContained()) @@ -5543,6 +5941,9 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(otherOp == nullptr); assert(src->IsCnsIntOrI()); + // APX code cannot hit this path. 
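+            // (The NDD form requires a register destination, and this is a store-to-memory path.)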
+ assert(!useNDD); + id = emitNewInstrAmdCns(attr, memIndir->Offset(), (int)src->AsIntConCommon()->IconValue()); } else @@ -5560,6 +5961,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G assert(id != nullptr); id->idIns(ins); // Set the instruction. + if (useNDD) + { + assert(memOp == src); + id->idReg1(targetReg); + id->idReg2(dst->GetRegNum()); + id->idSetEvexNdContext(); + } // Determine the instruction format insFormat fmt = IF_NONE; @@ -5575,12 +5983,13 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - fmt = emitInsModeFormat(ins, IF_RRD_ARD); + fmt = useNDD ? emitInsModeFormat(ins, IF_RWR_RRD_ARD) : emitInsModeFormat(ins, IF_RRD_ARD); } } else { assert(memOp == dst); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5619,6 +6028,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G else { assert(memOp == dst); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5641,7 +6051,7 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G dispIns(id); emitCurIGsize += sz; - return (memOp == src) ? dst->GetRegNum() : REG_NA; + return (memOp == src) ? (useNDD ? targetReg : dst->GetRegNum()) : REG_NA; } } } @@ -5689,15 +6099,24 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - // src is a stack based local variable - // dst is a register - emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset); + if (useNDD) + { + emitIns_R_R_S(ins, attr, targetReg, dst->GetRegNum(), varNum, offset, INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + // src is a stack based local variable + // dst is a register + emitIns_R_S(ins, attr, dst->GetRegNum(), varNum, offset); + } } } else { assert(memOp == dst); assert((dst->GetRegNum() == REG_NA) || dst->IsRegOptional()); + assert(!useNDD); if (cnsOp != nullptr) { @@ -5729,10 +6148,20 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G { assert(!dst->isContained()); GenTreeIntConCommon* intCns = src->AsIntConCommon(); - emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue()); + + if (useNDD) + { + emitIns_R_R_I(ins, attr, targetReg, dst->GetRegNum(), (int)intCns->IconValue(), INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + emitIns_R_I(ins, attr, dst->GetRegNum(), intCns->IconValue()); + } } else { + assert(!useNDD); assert(src->IsCnsFltOrDbl()); GenTreeDblCon* dblCns = src->AsDblCon(); @@ -5751,7 +6180,15 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G } else { - emitIns_R_R(ins, attr, dst->GetRegNum(), src->GetRegNum()); + if (useNDD) + { + emitIns_R_R_R(ins, attr, targetReg, dst->GetRegNum(), src->GetRegNum(), INS_OPTS_EVEX_nd); + return targetReg; + } + else + { + emitIns_R_R(ins, attr, dst->GetRegNum(), src->GetRegNum()); + } } } @@ -5902,7 +6339,7 @@ void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeI * Add an instruction referencing a single register. 
*/ -void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) +void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions /* = INS_OPTS_NONE */) { emitAttr size = EA_SIZE(attr); @@ -5978,6 +6415,8 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idInsFmt(fmt); id->idReg1(reg); + SetEvexNfIfNeeded(id, instOptions); + // Vex bytes sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); @@ -6050,10 +6489,11 @@ void emitter::emitStoreSimd12ToLclOffset(unsigned varNum, unsigned offset, regNu * Add an instruction referencing a register and a constant. */ -void emitter::emitIns_R_I(instruction ins, - emitAttr attr, - regNumber reg, - ssize_t val DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags)) +void emitter::emitIns_R_I(instruction ins, + emitAttr attr, + regNumber reg, + ssize_t val, + insOpts instOptions DEBUGARG(size_t targetHandle) DEBUGARG(GenTreeFlags gtFlags)) { emitAttr size = EA_SIZE(attr); @@ -6193,6 +6633,9 @@ void emitter::emitIns_R_I(instruction ins, id->idDebugOnlyInfo()->idMemCookie = targetHandle; #endif + SetEvexNfIfNeeded(id, instOptions); + SetEvexDFVIfNeeded(id, instOptions); + if (isSimdInsAndValInByte) { bool includeRexPrefixSize = true; @@ -6206,8 +6649,14 @@ void emitter::emitIns_R_I(instruction ins, sz += emitInsSize(id, insCodeMI(ins), includeRexPrefixSize); } - sz += emitGetAdjustedSize(id, insCodeMI(ins)); +#ifdef TARGET_AMD64 + if (reg == REG_EAX && !instrIs3opImul(ins) && TakesApxExtendedEvexPrefix(id)) + { + // ACC form is not promoted into EVEX space, need to emit with MI form. + sz += 1; + } +#endif // TARGET_AMD64 // Do we need a REX prefix for AMD64? We need one if we are using any extended register (REX.R), or if we have a // 64-bit sized operand (REX.W). Note that IMUL in our encoding is special, with a "built-in", implicit, target @@ -6981,6 +7430,15 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum id->idReg1(reg1); id->idReg2(reg2); + SetEvexNdIfNeeded(id, instOptions); + SetEvexNfIfNeeded(id, instOptions); + SetEvexDFVIfNeeded(id, instOptions); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + id->idInsFmt(IF_RWR_RRD); + } + if ((instOptions & INS_OPTS_EVEX_b_MASK) != INS_OPTS_NONE) { // if EVEX.b needs to be set in this path, then it should be embedded rounding. @@ -7034,6 +7492,30 @@ void emitter::emitIns_R_R_I( assert((instOptions & INS_OPTS_EVEX_b_MASK) == 0); SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexNdIfNeeded(id, instOptions); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // need to fix the instruction opcode for legacy instructions, as they have a different opcode for the RI form. + code = insCodeMI(ins); + // need to fix the instruction format for NDD legacy instructions.
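+        // Shift-by-immediate instructions take the dedicated IF_RWR_RRD_SHF format; other immediates use IF_RWR_RRD_CNS.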
+ insFormat fmt; + switch (ins) + { + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + case INS_ror_N: + case INS_rol_N: + fmt = IF_RWR_RRD_SHF; + break; + + default: + fmt = IF_RWR_RRD_CNS; + break; + } + id->idInsFmt(fmt); + } UNATIVE_OFFSET sz = emitInsSizeRR(id, code, ival); id->idCodeSize(sz); @@ -7045,7 +7527,7 @@ void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs) { assert(ins == INS_prefetcht0 || ins == INS_prefetcht1 || ins == INS_prefetcht2 || ins == INS_prefetchnta || - ins == INS_inc || ins == INS_dec); + ins == INS_inc || ins == INS_dec || ins == INS_inc_no_evex || ins == INS_dec_no_evex); instrDesc* id = emitNewInstrAmd(attr, offs); @@ -7398,8 +7880,8 @@ void emitter::emitIns_R_R_C(instruction ins, void emitter::emitIns_R_R_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, insOpts instOptions) { - assert(IsAvx512OrPriorInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); instrDesc* id = emitNewInstr(attr); id->idIns(ins); @@ -7415,6 +7897,14 @@ void emitter::emitIns_R_R_R( id->idSetEvexbContext(instOptions); } SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexNdIfNeeded(id, instOptions); + SetEvexNfIfNeeded(id, instOptions); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // need to fix the instruction format for NDD legacy instructions. + id->idInsFmt(IF_RWR_RRD_RRD); + } UNATIVE_OFFSET sz = emitInsSizeRR(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -7426,8 +7916,8 @@ void emitter::emitIns_R_R_S( instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions) { - assert(IsAvx512OrPriorInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsApxExtendedEvexInstruction(ins)); instrDesc* id = emitNewInstr(attr); @@ -7439,6 +7929,12 @@ void emitter::emitIns_R_R_S( SetEvexBroadcastIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexNdIfNeeded(id, instOptions); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + id->idInsFmt(IF_RWR_RRD_SRD); + } #ifdef DEBUG id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; @@ -7852,6 +8348,7 @@ void emitter::emitIns_R_C( { SetEvexBroadcastIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexDFVIfNeeded(id, instOptions); sz = emitInsSizeCV(id, insCodeRM(ins)); } @@ -9689,6 +10186,8 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int va SetEvexBroadcastIfNeeded(id, instOptions); SetEvexEmbMaskIfNeeded(id, instOptions); + SetEvexNfIfNeeded(id, instOptions); + SetEvexDFVIfNeeded(id, instOptions); UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs); id->idCodeSize(sz); @@ -11393,6 +11892,13 @@ void emitter::emitDispEmbRounding(instrDesc* id) const { return; } + + if (IsApxExtendedEvexInstruction(id->idIns())) + { + // APX-EVEX.nd shares the same bit(s) as EVEX.b; + // for the NDD case, we don't need to display anything special.
+ return; + } assert(!id->idHasMem()); unsigned roundingMode = id->idGetEvexbContext(); if (roundingMode == 1) @@ -11573,9 +12079,38 @@ void emitter::emitDispIns( /* Display the instruction name */ +#ifdef TARGET_AMD64 + if (IsApxNFEncodableInstruction(id->idIns()) && id->idIsEvexNfContextSet()) + { + // print the EVEX.NF indication in pseudo-prefix style. + printf("{nf} "); + } +#endif // TARGET_AMD64 + sstr = codeGen->genInsDisplayName(id); printf(" %-9s", sstr); +#ifdef TARGET_AMD64 + if (IsCCMP(id->idIns())) + { + // print finite set notation for DFV + unsigned dfv = id->idGetEvexDFV(); + char dfvstr[20] = {0}; + int len = 0; + if (dfv & INS_FLAGS_OF) + len += snprintf(dfvstr + len, 4, "of,"); + if (dfv & INS_FLAGS_SF) + len += snprintf(dfvstr + len, 4, "sf,"); + if (dfv & INS_FLAGS_ZF) + len += snprintf(dfvstr + len, 4, "zf,"); + if (dfv & INS_FLAGS_CF) + len += snprintf(dfvstr + len, 4, "cf,"); + if (len) + dfvstr[len - 1] = 0; + printf("{dfv=%s} ", dfvstr); + } +#endif // TARGET_AMD64 + #ifndef HOST_UNIX if (strnlen_s(sstr, 10) >= 9) #else // HOST_UNIX @@ -12323,6 +12858,20 @@ void emitter::emitDispIns( break; } + case INS_rol: + case INS_ror: + case INS_rcl: + case INS_rcr: + case INS_shl: + case INS_shr: + case INS_sar: + { + printf("%s", emitRegName(id->idReg1(), attr)); + printf(", %s", emitRegName(id->idReg2(), attr)); + emitDispShift(ins, (BYTE)0); + break; + } + default: { printf("%s", emitRegName(id->idReg1(), attr)); @@ -12340,8 +12889,8 @@ void emitter::emitDispIns( case IF_RRW_RRD_RRD: case IF_RWR_RWR_RRD: { - assert(IsVexOrEvexEncodableInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins) || IsApxExtendedEvexInstruction(ins)); regNumber reg2 = id->idReg2(); regNumber reg3 = id->idReg3(); @@ -12568,6 +13117,19 @@ void emitter::emitDispIns( break; } + case IF_RWR_RRD_SHF: + { + assert(IsApxExtendedEvexInstruction(id->idIns())); + printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr)); + + emitGetInsCns(id, &cnsVal); + val = cnsVal.cnsVal; + + emitDispShift(ins, (BYTE)val); + + break; + } + case IF_RRD_MRD: case IF_RWR_MRD: case IF_RRW_MRD: @@ -13516,12 +14078,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) break; case EA_2BYTE: - - /* Output a size prefix for a 16-bit operand */ - - dst += emitOutputByte(dst, 0x66); - + { + // Output a size prefix for a 16-bit operand + if (TakesApxExtendedEvexPrefix(id)) + { + assert(IsApxExtendedEvexInstruction(ins)); + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix.
+ assert((code & EXTENDED_EVEX_PP_BITS) != 0); + } + else + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; + } case EA_4BYTE: #ifdef TARGET_AMD64 @@ -13565,7 +14136,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -14103,6 +14674,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); break; + case IF_RWR_RRD_ARD: + assert(((id->idGCref() == GCT_BYREF) && + (ins == INS_add || ins == INS_sub || ins == INS_sub_hide || insIsCMOV(ins))) || + ((id->idGCref() == GCT_GCREF) && insIsCMOV(ins))); + assert(id->idIsEvexNdContextSet()); + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + break; + case IF_ARD_RRD: case IF_AWR_RRD: break; @@ -14349,14 +14928,24 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) switch (size) { case EA_1BYTE: +#ifdef TARGET_AMD64 + assert((ins != INS_lzcnt_evex) && (ins != INS_tzcnt_evex) && (ins != INS_popcnt_evex)); +#endif // TARGET_AMD64 break; case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + { + if (!TakesApxExtendedEvexPrefix(id)) + { + dst += emitOutputByte(dst, 0x66); + } + } FALLTHROUGH; case EA_4BYTE: + code |= 0x01; + break; #ifdef TARGET_AMD64 case EA_8BYTE: #endif // TARGET_AMD64 @@ -14365,9 +14954,21 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) * Note that incrementing "code" for INS_call (0xFF) would * overflow, whereas setting the lower bit to 1 just works out */ - - code |= 0x01; - break; + { + if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + code = AddRexWPrefix(id, code); + } +#ifdef TARGET_AMD64 + if ((ins != INS_lzcnt_evex) && (ins != INS_tzcnt_evex) && (ins != INS_popcnt_evex)) + // These instructions do not support 1-byte inputs and the opcode is exact. +#endif // TARGET_AMD64 + { + code |= 0x01; + } + break; + } #ifdef TARGET_X86 case EA_8BYTE: @@ -14401,7 +15002,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -14455,7 +15056,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, to which the remainder of the emitter logic should handle properly. 
// TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -14602,6 +15203,15 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); break; + case IF_RWR_RRD_SRD: // Register Read/Write, Stack Read (So we need to update GC live for register) + + // reg could have been a GCREF as GCREF + int=BYREF + // or BYREF+/-int=BYREF + assert(id->idGCref() == GCT_BYREF && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide)); + assert(id->idIsEvexNdContextSet()); + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + break; + case IF_SRW_CNS: case IF_SRW_RRD: case IF_SRW_RRW: @@ -15187,7 +15797,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Can't use the compact form, use the long form ins = (instruction)(ins + 1); - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); @@ -15200,10 +15810,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRex2Prefix(id)) - { - code = AddRex2Prefix(ins, code); - } + code = AddX86PrefixIfNeeded(id, code, size); if (TakesRexWPrefix(id)) { @@ -15338,15 +15945,16 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) default: assert(id->idGCref() == GCT_NONE); - - code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); + code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, reg, size, code); if (size != EA_1BYTE) { // Set the 'w' bit to get the large version code |= 0x1; - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); @@ -15491,7 +16099,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } #ifdef FEATURE_HW_INTRINSICS else if ((ins == INS_bsf) || (ins == INS_bsr) || (ins == INS_crc32) || (ins == INS_lzcnt) || (ins == INS_popcnt) || - (ins == INS_tzcnt)) + (ins == INS_tzcnt) +#ifdef TARGET_AMD64 + || (ins == INS_lzcnt_evex) || (ins == INS_tzcnt_evex) || (ins == INS_popcnt_evex) +#endif // TARGET_AMD64 + ) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); @@ -15502,7 +16114,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) code |= 0x0100; } - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { assert(ins == INS_crc32); dst += emitOutputByte(dst, 0x66); @@ -15515,15 +16127,22 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) #endif // FEATURE_HW_INTRINSICS else { - assert(!TakesSimdPrefix(id)); + // TODO-XArch-APX: + // Ruihan: + // some instructions with an NDD form might go into this path with an EVEX prefix. + // might consider having a separate path with checks like TakesApxExtendedEvexPrefix; + // essentially, we need to make the priority and necessity of REX2 and EVEX clear: + // REX2 is needed iff EGPRs are involved. + // EVEX is needed when NDD, NF or other features are involved. + // So the logic should be: + // check if those new features are used, then check if EGPRs are involved. + // EGPRs are supported by EVEX anyway, so there is no need to check for them in the first place.
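+            // A sketch of that decision order (illustrative pseudo-helpers, not emitter APIs):
+            //
+            //     if (usesNDD(id) || usesNF(id) || usesDFV(id)) // EVEX-only features
+            //         emit promoted EVEX;  // EGPRs encode fine under EVEX too
+            //     else if (usesEGPR(id))
+            //         emit REX2;
+            //     else
+            //         emit the legacy encoding;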
+ assert(!TakesSimdPrefix(id) || TakesApxExtendedEvexPrefix(id)); code = insCodeMR(ins); - if (TakesRex2Prefix(id)) - { - code = AddRex2Prefix(ins, code); - } + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeMRreg(id, code); - if (ins != INS_test) + if (ins != INS_test && !IsShiftInstruction(ins)) { code |= 2; } @@ -15537,11 +16156,31 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); - FALLTHROUGH; + if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix. + assert((code & EXTENDED_EVEX_PP_BITS) != 0); + } + else + { + dst += emitOutputByte(dst, 0x66); + } + + code |= 0x1; + break; case EA_4BYTE: // Set the 'w' bit to get the large version + +#ifdef TARGET_AMD64 + if (TakesApxExtendedEvexPrefix(id)) + { + assert(hasEvexPrefix(code)); + // Evex.pp should already be added when adding the prefix + assert((code & EXTENDED_EVEX_PP_BITS) == 0); + } +#endif code |= 0x1; break; @@ -15588,10 +16227,24 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); - regCode |= insEncodeReg012(id, regFor012Bits, size, &code); + unsigned regCode; + if (!id->idIsEvexNdContextSet() || !IsApxNDDEncodableInstruction(ins)) + { + regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); + } + else + { + // unary ins with NDD form use Evex.vvvvv for dst, and ModRM.rm for src + code = insEncodeReg3456(id, reg1, size, code); + regCode = insEncodeReg012(id, reg2, size, &code); + } +#ifdef TARGET_AMD64 + if (TakesSimdPrefix(id) && !IsCCMP(ins)) +#else if (TakesSimdPrefix(id)) +#endif { // In case of AVX instructions that take 3 operands, we generally want to encode reg1 // as first source. In this case, reg1 is both a source and a destination. 
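To make the NDD register-register layout above concrete, here is a hedged worked example (the opcode comes from the instruction table later in this change; the EVEX payload bytes are elided):

//   not r12d, r11d            ; NDD form: r12d = ~r11d
//     EVEX(nd=1, vvvv = ^r12d)  F7  ModRM(mod=11, reg=/2, r/m=r11d)
// i.e. reg1 (the destination) is packed into EVEX.vvvv by insEncodeReg3456 and
// reg2 (the source) into ModRM.r/m by insEncodeReg012, as in the code above.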
@@ -15647,6 +16300,11 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); } + else if (IsApxNDDEncodableInstruction(ins) && id->idIsEvexNdContextSet()) + { + dst += emitOutputByte(dst, (code & 0xFF)); + dst += emitOutputByte(dst, (0xC0 | regCode | (code >> 8))); + } else { dst += emitOutputWord(dst, code); @@ -15849,8 +16507,9 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code_t code; instruction ins = id->idIns(); - assert(IsVexOrEvexEncodableInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins) || isAvx512Blendv(ins) || IsKInstruction(ins) || + IsApxExtendedEvexInstruction(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); regNumber src2 = id->idReg3(); @@ -15859,6 +16518,51 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code = insCodeRM(ins); code = AddX86PrefixIfNeeded(id, code, size); + if (IsApxExtendedEvexInstruction(ins) && !IsBMIInstruction(ins)) + { + // TODO-XArch-apx: + // For rm-like operand encoding instructions: + // the legacy promoted EVEX encoding introduces different semantics: + // op1 - vvvvv + // op2 - MODRM.REG + // op3 - MODRM.R/M + regNumber tmp = src1; + src1 = targetReg; + targetReg = tmp; + + switch (size) + { + case EA_1BYTE: + // TODO-APX : verify we should never end up here. At least for the instructions I have looked into, we + // promote to int to do the operation + noway_assert(RBM_BYTE_REGS & genRegMask(src1)); + noway_assert(RBM_BYTE_REGS & genRegMask(src2)); + noway_assert(RBM_BYTE_REGS & genRegMask(targetReg)); + break; + + case EA_2BYTE: + case EA_4BYTE: + // Set the 'w' bit to get the large version + code = insIsCMOV(ins) ? code : (code | (0x01)); + break; + +#ifdef TARGET_AMD64 + case EA_8BYTE: + // TODO-AMD64-CQ: Better way to not emit REX.W when we don't need it + // Don't need to zero out the high bits explicitly + code = AddRexWPrefix(id, code); // TODO-APX : Revisit. Do xor or other cases need to be handled + // differently? See emitOutputRR. + // Set the 'w' bit to get the large version + code = insIsCMOV(ins) ? code : (code | (0x01)); + break; + +#endif // TARGET_AMD64 + + default: + assert(!"unexpected size"); + } + } + code = insEncodeRMreg(id, code); if (TakesRexWPrefix(id)) { @@ -15906,7 +16610,75 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) dst += emitOutputByte(dst, (0xC0 | regCode)); } - noway_assert(!id->idGCref()); + // noway_assert(!id->idGCref()); + if (id->idGCref()) + { + assert(IsApxExtendedEvexInstruction(ins)); + assert(id->idInsFmt() == IF_RWR_RRD_RRD); + switch (id->idIns()) + { + /* + This must be one of the following cases: + + xor reg, reg to assign NULL + + and r1 , r2 if (ptr1 && ptr2) ... + or r1 , r2 if (ptr1 || ptr2) ... + + add r1 , r2 to compute a normal byref + sub r1 , r2 to compute a strange byref (VC only) + + */ + case INS_xor: + assert(src1 == src2); + emitGCregLiveUpd(id->idGCref(), targetReg, dst); + break; + + case INS_or: + case INS_and: + emitGCregDeadUpd(targetReg, dst); + break; + + case INS_add: + case INS_sub: + case INS_sub_hide: + assert(id->idGCref() == GCT_BYREF); + +#if 0 +#ifdef DEBUG + // Due to elided register moves, we can't have the following assert.
+ // For example, consider: + // t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx + // /--* t85 byref + // * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx + // Here, V01 is type `long` on entry, then is stored as a byref. But because + // the register allocator assigned the same register, no instruction was + // generated, and we only (currently) make gcref/byref changes in emitter GC info + // when an instruction is generated. We still generate correct GC info, as this + // instruction, if writing a GC ref even through reading a long, will go live here. + // These situations typically occur due to unsafe casting, such as with Span. + + regMaskTP regMask; + regMask = genRegMask(src1) | genRegMask(src2); + + // r1/r2 could have been a GCREF as GCREF + int=BYREF + // or BYREF+/-int=BYREF + assert(((regMask & emitThisGCrefRegs) && (ins == INS_add)) || + ((regMask & emitThisByrefRegs) && (ins == INS_add || ins == INS_sub || ins == INS_sub_hide))); +#endif // DEBUG +#endif // 0 + + // Mark r1 as holding a byref + emitGCregLiveUpd(GCT_BYREF, targetReg, dst); + break; + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC reg update instruction"); + } + } if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) { @@ -16089,6 +16861,12 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) useACC = true; } } + + if (TakesApxExtendedEvexPrefix(id)) + { + // ACC form does not support promoted EVEX. + useACC = false; + } } else { @@ -16144,7 +16922,10 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) case EA_2BYTE: // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); + if (!TakesApxExtendedEvexPrefix(id)) + { + dst += emitOutputByte(dst, 0x66); + } FALLTHROUGH; case EA_4BYTE: @@ -16822,7 +17603,23 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id)); + assert(TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)); + + if (!hasTupleTypeInfo(id->idIns())) + { + // With APX, some instructions with APX features are promoted + // to APX-EVEX; we re-use the existing displacement emitting + // path, but for those instructions with no tuple information, + // APX-EVEX treats the scaling factor as a constant 1. + instruction ins = id->idIns(); + // TODO-XArch-APX: + // This assert may need tweaking if BMI1 instructions are promoted + // into EVEX for multiple features; currently only EVEX.NF.
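+        // A worked illustration (hypothetical displacement values): with dsp == 0x100,
+        // an AVX-512 instruction using a Full tuple at 32-byte vector length can still
+        // take the disp8 form because 0x100 / 32 == 8 fits in a signed byte, but a
+        // promoted APX instruction cannot, since its scale factor is fixed at 1 and
+        // 0x100 itself does not fit in a signed byte; hence the plain range test below.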
+ assert(IsApxExtendedEvexInstruction(id->idIns())); + *dspInByte = ((signed char)dsp == (ssize_t)dsp); + return dsp; + } + insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -17477,7 +18274,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Output a size prefix for a 16-bit operand - if (size == EA_2BYTE) + if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) { dst += emitOutputByte(dst, 0x66); } @@ -17493,6 +18290,37 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) break; } + case IF_RWR_RRD_SHF: + { + assert(IsApxExtendedEvexInstruction(ins)); + code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); + + // set the W bit + if (size != EA_1BYTE) + { + code |= 1; + } + + // Emit the REX prefix if it exists + if (TakesRexWPrefix(id)) + { + code = AddRexWPrefix(id, code); + } + + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + dst += emitOutputWord(dst, code); + dst += emitOutputByte(dst, emitGetInsSC(id)); + sz = emitSizeOfInsDsc_CNS(id); + + // Update GC info. + assert(!id->idGCref()); + emitGCregDeadUpd(id->idReg1(), dst); + break; + } + case IF_RRD_RRD: case IF_RWR_RRD: case IF_RRW_RRD: @@ -17566,7 +18394,105 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // Also, determine which operand goes where in the ModRM byte. regNumber mReg; regNumber rReg; - if (hasCodeMR(ins)) + if (IsApxExtendedEvexInstruction(ins)) + { + assert(hasCodeMI(ins)); + code = insCodeMI(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + mReg = id->idReg2(); + code = insEncodeMIreg(id, mReg, size, code); + rReg = REG_NA; + ssize_t val = emitGetInsSC(id); + bool valInByte = ((signed char)val == (target_ssize_t)val) && (ins != INS_mov) && (ins != INS_test); + + switch (size) + { + case EA_1BYTE: + break; + + case EA_2BYTE: + code |= EXTENDED_EVEX_PP_BITS; + FALLTHROUGH; + + case EA_4BYTE: + code |= 1; + break; + +#ifdef TARGET_AMD64 + case EA_8BYTE: + code = AddRexWPrefix(id, code); + code |= 1; + break; +#endif // TARGET_AMD64 + + default: + assert(!"unexpected size"); + } + + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + + if (valInByte && size > EA_1BYTE) + { + code |= 2; + dst += emitOutputWord(dst, code); + dst += emitOutputByte(dst, val); + } + else + { + dst += emitOutputWord(dst, code); + switch (size) + { + case EA_1BYTE: + dst += emitOutputByte(dst, val); + break; + case EA_2BYTE: + dst += emitOutputWord(dst, val); + break; + case EA_4BYTE: + dst += emitOutputLong(dst, val); + break; +#ifdef TARGET_AMD64 + case EA_8BYTE: + dst += emitOutputLong(dst, val); + break; +#endif // TARGET_AMD64 + default: + break; + } + + if (id->idIsCnsReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)(size_t)val, IMAGE_REL_BASED_HIGHLOW); + assert(size == EA_4BYTE); + } + } + + sz = emitSizeOfInsDsc_CNS(id); + + if (!emitInsCanOnlyWriteSSE2OrAVXReg(id)) + { + emitGCregDeadUpd(id->idReg1(), dst); + } + + switch (id->idInsFmt()) + { + case IF_RWR_RRD_CNS: + assert(!instrIs3opImul(ins)); + + emitGCregDeadUpd(id->idReg1(), dst); + break; + + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected GC ref instruction format"); + } + + break; + } + else if (hasCodeMR(ins)) { code = insCodeMR(ins); // Emit the VEX prefix if it exists @@ 
-17801,6 +18727,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); + if (id->idIsEvexNdContextSet() && TakesApxExtendedEvexPrefix(id)) + { + // TODO-XArch-apx: + // Ruihan: I'm not sure why instructions on this path can have an instruction + // format other than IF_RWR_RRD_ARD; fixed here for debug purposes only, + // needs revisiting. + id->idInsFmt(IF_RWR_RRD_ARD); + + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8); + dst = emitOutputAM(dst, id, code | regcode); + + sz = emitSizeOfInsDsc_AMD(id); + break; + } + if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Special case 4-byte AVX instructions as the @@ -18068,7 +19011,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_RRD_SRD: case IF_RWR_RWR_SRD: { - assert(IsVexOrEvexEncodableInstruction(ins)); + assert(IsVexOrEvexEncodableInstruction(ins) || IsApxExtendedEvexInstruction(ins)); + + if (id->idIsEvexNdContextSet() && IsApxNDDEncodableInstruction(ins)) + { + // EVEX.vvvv has different semantics for APX-EVEX NDD instructions. + code = insCodeRM(ins); + code = AddX86PrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg1(), size, code); + regcode = (insEncodeReg345(id, id->idReg2(), size, &code) << 8); + dst = emitOutputSV(dst, id, code | regcode); + sz = sizeof(instrDesc); + break; + } code = insCodeRM(ins); code = AddX86PrefixIfNeeded(id, code, size); @@ -18895,7 +19850,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_inc: + case INS_inc_no_evex: case INS_dec: + case INS_dec_no_evex: case INS_neg: case INS_not: if (memFmt == IF_NONE) { @@ -18937,6 +19894,26 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cmovge: case INS_cmovle: case INS_cmovg: +#ifdef TARGET_AMD64 + // todo-xarch-apx: we need to double-check the logic for ccmp + case INS_ccmpo: + case INS_ccmpno: + case INS_ccmpb: + case INS_ccmpae: + case INS_ccmpe: + case INS_ccmpne: + case INS_ccmpbe: + case INS_ccmpa: + case INS_ccmps: + case INS_ccmpns: + case INS_ccmpt: + case INS_ccmpf: + case INS_ccmpl: + case INS_ccmpge: + case INS_ccmple: + case INS_ccmpg: +#endif + if (memFmt == IF_NONE) { result.insThroughput = PERFSCORE_THROUGHPUT_4X; } @@ -18980,10 +19957,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_add: + case INS_add_no_evex: case INS_sub: case INS_sub_hide: case INS_and: + case INS_and_no_evex: case INS_or: + case INS_or_no_evex: case INS_xor: if (memFmt == IF_NONE) { @@ -19133,6 +20113,8 @@ break; case IF_RRW: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. + case IF_RWR_RRD: // ins reg, cl result.insThroughput = PERFSCORE_THROUGHPUT_2C; result.insLatency = PERFSCORE_LATENCY_2C; @@ -19160,6 +20142,8 @@ switch (insFmt) { case IF_RRW: + // TODO-XArch-APX: to be verified if this data is correct for NDD form. + case IF_RWR_RRD: // ins reg, 1 result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; @@ -19193,6 +20177,8 @@ switch (insFmt) { case IF_RRW_SHF: + // TODO-XArch-APX: to be verified if this data is correct for NDD form.
+ case IF_RWR_RRD_SHF: // ins reg, cns result.insThroughput = PERFSCORE_THROUGHPUT_2X; break; @@ -20128,6 +21114,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vshuff64x2: case INS_vshufi32x4: case INS_vshufi64x2: +#ifdef TARGET_AMD64 + case INS_popcnt_evex: + case INS_lzcnt_evex: + case INS_tzcnt_evex: +#endif // TARGET_AMD64 { result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; @@ -20660,7 +21651,6 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insThroughput = PERFSCORE_THROUGHPUT_1C; break; } - default: // unhandled instruction insFmt combination perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 5f820c7c022c20..ea3cedeacc3743 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -128,15 +128,22 @@ static bool IsAVXVNNIInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); static bool IsKInstruction(instruction ins); static bool IsKInstructionWithLBit(instruction ins); +static bool IsApxOnlyInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); bool HasVexEncoding(instruction ins) const; bool HasEvexEncoding(instruction ins) const; bool HasRex2Encoding(instruction ins) const; +bool HasApxNdd(instruction ins) const; +bool HasApxNf(instruction ins) const; bool IsVexEncodableInstruction(instruction ins) const; bool IsEvexEncodableInstruction(instruction ins) const; bool IsRex2EncodableInstruction(instruction ins) const; +bool IsApxNDDEncodableInstruction(instruction ins) const; +bool IsApxNFEncodableInstruction(instruction ins) const; +bool IsApxExtendedEvexInstruction(instruction ins) const; +bool IsShiftInstruction(instruction ins) const; bool IsLegacyMap1(code_t code) const; bool IsVexOrEvexEncodableInstruction(instruction ins) const; @@ -332,6 +339,18 @@ void SetUseRex2Encoding(bool value) useRex2Encodings = value; } +// Is Promoted EVEX encoding supported. +bool usePromotedEVEXEncodings; +bool UsePromotedEVEXEncoding() const +{ + return usePromotedEVEXEncodings; +} + +void SetUsePromotedEVEXEncoding(bool value) +{ + usePromotedEVEXEncodings = value; +} + //------------------------------------------------------------------------ // UseSimdEncoding: Returns true if either VEX or EVEX encoding is supported // contains Evex prefix. @@ -349,6 +368,7 @@ bool UseSimdEncoding() const #define EVEX_PREFIX_CODE 0x6200000000000000ULL bool TakesEvexPrefix(const instrDesc* id) const; +bool TakesApxExtendedEvexPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasEvexPrefix: Returns true if the instruction encoding already @@ -405,11 +425,7 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) // code_t AddX86PrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) { - // TODO-xarch-apx: - // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality - // of these functions are overlapping. - - if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { return AddEvexPrefix(id, code, size); } @@ -445,7 +461,7 @@ code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitA // consider refactor this part with AddSimdPrefixIfNeeded as a lot of functionality // of these functions are overlapping. 
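+    // (Editorial note, an assumption drawn from the changes to these two helpers: both
+    // now share the test TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id), so an
+    // instruction needing any promoted-EVEX feature (nd/nf/dfv) receives the EVEX
+    // prefix even without SIMD operands, while REX2 remains the fallback for plain
+    // EGPR uses.)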
- if (TakesEvexPrefix(id)) + if (TakesEvexPrefix(id) || TakesApxExtendedEvexPrefix(id)) { return !hasEvexPrefix(code) ? AddEvexPrefix(id, code, size) : code; } @@ -511,6 +527,65 @@ void SetEvexEmbMaskIfNeeded(instrDesc* id, insOpts instOptions) } } +//------------------------------------------------------------------------ +// SetEvexNdIfNeeded: set NDD form - new data destination if needed. +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexNdIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_nd_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsApxNDDEncodableInstruction(id->idIns())); + id->idSetEvexNdContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_nd_MASK) == 0); + } +} + +//------------------------------------------------------------------------ +// SetEvexNfIfNeeded: set Evex.nf on instrDesc +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexNfIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_nf_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsApxNFEncodableInstruction(id->idIns())); + id->idSetEvexNfContext(); + } + else + { + assert((instOptions & INS_OPTS_EVEX_nf_MASK) == 0); + } +} + +//------------------------------------------------------------------------ +// SetEvexDFVIfNeeded: set default flag values on an instrDesc +// +// Arguments: +// id - instruction descriptor +// instOptions - emit options +// +void SetEvexDFVIfNeeded(instrDesc* id, insOpts instOptions) +{ + if ((instOptions & INS_OPTS_EVEX_dfv_MASK) != 0) + { + assert(UsePromotedEVEXEncoding()); + assert(IsCCMP(id->idIns())); + id->idSetEvexDFV(instOptions); + } +} + //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix. // Check if the prefix already exists before adding.
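Given the flag layouts introduced elsewhere in this change (insCflags places CF/ZF/SF/OF in bits 0-3 and the dfv insOpts bits start at bit 8, INS_OPTS_EVEX_dfv_byte_offset), mapping a ccmp default-flag-value set onto emit options can plausibly be a direct shift. A minimal sketch under those assumptions; the helper name is hypothetical and this is not necessarily the shipped implementation:

// Sketch: translate ccmp default-flag-value bits (insCflags) into the EVEX dfv
// insOpts bits consumed by SetEvexDFVIfNeeded above; relies only on the enum
// layouts shown in this diff.
static insOpts OptsFromCFlagsSketch(insCflags flags)
{
    // CF (0x1) -> INS_OPTS_EVEX_dfv_cf (0x100), ZF (0x2) -> dfv_zf (0x200),
    // SF (0x4) -> dfv_sf (0x400), OF (0x8) -> dfv_of (0x800)
    return (insOpts)(((unsigned)flags) << INS_OPTS_EVEX_dfv_byte_offset);
}

A caller would then OR the result into the instOptions passed to emitIns_R_R or emitIns_R_I so that the dfv context lands on the instrDesc.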
@@ -623,6 +698,9 @@ static bool IsRexW1Instruction(instruction ins); static bool IsRexWXInstruction(instruction ins); static bool IsRexW1EvexInstruction(instruction ins); +static bool IsCCMP(instruction ins); +static insCC GetCCFromCCMP(instruction ins); + bool isAvx512Blendv(instruction ins) { return ins == INS_vblendmps || ins == INS_vblendmpd || ins == INS_vpblendmb || ins == INS_vpblendmd || @@ -753,7 +831,7 @@ void emitIns_Data16(); void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val); -void emitIns_R(instruction ins, emitAttr attr, regNumber reg); +void emitIns_R(instruction ins, emitAttr attr, regNumber reg, insOpts instOptions = INS_OPTS_NONE); void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs); @@ -762,7 +840,9 @@ void emitIns_A(instruction ins, emitAttr attr, GenTreeIndir* indir); void emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, - ssize_t val DEBUGARG(size_t targetHandle = 0) DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY)); + ssize_t val, + insOpts instOptions = INS_OPTS_NONE DEBUGARG(size_t targetHandle = 0) + DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY)); void emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regNumber srgReg, bool canSkip); diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index c0e147ee1ee956..30b22c5f6b76c8 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -9480,7 +9480,7 @@ struct GenTreeOpCC : public GenTreeOp #endif // DEBUGGABLE_GENTREE }; -#ifdef TARGET_ARM64 +#if defined(TARGET_ARM64) enum insCflags : unsigned { INS_FLAGS_NONE, INS_FLAGS_V, INS_FLAGS_C, INS_FLAGS_CV, INS_FLAGS_Z, INS_FLAGS_ZV, INS_FLAGS_ZC, INS_FLAGS_ZCV, INS_FLAGS_N, INS_FLAGS_NV, INS_FLAGS_NC, INS_FLAGS_NCV, INS_FLAGS_NZ, INS_FLAGS_NZV, INS_FLAGS_NZC, INS_FLAGS_NZCV, }; +#elif defined(TARGET_XARCH) +enum insCflags : unsigned +{ + INS_FLAGS_NONE = 0x0, + INS_FLAGS_CF = 0x1, + INS_FLAGS_ZF = 0x2, + INS_FLAGS_SF = 0x4, + INS_FLAGS_OF = 0x8 +}; + +// todo-apx-xarch : this data structure might not be necessary, but it is nice to have the CC +// encoded somewhere +enum insCC : unsigned +{ + INS_CC_O = 0x0, // OF = 1 + + INS_CC_NO = 0x1, // OF = 0 + + INS_CC_B = 0x2, // CF = 1 + INS_CC_C = 0x2, // CF = 1 + INS_CC_NAE = 0x2, // CF = 1 + + INS_CC_NB = 0x3, // CF = 0 + INS_CC_NC = 0x3, // CF = 0 + INS_CC_AE = 0x3, // CF = 0 + + INS_CC_E = 0x4, // ZF = 1 + INS_CC_Z = 0x4, // ZF = 1 + + INS_CC_NE = 0x5, // ZF = 0 + INS_CC_NZ = 0x5, // ZF = 0 + + INS_CC_BE = 0x6, // (CF OR ZF) = 1 + INS_CC_NA = 0x6, // (CF OR ZF) = 1 + + INS_CC_NBE = 0x7, // (CF OR ZF) = 0 + INS_CC_A = 0x7, // (CF OR ZF) = 0 + + INS_CC_S = 0x8, // (SF = 1) + + INS_CC_NS = 0x9, // (SF = 0) + + // no parity flag in ccmp/ctest + + // 0b1010 special, always evaluates to true + INS_CC_TRUE = 0xA, + + // 0b1011 special, always evaluates to false + INS_CC_FALSE = 0xB, + + INS_CC_L = 0xC, // (SF XOR OF) = 1 + INS_CC_NGE = 0xC, // (SF XOR OF) = 1 + + INS_CC_NL = 0xD, // (SF XOR OF) = 0 + INS_CC_GE = 0xD, // (SF XOR OF) = 0 + + INS_CC_LE = 0xE, // ((SF XOR OF) OR ZF) = 1 + INS_CC_NG = 0xE, // ((SF XOR OF) OR ZF) = 1 + + INS_CC_NLE = 0xF, // ((SF XOR OF) OR ZF) = 0 + INS_CC_G = 0xF, // ((SF XOR OF) OR ZF) = 0 +}; +#endif + +#if defined(TARGET_ARM64) || defined(TARGET_AMD64) struct GenTreeCCMP final : public GenTreeOpCC { @@ -9521,7 +9586,7 @@ struct GenTreeCCMP final : public GenTreeOpCC } #endif // DEBUGGABLE_GENTREE }; -#endif +#endif // defined(TARGET_ARM64) || defined(TARGET_AMD64) //------------------------------------------------------------------------ // Deferred inline functions of GenTree -- these need the subtypes above to diff --git a/src/coreclr/jit/gtlist.h
b/src/coreclr/jit/gtlist.h index 8c6c67fd6a3273..39b01cc67d82df 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -238,11 +238,14 @@ GTNODE(JCC , GenTreeCC ,0,0,GTK_LEAF|GTK_NOVALUE|DBK_NOTHI GTNODE(SETCC , GenTreeCC ,0,0,GTK_LEAF|DBK_NOTHIR) // Variant of SELECT that reuses flags computed by a previous node with the specified condition. GTNODE(SELECTCC , GenTreeOpCC ,0,0,GTK_BINOP|DBK_NOTHIR) -#ifdef TARGET_ARM64 +#if defined(TARGET_ARM64) || defined(TARGET_AMD64) +// Reusing the ARM64 CCMP instruction for AMD64 APX as well. // The arm64 ccmp instruction. If the specified condition is true, compares two // operands and sets the condition flags according to the result. Otherwise // sets the condition flags to the specified immediate value. GTNODE(CCMP , GenTreeCCMP ,0,0,GTK_BINOP|GTK_NOVALUE|DBK_NOTHIR) +#endif // defined(TARGET_ARM64) || defined(TARGET_AMD64) +#ifdef TARGET_ARM64 // Maps to arm64 csinc/cinc instruction. Computes result = condition ? op1 : op2 + 1. // If op2 is null, computes result = condition ? op1 + 1 : op1. GTNODE(SELECT_INC , GenTreeOp ,0,0,GTK_BINOP|DBK_NOTHIR) diff --git a/src/coreclr/jit/gtstructs.h b/src/coreclr/jit/gtstructs.h index 26f88d17909974..0a3fcb4c4a49e6 100644 --- a/src/coreclr/jit/gtstructs.h +++ b/src/coreclr/jit/gtstructs.h @@ -119,6 +119,9 @@ GTSTRUCT_2(CC , GT_JCC, GT_SETCC) #ifdef TARGET_ARM64 GTSTRUCT_1(CCMP , GT_CCMP) GTSTRUCT_N(OpCC , GT_SELECTCC, GT_SELECT_INCCC, GT_JCMP, GT_JTEST, GT_SELECT_INVCC, GT_SELECT_NEGCC) +#elif TARGET_AMD64 +GTSTRUCT_1(CCMP , GT_CCMP) +GTSTRUCT_3(OpCC , GT_SELECTCC, GT_JCMP, GT_JTEST) #else GTSTRUCT_3(OpCC , GT_SELECTCC, GT_JCMP, GT_JTEST) #endif diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 5ec40ea333973c..02c6a0b5d294a7 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -221,6 +221,12 @@ enum insFlags : uint64_t // APX: REX2 prefix: Encoding_REX2 = 1ULL << 44, + // APX: EVEX.ND: + INS_Flags_Has_NDD = 1ULL << 45, + + // APX: EVEX.NF: + INS_Flags_Has_NF = 1ULL << 46, + // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH INS_FLAGS_DONT_CARE = 0x00ULL, }; @@ -259,6 +265,24 @@ enum insOpts: unsigned INS_OPTS_EVEX_z_MASK = 0x20, // mask for EVEX.z related features INS_OPTS_EVEX_em_zero = 1 << 5, // Embedded mask merges with zero + + // One-bit: 0b0100_0000 + INS_OPTS_EVEX_nd_MASK = 0x40, // mask for APX-EVEX.nd related features + + INS_OPTS_EVEX_nd = 1 << 6, // NDD form for legacy instructions + + // One-bit: 0b1000_0000 + INS_OPTS_EVEX_nf_MASK = 0x80, // mask for APX-EVEX.nf related features + + INS_OPTS_EVEX_nf = 1 << 7, // NF form for legacy instructions + INS_OPTS_EVEX_dfv_byte_offset = 8, // bit offset of the first dfv flag position + + INS_OPTS_EVEX_dfv_cf = 1 << 8, + INS_OPTS_EVEX_dfv_zf = 1 << 9, + INS_OPTS_EVEX_dfv_sf = 1 << 10, + INS_OPTS_EVEX_dfv_of = 1 << 11, + + INS_OPTS_EVEX_dfv_MASK = 0xF00, }; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index fbc635ab5553b4..c273294df606df 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -58,26 +58,31 @@ INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, Encoding_REX2) INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, Encoding_REX2) -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE,
BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit |Encoding_REX2) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) +INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(inc_no_evex, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(dec_no_evex, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. 
INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, Encoding_REX2) // id nm um mr mi rm a4 tt flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(add_no_evex, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(or_no_evex, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(and_no_evex, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) // Does not affect the stack tracking in the emitter INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) 
+INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) @@ -99,25 +104,25 @@ INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, #endif INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmova, "cmova", IUM_WR, 
BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NDD | INS_Flags_Has_NF) // id nm um mr mi rm tt flags @@ -125,28 +130,30 @@ INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, // as 2-operand instructions with the target register being implicit // implicit_reg = op1*op2_icon #define INSTMUL INST3 -INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 
0x000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) #ifdef TARGET_AMD64 -INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) -INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) +INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_TT_NONE, 
Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) +INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NF) #endif // TARGET_AMD64 + + // the hex codes in this file represent the instruction encoding as follows: // 0x0000ff00 - modrm byte position // 0x000000ff - last byte of opcode (before modrm) @@ -588,11 +595,11 @@ INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BA INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // BMI1 -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Logical AND NOT -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Bit Field Extract -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Reset Lowest Set Bit +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit +INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction 
@@ -588,11 +595,11 @@ INST3(LAST_AVXVNNI_INSTRUCTION, "LAST_AVXVNNI_INSTRUCTION", IUM_WR, BAD_CODE, BA
 INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)

 // BMI1
-INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Logical AND NOT
-INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF) // Bit Field Extract
-INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Extract Lowest Set Isolated Bit
-INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Get Mask Up to Lowest Set Bit
-INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Reset Lowest Set Bit
+INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Logical AND NOT
+INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Resets_CF | INS_Flags_Has_NF) // Bit Field Extract
+INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Extract Lowest Set Isolated Bit
+INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Resets_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Get Mask Up to Lowest Set Bit
+INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit

 // BMI2
 INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position
@@ -884,6 +891,35 @@ INST3(vpmultishiftqb, "pmultishiftqb", IUM_WR, BAD_CODE, BAD_
 INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)

+// id nm um mr mi rm tt flags
+
+INST3(FIRST_APX_INSTRUCTION, "FIRST_APX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(FIRST_CCMP_INSTRUCTION, "FIRST_CCMP_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(ccmpo, "ccmpo", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpno, "ccmpno", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpb, "ccmpb", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpae, "ccmpae", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpe, "ccmpe", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpne, "ccmpne", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpbe, "ccmpbe", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpa, "ccmpa", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmps, "ccmps", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpns, "ccmpns", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpt, "ccmpt", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpf, "ccmpf", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpl, "ccmpl", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpge, "ccmpge", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmple, "ccmple", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+INST3(ccmpg, "ccmpg", IUM_RD, 0x000038, 0x003880, 0x00003A, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit)
+
+INST3(LAST_CCMP_INSTRUCTION, "LAST_CCMP_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+INST3(LAST_APX_INSTRUCTION, "LAST_APX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
+
+
+
 // Scalar instructions in SSE4.2
 INST3(crc32, "crc32", IUM_RW, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), INS_TT_NONE, INS_FLAGS_None)
@@ -899,35 +935,43 @@ INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE,
 // POPCNT
 INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2)
+#if defined(TARGET_AMD64)
+INST3(tzcnt_evex, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F4, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Count the Number of Trailing Zero Bits
+INST3(lzcnt_evex, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F5, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF)
+INST3(popcnt_evex, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x000088, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | INS_Flags_Has_NF)
+#endif // TARGET_AMD64
+
+INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD)
+
+INST3(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, 0x0000D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, 0x0008D2, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+
+INST3(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, 0x0010D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST3(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, 0x0018D2, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
+INST3(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, 0x0020D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, 0x0028D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, 0x0038D2, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+INST3(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF)
+
 // id nm um mr mi flags
 INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None)
 INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None)
 INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, Encoding_REX2)
-INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-
-INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
-INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit)
-INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-
-
 // id nm um mr flags
 INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
 INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_TT_NONE, Reads_DF | INS_FLAGS_Has_Wbit)
@@ -959,15 +1003,12 @@ INST1(leave, "leave", IUM_RD, 0x0000C9,
 INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None)

-INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2)
-
 INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None)
 INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None)
-INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit)
-INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
-INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2)
+INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF)
+INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | INS_Flags_Has_NF)
+INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF)
+INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF)

 INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF)
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 4c5fc2e8d5328e..99dade4c9eb5f1 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -369,8 +369,10 @@ RELEASE_CONFIG_INTEGER(EnableMultiRegLocals, "EnableMultiRegLocals", 1)
 RELEASE_CONFIG_INTEGER(JitNoInline, "JitNoInline", 0)

 #if defined(DEBUG)
-CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for legacy instructions.
-CONFIG_INTEGER(JitBypassAPXCheck, "JitBypassAPXCheck", 0) // Bypass APX CPUID check.
+CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for compatible instructions.
+CONFIG_INTEGER(JitStressPromotedEvexEncoding, "JitStressPromotedEvexEncoding", 0) // Enable promoted EVEX encoding for
+                                                                                  // compatible instructions.
+CONFIG_INTEGER(JitBypassApxCheck, "JitBypassApxCheck", 0) // Bypass APX CPUID check.
 #endif

 // clang-format off
@@ -440,6 +442,8 @@ RELEASE_CONFIG_INTEGER(EnableArm64Sve, "EnableArm64Sve",
 RELEASE_CONFIG_INTEGER(EnableEmbeddedBroadcast, "EnableEmbeddedBroadcast", 1) // Allows embedded broadcasts to be disabled
 RELEASE_CONFIG_INTEGER(EnableEmbeddedMasking, "EnableEmbeddedMasking", 1) // Allows embedded masking to be disabled
+RELEASE_CONFIG_INTEGER(JitEnableApxNDD, "JitEnableApxNDD", 0) // Allows APX NDD feature to be disabled
+RELEASE_CONFIG_INTEGER(JitEnableApxIfConv, "JitEnableApxIfConv", 0) // Enables APX-dependent if conversion (for testing)

 // clang-format on
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 9aee0fd99a1209..6d5694a03bb5be 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -89,7 +89,8 @@ class Lowering final : public Phase
     void ContainCheckReturnTrap(GenTreeOp* node);
     void ContainCheckLclHeap(GenTreeOp* node);
     void ContainCheckRet(GenTreeUnOp* ret);
-#ifdef TARGET_ARM64
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
+    bool ProducesPotentialConsumableFlagsForCCMP(GenTree* op);
     bool TryLowerAndOrToCCMP(GenTreeOp* tree, GenTree** next);
     insCflags TruthifyingFlags(GenCondition cond);
     void ContainCheckConditionalCompare(GenTreeCCMP* ccmp);
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 87528fee51bdfe..16e37bd190a5fc 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -320,9 +320,223 @@ GenTree* Lowering::LowerBinaryArithmetic(GenTreeOp* binOp)

     ContainCheckBinary(binOp);

+#ifdef TARGET_AMD64
+    if (JitConfig.JitEnableApxIfConv())
+    {
+        if (binOp->OperIs(GT_AND, GT_OR))
+        {
+            GenTree* next;
+            if (TryLowerAndOrToCCMP(binOp, &next))
+            {
+                return next;
+            }
+        }
+    }
+#endif // TARGET_AMD64
+
     return binOp->gtNext;
 }

+#ifdef TARGET_AMD64
+
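For orientation, the flag-producing shapes the helper below accepts and rejects, sketched in LIR-like pseudocode (illustrative only, not taken from the diff):

// Accepted: an integer relop; it lowers to cmp/test and defines OF/SF/ZF/CF.
//   t1 = LT(int x, int y)
// Accepted: a SETCC; TryLowerConditionToFlagsNode can recover its flags def.
//   t2 = SETCC(cond)
// Rejected: a float relop; on xarch it needs two flag checks (e.g. jp + je),
// so there is no single condition for a ccmp to key off.
//   t3 = EQ(float a, float b)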
+//------------------------------------------------------------------------
+// ProducesPotentialConsumableFlagsForCCMP : Checks if an op will set
+// the condition flags in a form that ccmp may use (either an integer relop
+// or a setcc).
+//
+// Arguments:
+//    op - tree to check for consumable flags
+//
+// Return Value:
+//    true if the flags output of the op can be consumed by ccmp
+//
+bool Lowering::ProducesPotentialConsumableFlagsForCCMP(GenTree* op)
+{
+    if (op->OperIsCompare())
+    {
+        GenTreeOp* relop = op->AsOp();
+        return !GenCondition::FromRelop(relop).IsFloat();
+    }
+
+    if (op->OperIs(GT_SETCC))
+    {
+        return true;
+    }
+
+    return false;
+}
+
+//------------------------------------------------------------------------
+// TryLowerAndOrToCCMP : Lower AND/OR of two conditions into test + CCMP + SETCC nodes.
+//
+// Arguments:
+//    tree - pointer to the node
+//    next - [out] Next node to lower if this function returns true
+//
+// Return Value:
+//    false if no changes were made
+//
+bool Lowering::TryLowerAndOrToCCMP(GenTreeOp* tree, GenTree** next)
+{
+    assert(tree->OperIs(GT_AND, GT_OR));
+
+    if (!comp->opts.OptimizationEnabled())
+    {
+        return false;
+    }
+
+    GenTree* op1 = tree->gtGetOp1();
+    GenTree* op2 = tree->gtGetOp2();
+
+    if ((op1->OperIsCmpCompare() && varTypeIsIntegralOrI(op1->gtGetOp1())) ||
+        (op2->OperIsCmpCompare() && varTypeIsIntegralOrI(op2->gtGetOp1())))
+    {
+        JITDUMP("[%06u] is a potential candidate for CCMP:\n", Compiler::dspTreeID(tree));
+        DISPTREERANGE(BlockRange(), tree);
+        JITDUMP("\n");
+    }
+
+    // Find out whether an operand is eligible to be converted to a conditional
+    // compare. It must be a normal integral relop; for example, we cannot
+    // conditionally perform a floating point comparison and there is no "ctst"
+    // instruction that would allow us to conditionally implement
+    // TEST_EQ/TEST_NE.
+    //
+    // For the other operand we can allow more arbitrary operations that set
+    // the condition flags; the final transformation into the flags def is done
+    // by TryLowerConditionToFlagsNode.
+    //
+    // On xarch, an FP compare is implemented with two flag checks joined by a
+    // branch/fallthrough, so we cannot get a single output condition to feed
+    // into a ccmp. It might be possible to chain these, but we skip those cases
+    // for now.
+    GenCondition cond1;
+    if (op2->OperIsCmpCompare() && varTypeIsIntegralOrI(op2->gtGetOp1()) && IsInvariantInRange(op2, tree) &&
+        ProducesPotentialConsumableFlagsForCCMP(op1) && TryLowerConditionToFlagsNode(tree, op1, &cond1))
+    {
+        // Fall through, converting op2 to the CCMP
+    }
+    else if (op1->OperIsCmpCompare() && varTypeIsIntegralOrI(op1->gtGetOp1()) && IsInvariantInRange(op1, tree) &&
+             ProducesPotentialConsumableFlagsForCCMP(op2) && TryLowerConditionToFlagsNode(tree, op2, &cond1))
+    {
+        std::swap(op1, op2);
+    }
+    else
+    {
+        JITDUMP("  ..could not turn [%06u] or [%06u] into a def of flags, bailing\n", Compiler::dspTreeID(op1),
+                Compiler::dspTreeID(op2));
+        return false;
+    }
+
+    BlockRange().Remove(op2);
+    BlockRange().InsertBefore(tree, op2);
+
+    GenCondition cond2 = GenCondition::FromRelop(op2);
+    op2->SetOper(GT_CCMP);
+    op2->gtType = TYP_VOID;
+    op2->gtFlags |= GTF_SET_FLAGS;
+
+    op2->gtGetOp1()->ClearContained();
+    op2->gtGetOp2()->ClearContained();
+
+    GenTreeCCMP* ccmp = op2->AsCCMP();
+
+    if (tree->OperIs(GT_AND))
+    {
+        // If the first comparison succeeds then do the second comparison.
+        ccmp->gtCondition = cond1;
+        // Otherwise set the condition flags to something that makes the second
+        // one fail.
+        ccmp->gtFlagsVal = TruthifyingFlags(GenCondition::Reverse(cond2));
+    }
+    else
+    {
+        // If the first comparison fails then do the second comparison.
+        ccmp->gtCondition = GenCondition::Reverse(cond1);
+        // Otherwise set the condition flags to something that makes the second
+        // one succeed.
+        ccmp->gtFlagsVal = TruthifyingFlags(cond2);
+    }
+
+    ContainCheckConditionalCompare(ccmp);
+
+    tree->SetOper(GT_SETCC);
+    tree->AsCC()->gtCondition = cond2;
+
+    JITDUMP("Conversion was legal. Result:\n");
+    DISPTREERANGE(BlockRange(), tree);
+    JITDUMP("\n");
+
+    *next = tree->gtNext;
+    return true;
+}
+
+//------------------------------------------------------------------------
+// TruthifyingFlags: Get a flags immediate that will make a specified condition true.
+//
+// Arguments:
+//    condition - the condition.
+//
+// Returns:
+//    A flags immediate that, if those flags were set, would cause the specified condition to be true.
+//    (NOTE: this only has to make the condition true; e.g., if the condition calls for (SF ^ OF), then
+//    setting just SF will suffice.)
+//
+// todo-anthony: Revisit this
+insCflags Lowering::TruthifyingFlags(GenCondition condition)
+{
+    switch (condition.GetCode())
+    {
+        case GenCondition::EQ:
+            return INS_FLAGS_ZF;
+        case GenCondition::NE:
+            return INS_FLAGS_NONE;
+        case GenCondition::SGE: // !(SF ^ OF)
+            return INS_FLAGS_NONE;
+        case GenCondition::SGT: // !(SF ^ OF) && !ZF
+            return INS_FLAGS_NONE;
+        case GenCondition::SLE: // (SF ^ OF) || ZF
+            return INS_FLAGS_ZF;
+        case GenCondition::SLT: // (SF ^ OF)
+            return INS_FLAGS_SF;
+        case GenCondition::UGE: // !CF
+            return INS_FLAGS_NONE;
+        case GenCondition::UGT: // !CF && !ZF
+            return INS_FLAGS_NONE;
+        case GenCondition::ULE: // CF || ZF
+            return INS_FLAGS_ZF;
+        case GenCondition::ULT: // CF
+            return INS_FLAGS_CF;
+        default:
+            NO_WAY("unexpected condition type");
+            return INS_FLAGS_NONE;
+    }
+}
+
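To make the GT_AND path concrete, an illustrative lowering of `(a == 0) && (b == 7)` (a sketch only; the ccmp spelling and the dfv notation approximate APX assembly and are not taken from the diff):

// LIR before:  AND(EQ(a, 0), EQ(b, 7))
// LIR after:   EQ(a, 0) kept as a flags def -> CCMP{cond=EQ, dfv=none}(b, 7) -> SETCC{EQ}
//
// Emitted sequence (sketch):
//   test  a, a          ; first compare defines ZF
//   ccmpe b, 7 {dfv=}   ; ZF set   -> really compare b with 7
//                       ; ZF clear -> load flags from the dfv immediate; here
//                       ;   TruthifyingFlags(Reverse(EQ)) == INS_FLAGS_NONE, so
//                       ;   ZF ends up clear and the final check fails
//   sete  result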
+//------------------------------------------------------------------------
+// ContainCheckConditionalCompare: determine whether the source of a compare within a compare chain should be contained.
+//
+// Arguments:
+//    cmp - the conditional compare node
+//
+void Lowering::ContainCheckConditionalCompare(GenTreeCCMP* cmp)
+{
+    GenTree* op2 = cmp->gtOp2;
+
+    if (op2->IsCnsIntOrI() && !op2->AsIntCon()->ImmedValNeedsReloc(comp))
+    {
+        target_ssize_t immVal = (target_ssize_t)op2->AsIntCon()->gtIconVal;
+
+        // todo-anthony: make this check work
+        // if (emitter::emitIns_valid_imm_for_ccmp(immVal))
+        //{
+        MakeSrcContained(cmp, op2);
+        //}
+    }
+}
+
+#endif // TARGET_AMD64
+
 //------------------------------------------------------------------------
 // LowerBlockStore: Lower a block store node
 //
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 56bba3469eb27b..9910797ba43547 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -4586,7 +4586,9 @@ int LinearScan::BuildGCWriteBarrier(GenTree* tree)
 //
 int LinearScan::BuildCmp(GenTree* tree)
 {
-#if defined(TARGET_XARCH)
+#if defined(TARGET_AMD64)
+    assert(tree->OperIsCompare() || tree->OperIs(GT_CMP, GT_TEST, GT_BT, GT_CCMP));
+#elif defined(TARGET_XARCH)
     assert(tree->OperIsCompare() || tree->OperIs(GT_CMP, GT_TEST, GT_BT));
 #elif defined(TARGET_ARM64)
     assert(tree->OperIsCompare() || tree->OperIs(GT_CMP, GT_TEST, GT_JCMP, GT_JTEST, GT_CCMP));
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 5a1bd13cddd09d..66ca617ead9950 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -422,6 +422,9 @@ int LinearScan::BuildNode(GenTree* tree)
         case GT_CMP:
         case GT_TEST:
         case GT_BT:
+#ifdef TARGET_AMD64
+        case GT_CCMP:
+#endif
             srcCount = BuildCmp(tree);
             break;
diff --git a/src/coreclr/jit/optimizebools.cpp b/src/coreclr/jit/optimizebools.cpp
index 2346ef7e9e278f..f4bb7a32250e6f 100644
--- a/src/coreclr/jit/optimizebools.cpp
+++ b/src/coreclr/jit/optimizebools.cpp
@@ -1937,7 +1937,7 @@ PhaseStatus Compiler::optOptimizeBools()
                 retry = true;
                 numCond++;
             }
-#ifdef TARGET_ARM64
+#if defined(TARGET_ARM64)
             else if (optBoolsDsc.optOptimizeCompareChainCondBlock())
             {
                 // The optimization will have merged b1 and b2. Retry the loop so that
@@ -1946,6 +1946,22 @@ PhaseStatus Compiler::optOptimizeBools()
                 retry = true;
                 numCond++;
             }
+#elif defined(TARGET_AMD64)
+            // todo-xarch-apx: when we have proper CPUID (hardware) support, we can switch the check below
+            // from an OR condition to an AND; for now, `JitConfig.JitEnableApxIfConv` drives whether the
+            // optimization triggers or not.
+            // else if ((compOpportunisticallyDependsOn(InstructionSet_APX) || JitConfig.JitEnableApxIfConv()) &&
+            //          optBoolsDsc.optOptimizeCompareChainCondBlock())
+            else if (JitConfig.JitEnableApxIfConv() && !optSwitchDetectLikely(b1) &&
+                     optBoolsDsc.optOptimizeCompareChainCondBlock())
+            {
+                // The optimization will have merged b1 and b2. Retry the loop so that
+                // b1 and b2->bbNext can be tested.
+                change = true;
+                retry = true;
+                numCond++;
+            }
+
 #endif
         }
         else if (b2->KindIs(BBJ_RETURN))
diff --git a/src/coreclr/jit/switchrecognition.cpp b/src/coreclr/jit/switchrecognition.cpp
index 7329194cb10cd2..919d23993a8783 100644
--- a/src/coreclr/jit/switchrecognition.cpp
+++ b/src/coreclr/jit/switchrecognition.cpp
@@ -131,6 +131,106 @@ bool IsConstantTestCondBlock(const BasicBlock* block,
     return false;
 }

+//------------------------------------------------------------------------------
+// optSwitchDetectLikely : Return true if it is likely this block chain
+//    can be converted into a switch at a later optimization pass
+//
+// Arguments:
+//    firstBlock - A block to start the search from
+//
+// Return Value:
+//    True if the chain looks convertible into a switch, false otherwise
+//
+bool Compiler::optSwitchDetectLikely(BasicBlock* firstBlock)
+{
+    assert(firstBlock->KindIs(BBJ_COND));
+
+    GenTree*    variableNode = nullptr;
+    ssize_t     cns          = 0;
+    BasicBlock* trueTarget   = nullptr;
+    BasicBlock* falseTarget  = nullptr;
+
+    // The check mirrors optSwitchDetectAndConvert below: verify that the given block
+    // is a constant test block, then peek at the next block to see whether the
+    // pattern continues.
+    bool isReversed = false;
+    if (IsConstantTestCondBlock(firstBlock, true, &trueTarget, &falseTarget, &isReversed, &variableNode, &cns))
+    {
+        if (isReversed)
+        {
+            // First block uses NE - we don't support this yet. We currently expect all blocks to use EQ
+            // and allow NE for the last one (because it's what Roslyn usually emits).
+            // TODO: make it more flexible and support cases like "x != cns1 && x != cns2 && ..."
+            return false;
+        }
+
+        // No more than SWITCH_MAX_DISTANCE blocks are allowed (arbitrary limit in this context)
+        int     testValueIndex                  = 0;
+        ssize_t testValues[SWITCH_MAX_DISTANCE] = {};
+        testValues[testValueIndex]              = cns;
+        testValueIndex++;
+
+        // Track likelihood of reaching the false block
+        //
+        weight_t          falseLikelihood = firstBlock->GetFalseEdge()->getLikelihood();
+        const BasicBlock* prevBlock       = firstBlock;
+
+        // Follow the same algorithm as optSwitchDetectAndConvert below, but only peek at the next block
+        const BasicBlock* currBb = firstBlock->Next();
+        if (currBb != nullptr)
+        {
+            GenTree*    currVariableNode = nullptr;
+            ssize_t     currCns          = 0;
+            BasicBlock* currTrueTarget   = nullptr;
+            BasicBlock* currFalseTarget  = nullptr;
+
+            if (!currBb->hasSingleStmt())
+            {
+                // Only the first conditional block can have multiple statements; stop searching.
+                return false;
+            }
+
+            // Inspect secondary blocks
+            if (IsConstantTestCondBlock(currBb, false, &currTrueTarget, &currFalseTarget, &isReversed,
+                                        &currVariableNode, &currCns))
+            {
+                if (currTrueTarget != trueTarget)
+                {
+                    // This block jumps to a different target; stop searching.
+                    return false;
+                }
+
+                if (!GenTree::Compare(currVariableNode, variableNode->gtEffectiveVal()))
+                {
+                    // A different variable node is used; stop searching.
+                    return false;
+                }
+
+                if (currBb->GetUniquePred(this) != prevBlock)
+                {
+                    return false;
+                }
+
+                if (!BasicBlock::sameEHRegion(prevBlock, currBb))
+                {
+                    // Current block is in a different EH region; stop searching.
+                    return false;
+                }
+
+                return true;
+            }
+            else
+            {
+                // Current block is not a suitable constant test; stop searching.
+                return false;
+            }
+        }
+    }
+
+    return false;
+}
+
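One source-level sketch of why the new gate is queried before the compare-chain merge (hypothetical input, not from the diff): chains that look like a switch are left for switch recognition, while mixed-condition chains stay eligible for ccmp if-conversion.

// Left for switch recognition (optSwitchDetectLikely(b1) == true), so
// optOptimizeBools does not fold the pair into a compare chain here:
//   if (x == 10) goto T; if (x == 11) goto T; if (x == 12) goto T;
// Not switch-like, so the chain may still be merged and later lowered to ccmp:
//   if (x == 10 && y > 0) { ... }

 //------------------------------------------------------------------------------
 // optSwitchDetectAndConvert : Try to detect a series of conditional blocks which
 //    can be converted into a switch (jump-table) construct. See optSwitchConvert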