[X86] Support lowering for APX promoted BMI instructions. (#77433)
R16-R31 were added to the GPRs in #70958.
This patch adds lowering support for the promoted BMI instructions in the EVEX
space; encoding/decoding support was added in #73899.

RFC:
https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031/4
XinWang10 authored Jan 18, 2024
1 parent f661709 commit 2d92f7d
Showing 11 changed files with 1,091 additions and 57 deletions.
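
Before the per-file diffs, a condensed view of the approach may help. On the C++ side (X86ISelDAGToDAG.cpp), the change follows one recurring pattern: when the subtarget has extended GPRs (EGPR), instruction selection picks the EVEX-promoted opcode variant (the "_EVEX"-suffixed one) so the encoding can reach R16-R31; otherwise it keeps the legacy opcode. The sketch below only illustrates that idea and is not code from the patch: the Subtarget stub, the tiny opcode enum, and selectBZHI are stand-ins, while the real macro operates on the X86 subtarget and the generated X86:: opcode enum shown in the diff.

// Minimal, self-contained sketch of the EGPR-aware opcode selection idea.
// The Subtarget stub and the small opcode enum are placeholders for
// illustration; the patch itself uses Subtarget->hasEGPR() and the real
// X86:: opcode enumerators.
#include <cstdio>

namespace X86 {
enum Opcode { BZHI32rr, BZHI32rr_EVEX, BZHI64rr, BZHI64rr_EVEX };
}

struct Subtarget {
  bool HasEGPR = false;
  bool hasEGPR() const { return HasEGPR; }
};

// Same shape as the helper macro added in X86ISelDAGToDAG.cpp: prefer the
// EVEX-promoted opcode when extended GPRs (R16-R31) are available.
#define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? OPC##_EVEX : OPC)

static X86::Opcode selectBZHI(const Subtarget &ST, bool Is64Bit) {
  return Is64Bit ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
}
#undef GET_EGPR_IF_ENABLED

int main() {
  Subtarget Legacy, APX;
  APX.HasEGPR = true;
  std::printf("legacy: %d, apx: %d\n", selectBZHI(Legacy, true),
              selectBZHI(APX, true));
  return 0;
}

The TableGen files apply the same split declaratively: the existing patterns are wrapped in multiclasses parameterized by an opcode suffix and instantiated twice, once under a NoEGPR predicate with an empty suffix and once under HasEGPR with "_EVEX".
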
42 changes: 26 additions & 16 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4086,14 +4086,17 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
SDValue Control;
unsigned ROpc, MOpc;

#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
if (!PreferBEXTR) {
assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
// If we can't make use of BEXTR then we can't fuse shift+mask stages.
// Let's perform the mask first, and apply shift later. Note that we need to
// widen the mask to account for the fact that we'll apply shift afterwards!
Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
: GET_EGPR_IF_ENABLED(X86::BZHI32rr);
MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
: GET_EGPR_IF_ENABLED(X86::BZHI32rm);
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
} else {
@@ -4108,8 +4111,10 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
} else {
assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
// BMI requires the immediate to be placed in a register.
ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
: GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
: GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
}
@@ -5482,25 +5487,30 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32:
Opc = UseMULXHi ? X86::MULX32Hrr :
UseMULX ? X86::MULX32rr :
IsSigned ? X86::IMUL32r : X86::MUL32r;
MOpc = UseMULXHi ? X86::MULX32Hrm :
UseMULX ? X86::MULX32rm :
IsSigned ? X86::IMUL32m : X86::MUL32m;
Opc = UseMULXHi ? X86::MULX32Hrr
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
: IsSigned ? X86::IMUL32r
: X86::MUL32r;
MOpc = UseMULXHi ? X86::MULX32Hrm
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
: IsSigned ? X86::IMUL32m
: X86::MUL32m;
LoReg = UseMULX ? X86::EDX : X86::EAX;
HiReg = X86::EDX;
break;
case MVT::i64:
Opc = UseMULXHi ? X86::MULX64Hrr :
UseMULX ? X86::MULX64rr :
IsSigned ? X86::IMUL64r : X86::MUL64r;
MOpc = UseMULXHi ? X86::MULX64Hrm :
UseMULX ? X86::MULX64rm :
IsSigned ? X86::IMUL64m : X86::MUL64m;
Opc = UseMULXHi ? X86::MULX64Hrr
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
: IsSigned ? X86::IMUL64r
: X86::MUL64r;
MOpc = UseMULXHi ? X86::MULX64Hrm
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
: IsSigned ? X86::IMUL64m
: X86::MUL64m;
LoReg = UseMULX ? X86::RDX : X86::RAX;
HiReg = X86::RDX;
break;
#undef GET_EGPR_IF_ENABLED
}

SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
16 changes: 11 additions & 5 deletions llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1338,17 +1338,23 @@ defm ANDN32 : AndN<Xi32, "_EVEX">, EVEX, Requires<[HasBMI, HasEGPR, In64BitMode]
defm ANDN64 : AndN<Xi64, "_EVEX">, EVEX, REX_W, Requires<[HasBMI, HasEGPR, In64BitMode]>;
}

let Predicates = [HasBMI], AddedComplexity = -6 in {
multiclass Andn_Pats<string suffix> {
def : Pat<(and (not GR32:$src1), GR32:$src2),
(ANDN32rr GR32:$src1, GR32:$src2)>;
(!cast<Instruction>(ANDN32rr#suffix) GR32:$src1, GR32:$src2)>;
def : Pat<(and (not GR64:$src1), GR64:$src2),
(ANDN64rr GR64:$src1, GR64:$src2)>;
(!cast<Instruction>(ANDN64rr#suffix) GR64:$src1, GR64:$src2)>;
def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
(ANDN32rm GR32:$src1, addr:$src2)>;
(!cast<Instruction>(ANDN32rm#suffix) GR32:$src1, addr:$src2)>;
def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
(ANDN64rm GR64:$src1, addr:$src2)>;
(!cast<Instruction>(ANDN64rm#suffix) GR64:$src1, addr:$src2)>;
}

let Predicates = [HasBMI, NoEGPR], AddedComplexity = -6 in
defm : Andn_Pats<"">;

let Predicates = [HasBMI, HasEGPR], AddedComplexity = -6 in
defm : Andn_Pats<"_EVEX">;

//===----------------------------------------------------------------------===//
// MULX Instruction
//
58 changes: 43 additions & 15 deletions llvm/lib/Target/X86/X86InstrMisc.td
@@ -1241,43 +1241,49 @@ let Predicates = [HasBMI, In64BitMode], Defs = [EFLAGS] in {
defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64, "_EVEX">, EVEX;
}

let Predicates = [HasBMI] in {
multiclass Bls_Pats<string suffix> {
// FIXME(1): patterns for the load versions are not implemented
// FIXME(2): By only matching `add_su` and `ineg_su` we may emit
// extra `mov` instructions if `src` has future uses. It may be better
// to always match if `src` has more users.
def : Pat<(and GR32:$src, (add_su GR32:$src, -1)),
(BLSR32rr GR32:$src)>;
(!cast<Instruction>(BLSR32rr#suffix) GR32:$src)>;
def : Pat<(and GR64:$src, (add_su GR64:$src, -1)),
(BLSR64rr GR64:$src)>;
(!cast<Instruction>(BLSR64rr#suffix) GR64:$src)>;

def : Pat<(xor GR32:$src, (add_su GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
(!cast<Instruction>(BLSMSK32rr#suffix) GR32:$src)>;
def : Pat<(xor GR64:$src, (add_su GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
(!cast<Instruction>(BLSMSK64rr#suffix) GR64:$src)>;

def : Pat<(and GR32:$src, (ineg_su GR32:$src)),
(BLSI32rr GR32:$src)>;
(!cast<Instruction>(BLSI32rr#suffix) GR32:$src)>;
def : Pat<(and GR64:$src, (ineg_su GR64:$src)),
(BLSI64rr GR64:$src)>;
(!cast<Instruction>(BLSI64rr#suffix) GR64:$src)>;

// Versions to match flag producing ops.
def : Pat<(and_flag_nocf GR32:$src, (add_su GR32:$src, -1)),
(BLSR32rr GR32:$src)>;
(!cast<Instruction>(BLSR32rr#suffix) GR32:$src)>;
def : Pat<(and_flag_nocf GR64:$src, (add_su GR64:$src, -1)),
(BLSR64rr GR64:$src)>;
(!cast<Instruction>(BLSR64rr#suffix) GR64:$src)>;

def : Pat<(xor_flag_nocf GR32:$src, (add_su GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
(!cast<Instruction>(BLSMSK32rr#suffix) GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add_su GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
(!cast<Instruction>(BLSMSK64rr#suffix) GR64:$src)>;

def : Pat<(and_flag_nocf GR32:$src, (ineg_su GR32:$src)),
(BLSI32rr GR32:$src)>;
(!cast<Instruction>(BLSI32rr#suffix) GR32:$src)>;
def : Pat<(and_flag_nocf GR64:$src, (ineg_su GR64:$src)),
(BLSI64rr GR64:$src)>;
(!cast<Instruction>(BLSI64rr#suffix) GR64:$src)>;
}

let Predicates = [HasBMI, NoEGPR] in
defm : Bls_Pats<"">;

let Predicates = [HasBMI, HasEGPR] in
defm : Bls_Pats<"_EVEX">;

multiclass Bmi4VOp3<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
X86FoldableSchedWrite sched, string Suffix = ""> {
let SchedRW = [sched], Form = MRMSrcReg4VOp3 in
@@ -1324,7 +1330,7 @@ def AndMask64 : ImmLeaf<i64, [{
}]>;

// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
let Predicates = [HasBMI, NoBMI2, NoTBM] in {
let Predicates = [HasBMI, NoBMI2, NoTBM, NoEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BEXTR64rr GR64:$src,
(SUBREG_TO_REG (i64 0),
@@ -1335,8 +1341,19 @@ let Predicates = [HasBMI, NoBMI2, NoTBM] in {
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
}

let Predicates = [HasBMI, NoBMI2, NoTBM, HasEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BEXTR64rr_EVEX GR64:$src,
(SUBREG_TO_REG (i64 0),
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BEXTR64rm_EVEX addr:$src,
(SUBREG_TO_REG (i64 0),
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
}

// Use BZHI for 64-bit 'and' with large immediate 'mask'.
let Predicates = [HasBMI2, NoTBM] in {
let Predicates = [HasBMI2, NoTBM, NoEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BZHI64rr GR64:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
@@ -1347,6 +1364,17 @@ let Predicates = [HasBMI2, NoTBM] in {
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
}

let Predicates = [HasBMI2, NoTBM, HasEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BZHI64rr_EVEX GR64:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BZHI64rm_EVEX addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
}

multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
X86MemOperand x86memop, SDPatternOperator OpNode,
PatFrag ld_frag, string Suffix = ""> {
47 changes: 27 additions & 20 deletions llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -284,32 +284,32 @@ defm SHRX64: ShiftX<"shrx", Xi64>, XD;
defm SHLX32: ShiftX<"shlx", Xi32>, PD;
defm SHLX64: ShiftX<"shlx", Xi64>, PD;

multiclass RORX_Pats {
multiclass RORX_Pats<string suffix> {
// Prefer RORX which is non-destructive and doesn't update EFLAGS.
let AddedComplexity = 10 in {
def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, imm:$shamt)>;
(!cast<Instruction>(RORX32ri#suffix) GR32:$src, imm:$shamt)>;
def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, imm:$shamt)>;
(!cast<Instruction>(RORX64ri#suffix) GR64:$src, imm:$shamt)>;

def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX32ri#suffix) GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX64ri#suffix) GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
}

def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, imm:$shamt)>;
(!cast<Instruction>(RORX32mi#suffix) addr:$src, imm:$shamt)>;
def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, imm:$shamt)>;
(!cast<Instruction>(RORX64mi#suffix) addr:$src, imm:$shamt)>;

def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX32mi#suffix) addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX64mi#suffix) addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
}

multiclass ShiftX_Pats<SDNode op> {
multiclass ShiftX_Pats<SDNode op, string suffix = ""> {
// Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
// immediate shift, i.e. the following code is considered better
//
@@ -325,16 +325,16 @@ multiclass ShiftX_Pats<SDNode op> {
//
let AddedComplexity = 1 in {
def : Pat<(op GR32:$src1, GR8:$src2),
(!cast<Instruction>(NAME#"32rr") GR32:$src1,
(!cast<Instruction>(NAME#"32rr"#suffix) GR32:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op GR64:$src1, GR8:$src2),
(!cast<Instruction>(NAME#"64rr") GR64:$src1,
(!cast<Instruction>(NAME#"64rr"#suffix) GR64:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op GR32:$src1, (shiftMask32 GR8:$src2)),
(!cast<Instruction>(NAME#"32rr") GR32:$src1,
(!cast<Instruction>(NAME#"32rr"#suffix) GR32:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op GR64:$src1, (shiftMask64 GR8:$src2)),
(!cast<Instruction>(NAME#"64rr") GR64:$src1,
(!cast<Instruction>(NAME#"64rr"#suffix) GR64:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
// We prefer to use
@@ -348,22 +348,29 @@ multiclass ShiftX_Pats<SDNode op> {
//
// This priority is enforced by IsProfitableToFoldLoad.
def : Pat<(op (loadi32 addr:$src1), GR8:$src2),
(!cast<Instruction>(NAME#"32rm") addr:$src1,
(!cast<Instruction>(NAME#"32rm"#suffix) addr:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op (loadi64 addr:$src1), GR8:$src2),
(!cast<Instruction>(NAME#"64rm") addr:$src1,
(!cast<Instruction>(NAME#"64rm"#suffix) addr:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(!cast<Instruction>(NAME#"32rm") addr:$src1,
(!cast<Instruction>(NAME#"32rm"#suffix) addr:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(!cast<Instruction>(NAME#"64rm") addr:$src1,
(!cast<Instruction>(NAME#"64rm"#suffix) addr:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}

let Predicates = [HasBMI2] in {
defm : RORX_Pats;
let Predicates = [HasBMI2, NoEGPR] in {
defm : RORX_Pats<"">;
defm SARX : ShiftX_Pats<sra>;
defm SHRX : ShiftX_Pats<srl>;
defm SHLX : ShiftX_Pats<shl>;
}

let Predicates = [HasBMI2, HasEGPR] in {
defm : RORX_Pats<"_EVEX">;
defm SARX : ShiftX_Pats<sra, "_EVEX">;
defm SHRX : ShiftX_Pats<srl, "_EVEX">;
defm SHLX : ShiftX_Pats<shl, "_EVEX">;
}