Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 151 additions & 4 deletions llvm/lib/Target/X86/X86CompressEVEX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// d. NF_ND (EVEX) -> NF (EVEX)
// e. NonNF (EVEX) -> NF (EVEX)
// f. SETZUCCm (EVEX) -> SETCCm (legacy)
// g. VPMOV*2M (EVEX) + KMOV -> VMOVMSK/VPMOVMSKB (VEX)
//
// Compression a, b and c can always reduce code size, with some exceptions
// such as promoted 16-bit CRC32 which is as long as the legacy version.
Expand All @@ -41,6 +42,7 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
Expand Down Expand Up @@ -178,8 +180,143 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
return true;
}

// Return true if the KMOV reads fewer mask bits than the VPMOV*2M produces.
// In that case the KMOV truncates the mask, and replacing the pair with a
// MOVMSK (which materializes every mask bit) would not be equivalent.
static bool isKMovNarrowing(unsigned VPMOVOpc, unsigned KMOVOpc) {
  // Significant mask bits produced by the VPMOV*2M (one per vector element).
  auto GetMaskBits = [](unsigned Opc) -> unsigned {
    switch (Opc) {
    case X86::VPMOVQ2MZ128kr:
      return 2;
    case X86::VPMOVQ2MZ256kr:
    case X86::VPMOVD2MZ128kr:
      return 4;
    case X86::VPMOVD2MZ256kr:
      return 8;
    case X86::VPMOVB2MZ128kr:
      return 16;
    case X86::VPMOVB2MZ256kr:
      return 32;
    default:
      llvm_unreachable("Unknown VPMOV opcode");
    }
  };

  // Bits read from the mask register by the KMOV*rk form.
  auto GetKMovBits = [](unsigned Opc) -> unsigned {
    switch (Opc) {
    case X86::KMOVBrk:
      return 8;
    case X86::KMOVWrk:
      return 16;
    case X86::KMOVDrk:
      return 32;
    default:
      llvm_unreachable("Unknown KMOV opcode");
    }
  };

  return GetKMovBits(KMOVOpc) < GetMaskBits(VPMOVOpc);
}

// Try to compress VPMOV*2M + KMOV chain patterns:
// vpmov*2m %xmm0, %k0 -> (erase this)
// kmov* %k0, %eax -> vmovmskp* %xmm0, %eax
//
// VPMOV*2M copies element sign bits of a vector into a mask register and is
// EVEX-only. When the mask's sole consumer is a single non-narrowing
// KMOV-to-GPR later in the same block, the pair is replaced by one
// VEX-encoded VMOVMSKPS/VMOVMSKPD/VPMOVMSKB reading the vector directly.
// On success the KMOV is rewritten in place into the MOVMSK form, MI is
// queued on \p ToErase for deferred deletion, and true is returned.
static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
                                    const X86Subtarget &ST,
                                    SmallVectorImpl<MachineInstr *> &ToErase) {
  const X86InstrInfo *TII = ST.getInstrInfo();
  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo *MRI = &MBB.getParent()->getRegInfo();

  // Only the 128/256-bit D/Q/B mask-producing forms have a VEX MOVMSK
  // counterpart; anything else is left alone.
  unsigned Opc = MI.getOpcode();
  if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr &&
      Opc != X86::VPMOVQ2MZ128kr && Opc != X86::VPMOVQ2MZ256kr &&
      Opc != X86::VPMOVB2MZ128kr && Opc != X86::VPMOVB2MZ256kr)
    return false;

  Register MaskReg = MI.getOperand(0).getReg();   // mask def, e.g. $k0
  Register SrcVecReg = MI.getOperand(1).getReg(); // vector source, e.g. $xmm0

  // Map the EVEX mask producer to the equivalent VEX MOVMSK:
  // D -> VMOVMSKPS, Q -> VMOVMSKPD, B -> VPMOVMSKB (128/256-bit variants).
  unsigned MovMskOpc = 0;
  switch (Opc) {
  case X86::VPMOVD2MZ128kr:
    MovMskOpc = X86::VMOVMSKPSrr;
    break;
  case X86::VPMOVD2MZ256kr:
    MovMskOpc = X86::VMOVMSKPSYrr;
    break;
  case X86::VPMOVQ2MZ128kr:
    MovMskOpc = X86::VMOVMSKPDrr;
    break;
  case X86::VPMOVQ2MZ256kr:
    MovMskOpc = X86::VMOVMSKPDYrr;
    break;
  case X86::VPMOVB2MZ128kr:
    MovMskOpc = X86::VPMOVMSKBrr;
    break;
  case X86::VPMOVB2MZ256kr:
    MovMskOpc = X86::VPMOVMSKBYrr;
    break;
  default:
    llvm_unreachable("Unknown VPMOV opcode");
  }

  // The single KMOV consuming MaskReg, if one is found.
  MachineInstr *KMovMI = nullptr;

  // Scan forward from MI to the end of the block, locating the unique use of
  // MaskReg and verifying neither register is disturbed in a way that breaks
  // the rewrite.
  // NOTE(review): debug instructions are not skipped here; if a DBG_VALUE
  // referencing MaskReg makes readsRegister() return true, the fold would be
  // blocked only in debug builds — confirm whether isDebugInstr() should be
  // skipped to keep codegen debug-invariant.
  for (MachineInstr &CurMI : llvm::make_range(
           std::next(MachineBasicBlock::iterator(MI)), MBB.end())) {
    if (CurMI.modifiesRegister(MaskReg, TRI)) {
      if (!KMovMI)
        return false; // Mask clobbered before use
      // Mask is redefined after the KMOV, so it cannot have further uses in
      // this block; the cross-block check below still guards other blocks.
      break;
    }

    if (CurMI.readsRegister(MaskReg, TRI)) {
      if (KMovMI)
        return false; // Fail: Mask has MULTIPLE uses

      unsigned UseOpc = CurMI.getOpcode();
      bool IsKMOV = UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
                    UseOpc == X86::KMOVDrk;
      // Only allow non-narrowing KMOV uses of the mask. A KMOV narrower than
      // the mask would drop high mask bits that MOVMSK still produces.
      if (IsKMOV && CurMI.getOperand(1).getReg() == MaskReg &&
          !isKMovNarrowing(Opc, UseOpc)) {
        KMovMI = &CurMI;
        // continue scanning to ensure
        // there are no *other* uses of the mask later in the block.
      } else {
        return false;
      }
    }

    // The MOVMSK replacement executes at the KMOV's position, so the vector
    // source must stay intact up to that point (clobbers after the KMOV are
    // harmless, hence the !KMovMI guard).
    if (!KMovMI && CurMI.modifiesRegister(SrcVecReg, TRI)) {
      return false; // SrcVecReg modified before it could be used by MOVMSK
    }
  }

  if (!KMovMI)
    return false;

  // Check if MaskReg is used in any other basic blocks
  // NOTE(review): this relies on MRI use lists for a physical register at
  // this stage and checks only exact MaskReg operands, not aliasing mask
  // registers — confirm this is sufficient post-RA.
  for (const MachineOperand &MO : MRI->use_operands(MaskReg))
    if (MO.getParent()->getParent() != &MBB)
      return false;

  // Apply the transformation: retarget the KMOV in place — new opcode, vector
  // register as the source operand; its GPR destination operand is reused.
  KMovMI->setDesc(TII->get(MovMskOpc));
  KMovMI->getOperand(1).setReg(SrcVecReg);
  // Tag so the asm printer emits the "EVEX TO VEX Compression" remark.
  KMovMI->setAsmPrinterFlag(X86::AC_EVEX_2_VEX);

  // MI is now dead, but the caller is still iterating over the block, so
  // defer the erase.
  ToErase.push_back(&MI);
  return true;
}

static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
const X86Subtarget &ST) {
const X86Subtarget &ST,
SmallVectorImpl<MachineInstr *> &ToErase) {
uint64_t TSFlags = MI.getDesc().TSFlags;

// Check for EVEX instructions only.
Expand All @@ -190,6 +327,10 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
return false;

// Specialized VPMOVD2M + KMOV -> MOVMSK fold first.
if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
return true;

auto IsRedundantNewDataDest = [&](unsigned &Opc) {
// $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
// ->
Expand Down Expand Up @@ -350,9 +491,15 @@ static bool runOnMF(MachineFunction &MF) {
bool Changed = false;

for (MachineBasicBlock &MBB : MF) {
// Traverse the basic block.
for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
Changed |= CompressEVEXImpl(MI, MBB, ST);
SmallVector<MachineInstr *, 4> ToErase;

for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
Changed |= CompressEVEXImpl(MI, MBB, ST, ToErase);
}

for (MachineInstr *MI : ToErase) {
MI->eraseFromParent();
}
}
LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
return Changed;
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/avx512-ext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1745,8 +1745,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; AVX512DQNOBW: # %bb.0:
; AVX512DQNOBW-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQNOBW-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpmovd2m %ymm0, %k0
; AVX512DQNOBW-NEXT: kmovw %k0, %eax
; AVX512DQNOBW-NEXT: vmovmskps %ymm0, %eax
; AVX512DQNOBW-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQNOBW-NEXT: vzeroupper
; AVX512DQNOBW-NEXT: retq
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/avx512-insert-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1669,8 +1669,7 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) n
; SKX-NEXT: vpmovm2b %k0, %ymm0
; SKX-NEXT: vpbroadcastb %eax, %ymm0 {%k1}
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vpmovmskb %ymm0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, zeroinitializer
Expand Down
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2751,8 +2751,7 @@ declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovd2m %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vmovmskps %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x50,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
Expand All @@ -2777,8 +2776,7 @@ declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovq2m %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vmovmskpd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x50,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
Expand All @@ -2790,8 +2788,7 @@ declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovq2m %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vmovmskpd %ymm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x50,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/X86/evex-to-vex-compress.mir
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
define void @evex_z256_to_evex_test() { ret void }
define void @evex_z128_to_evex_test() { ret void }
define void @evex_scalar_to_evex_test() { ret void }
define void @evex_vpmov_cross_block_test() { ret void }
...
---
# CHECK-LABEL: name: evex_z256_to_vex_test
Expand Down Expand Up @@ -894,6 +895,25 @@ body: |
$ymm0 = VSHUFI64X2Z256rmi $ymm0, $rip, 1, $noreg, 0, $noreg, 228
; CHECK: $ymm0 = VPERM2I128rri $ymm0, $ymm1, 32
$ymm0 = VSHUFI64X2Z256rri $ymm0, $ymm1, 228
; CHECK: $eax = VMOVMSKPSYrr $ymm0
$k0 = VPMOVD2MZ256kr $ymm0
$eax = KMOVBrk $k0
; CHECK: $eax = VMOVMSKPDYrr $ymm0
$k0 = VPMOVQ2MZ256kr $ymm0
$eax = KMOVBrk $k0
; CHECK: $eax = VPMOVMSKBYrr $ymm0
$k0 = VPMOVB2MZ256kr $ymm0
$eax = KMOVDrk $k0
; CHECK: $k0 = VPMOVD2MZ256kr $ymm0
; CHECK: $eax = KMOVBrk $k0
; CHECK: $ebx = KMOVBrk $k0
$k0 = VPMOVD2MZ256kr $ymm0
$eax = KMOVBrk $k0
$ebx = KMOVBrk $k0
; CHECK: $k0 = VPMOVB2MZ256kr $ymm0
; CHECK: $eax = KMOVWrk $k0
$k0 = VPMOVB2MZ256kr $ymm0
$eax = KMOVWrk $k0

RET64
...
Expand Down Expand Up @@ -1760,6 +1780,25 @@ body: |
$xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
; CHECK: $xmm0 = VROUNDPSri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr
; CHECK: $eax = VMOVMSKPSrr $xmm0
$k0 = VPMOVD2MZ128kr $xmm0
$eax = KMOVBrk $k0
; CHECK: $eax = VMOVMSKPDrr $xmm0
$k0 = VPMOVQ2MZ128kr $xmm0
$eax = KMOVBrk $k0
; CHECK: $eax = VPMOVMSKBrr $xmm0
$k0 = VPMOVB2MZ128kr $xmm0
$eax = KMOVWrk $k0
; CHECK: $k0 = VPMOVD2MZ128kr $xmm0
; CHECK: $eax = KMOVBrk $k0
; CHECK: $ebx = KMOVBrk $k0
$k0 = VPMOVD2MZ128kr $xmm0
$eax = KMOVBrk $k0
$ebx = KMOVBrk $k0
; CHECK: $k0 = VPMOVB2MZ128kr $xmm0
; CHECK: $eax = KMOVBrk $k0
$k0 = VPMOVB2MZ128kr $xmm0
$eax = KMOVBrk $k0

RET64
...
Expand Down Expand Up @@ -4672,3 +4711,24 @@ body: |

RET64
...
---
# CHECK-LABEL: name: evex_vpmov_cross_block_test
# CHECK: bb.0:

name: evex_vpmov_cross_block_test
body: |
bb.0:
; CHECK: $k0 = VPMOVD2MZ128kr $xmm0
; CHECK: $eax = KMOVBrk $k0
; CHECK: JCC_1 %bb.1
$k0 = VPMOVD2MZ128kr $xmm0
$eax = KMOVBrk $k0
JCC_1 %bb.1, 4, implicit $eflags

bb.1:
; CHECK: bb.1:
; CHECK: $k1 = KANDBkk $k0, $k0
$k1 = KANDBkk $k0, $k0

RET64
...
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/masked_compressstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3444,8 +3444,7 @@ define void @compressstore_v8i16_v8i16(ptr %base, <8 x i16> %V, <8 x i16> %trigg
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: vmovmskps %ymm1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB11_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/masked_expandload.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3047,8 +3047,7 @@ define <8 x i16> @expandload_v8i16_v8i16(ptr %base, <8 x i16> %src0, <8 x i16> %
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: vmovmskps %ymm1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB11_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
Expand Down
Loading
Loading