diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 59d653b84eb8a..4e39eb3e9707a 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -16,6 +16,7 @@
 // d. NF_ND (EVEX) -> NF (EVEX)
 // e. NonNF (EVEX) -> NF (EVEX)
 // f. SETZUCCm (EVEX) -> SETCCm (legacy)
+// g. VPMOV*2M (EVEX) + KMOV -> VMOVMSK/VPMOVMSKB (VEX)
 //
 // Compression a, b and c can always reduce code size, with some exceptions
 // such as promoted 16-bit CRC32 which is as long as the legacy version.
@@ -41,6 +42,7 @@
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
@@ -178,8 +180,143 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
   return true;
 }
 
+static bool isKMovNarrowing(unsigned VPMOVOpc, unsigned KMOVOpc) {
+  unsigned VPMOVBits = 0;
+  switch (VPMOVOpc) {
+  case X86::VPMOVQ2MZ128kr:
+    VPMOVBits = 2;
+    break;
+  case X86::VPMOVQ2MZ256kr:
+  case X86::VPMOVD2MZ128kr:
+    VPMOVBits = 4;
+    break;
+  case X86::VPMOVD2MZ256kr:
+    VPMOVBits = 8;
+    break;
+  case X86::VPMOVB2MZ128kr:
+    VPMOVBits = 16;
+    break;
+  case X86::VPMOVB2MZ256kr:
+    VPMOVBits = 32;
+    break;
+  default:
+    llvm_unreachable("Unknown VPMOV opcode");
+  }
+
+  unsigned KMOVSize = 0;
+  switch (KMOVOpc) {
+  case X86::KMOVBrk:
+    KMOVSize = 8;
+    break;
+  case X86::KMOVWrk:
+    KMOVSize = 16;
+    break;
+  case X86::KMOVDrk:
+    KMOVSize = 32;
+    break;
+  default:
+    llvm_unreachable("Unknown KMOV opcode");
+  }
+
+  return KMOVSize < VPMOVBits;
+}
+
+// Try to compress a VPMOV*2M + KMOV chain pattern:
+//   vpmov*2m %xmm0, %k0  -> (erased)
+//   kmov* %k0, %eax      -> vmovmskp* %xmm0, %eax
+static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
+                                    const X86Subtarget &ST,
+                                    SmallVectorImpl<MachineInstr *> &ToErase) {
+  const X86InstrInfo *TII = ST.getInstrInfo();
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineRegisterInfo *MRI = &MBB.getParent()->getRegInfo();
+
+  unsigned Opc = MI.getOpcode();
+  if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr &&
+      Opc != X86::VPMOVQ2MZ128kr && Opc != X86::VPMOVQ2MZ256kr &&
+      Opc != X86::VPMOVB2MZ128kr && Opc != X86::VPMOVB2MZ256kr)
+    return false;
+
+  Register MaskReg = MI.getOperand(0).getReg();
+  Register SrcVecReg = MI.getOperand(1).getReg();
+
+  unsigned MovMskOpc = 0;
+  switch (Opc) {
+  case X86::VPMOVD2MZ128kr:
+    MovMskOpc = X86::VMOVMSKPSrr;
+    break;
+  case X86::VPMOVD2MZ256kr:
+    MovMskOpc = X86::VMOVMSKPSYrr;
+    break;
+  case X86::VPMOVQ2MZ128kr:
+    MovMskOpc = X86::VMOVMSKPDrr;
+    break;
+  case X86::VPMOVQ2MZ256kr:
+    MovMskOpc = X86::VMOVMSKPDYrr;
+    break;
+  case X86::VPMOVB2MZ128kr:
+    MovMskOpc = X86::VPMOVMSKBrr;
+    break;
+  case X86::VPMOVB2MZ256kr:
+    MovMskOpc = X86::VPMOVMSKBYrr;
+    break;
+  default:
+    llvm_unreachable("Unknown VPMOV opcode");
+  }
+
+  MachineInstr *KMovMI = nullptr;
+
+  for (MachineInstr &CurMI : llvm::make_range(
+           std::next(MachineBasicBlock::iterator(MI)), MBB.end())) {
+    if (CurMI.modifiesRegister(MaskReg, TRI)) {
+      if (!KMovMI)
+        return false; // Mask clobbered before use.
+      break;
+    }
+
+    if (CurMI.readsRegister(MaskReg, TRI)) {
+      if (KMovMI)
+        return false; // Mask has multiple uses.
+
+      unsigned UseOpc = CurMI.getOpcode();
+      bool IsKMOV = UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
+                    UseOpc == X86::KMOVDrk;
+      // Only allow non-narrowing KMOV uses of the mask.
+      if (IsKMOV && CurMI.getOperand(1).getReg() == MaskReg &&
+          !isKMovNarrowing(Opc, UseOpc)) {
+        KMovMI = &CurMI;
+        // Keep scanning to ensure there are no other uses of the mask
+        // later in the block.
+      } else {
+        return false;
+      }
+    }
+
+    if (!KMovMI && CurMI.modifiesRegister(SrcVecReg, TRI)) {
+      return false; // SrcVecReg modified before it could be used by MOVMSK.
+    }
+  }
+
+  if (!KMovMI)
+    return false;
+
+  // Bail out if MaskReg is used in any other basic block.
+  for (const MachineOperand &MO : MRI->use_operands(MaskReg))
+    if (MO.getParent()->getParent() != &MBB)
+      return false;
+
+  // Apply the transformation: rewrite the KMOV as a MOVMSK of the vector.
+  KMovMI->setDesc(TII->get(MovMskOpc));
+  KMovMI->getOperand(1).setReg(SrcVecReg);
+  KMovMI->setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+
+  ToErase.push_back(&MI);
+  return true;
+}
+
 static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
-                             const X86Subtarget &ST) {
+                             const X86Subtarget &ST,
+                             SmallVectorImpl<MachineInstr *> &ToErase) {
   uint64_t TSFlags = MI.getDesc().TSFlags;
 
   // Check for EVEX instructions only.
@@ -190,6 +327,10 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
   if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
     return false;
 
+  // Try the specialized VPMOV*2M + KMOV -> MOVMSK fold first.
+  if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
+    return true;
+
   auto IsRedundantNewDataDest = [&](unsigned &Opc) {
     // $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
     // ->
@@ -350,9 +491,15 @@ static bool runOnMF(MachineFunction &MF) {
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    // Traverse the basic block.
-    for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
-      Changed |= CompressEVEXImpl(MI, MBB, ST);
+    SmallVector<MachineInstr *, 4> ToErase;
+
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+      Changed |= CompressEVEXImpl(MI, MBB, ST, ToErase);
+    }
+
+    for (MachineInstr *MI : ToErase) {
+      MI->eraseFromParent();
+    }
   }
   LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
   return Changed;
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a712ffac5b7e..2617e2d12adfd 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1745,8 +1745,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 ; AVX512DQNOBW:       # %bb.0:
 ; AVX512DQNOBW-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; AVX512DQNOBW-NEXT:    vpslld $31, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512DQNOBW-NEXT:    kmovw %k0, %eax
+; AVX512DQNOBW-NEXT:    vmovmskps %ymm0, %eax
 ; AVX512DQNOBW-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQNOBW-NEXT:    vzeroupper
 ; AVX512DQNOBW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index e183da1386d5b..f8b0c3465f3db 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1669,8 +1669,7 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) n
 ; SKX-NEXT:    vpmovm2b %k0, %ymm0
 ; SKX-NEXT:    vpbroadcastb %eax, %ymm0 {%k1}
 ; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
-; SKX-NEXT:    vpmovb2m %ymm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    vpmovmskb %ymm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <32 x i8> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index f31dafcd68626..6f3be88d7cd0c 100644
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -2751,8 +2751,7 @@ declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
 define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovd2m %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vmovmskps %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x50,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
@@ -2777,8 +2776,7 @@ declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
 define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovq2m %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vmovmskpd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x50,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
@@ -2790,8 +2788,7 @@ declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
 define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovq2m %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vmovmskpd %ymm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x50,0xc0]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
index 982af38204728..142f07ae6e2b2 100644
--- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -10,6 +10,7 @@
   define void @evex_z256_to_evex_test() { ret void }
   define void @evex_z128_to_evex_test() { ret void }
   define void @evex_scalar_to_evex_test() { ret void }
+  define void @evex_vpmov_cross_block_test() { ret void }
 ...
 ---
 # CHECK-LABEL: name: evex_z256_to_vex_test
@@ -894,6 +895,25 @@ body: |
   $ymm0 = VSHUFI64X2Z256rmi $ymm0, $rip, 1, $noreg, 0, $noreg, 228
   ; CHECK: $ymm0 = VPERM2I128rri $ymm0, $ymm1, 32
   $ymm0 = VSHUFI64X2Z256rri $ymm0, $ymm1, 228
+  ; CHECK: $eax = VMOVMSKPSYrr $ymm0
+  $k0 = VPMOVD2MZ256kr $ymm0
+  $eax = KMOVBrk $k0
+  ; CHECK: $eax = VMOVMSKPDYrr $ymm0
+  $k0 = VPMOVQ2MZ256kr $ymm0
+  $eax = KMOVBrk $k0
+  ; CHECK: $eax = VPMOVMSKBYrr $ymm0
+  $k0 = VPMOVB2MZ256kr $ymm0
+  $eax = KMOVDrk $k0
+  ; CHECK: $k0 = VPMOVD2MZ256kr $ymm0
+  ; CHECK: $eax = KMOVBrk $k0
+  ; CHECK: $ebx = KMOVBrk $k0
+  $k0 = VPMOVD2MZ256kr $ymm0
+  $eax = KMOVBrk $k0
+  $ebx = KMOVBrk $k0
+  ; CHECK: $k0 = VPMOVB2MZ256kr $ymm0
+  ; CHECK: $eax = KMOVWrk $k0
+  $k0 = VPMOVB2MZ256kr $ymm0
+  $eax = KMOVWrk $k0
   RET64
 
 ...
@@ -1760,6 +1780,25 @@ body: |
   $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
   ; CHECK: $xmm0 = VROUNDPSri $xmm0, 15, implicit $mxcsr
   $xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr
+  ; CHECK: $eax = VMOVMSKPSrr $xmm0
+  $k0 = VPMOVD2MZ128kr $xmm0
+  $eax = KMOVBrk $k0
+  ; CHECK: $eax = VMOVMSKPDrr $xmm0
+  $k0 = VPMOVQ2MZ128kr $xmm0
+  $eax = KMOVBrk $k0
+  ; CHECK: $eax = VPMOVMSKBrr $xmm0
+  $k0 = VPMOVB2MZ128kr $xmm0
+  $eax = KMOVWrk $k0
+  ; CHECK: $k0 = VPMOVD2MZ128kr $xmm0
+  ; CHECK: $eax = KMOVBrk $k0
+  ; CHECK: $ebx = KMOVBrk $k0
+  $k0 = VPMOVD2MZ128kr $xmm0
+  $eax = KMOVBrk $k0
+  $ebx = KMOVBrk $k0
+  ; CHECK: $k0 = VPMOVB2MZ128kr $xmm0
+  ; CHECK: $eax = KMOVBrk $k0
+  $k0 = VPMOVB2MZ128kr $xmm0
+  $eax = KMOVBrk $k0
   RET64
 
 ...
@@ -4672,3 +4711,24 @@ body: |
   RET64
 
 ...
+---
+ # CHECK-LABEL: name: evex_vpmov_cross_block_test
+ # CHECK: bb.0:
+
+name: evex_vpmov_cross_block_test
+body: |
+  bb.0:
+    ; CHECK: $k0 = VPMOVD2MZ128kr $xmm0
+    ; CHECK: $eax = KMOVBrk $k0
+    ; CHECK: JCC_1 %bb.1
+    $k0 = VPMOVD2MZ128kr $xmm0
+    $eax = KMOVBrk $k0
+    JCC_1 %bb.1, 4, implicit $eflags
+
+  bb.1:
+    ; CHECK: bb.1:
+    ; CHECK: $k1 = KANDBkk $k0, $k0
+    $k1 = KANDBkk $k0, $k0
+
+    RET64
+...
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 3187bf6448690..5296c9d0f0777 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -3444,8 +3444,7 @@ define void @compressstore_v8i16_v8i16(ptr %base, <8 x i16> %V, <8 x i16> %trigg
 ; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX512VLDQ-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k0
-; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    vmovmskps %ymm1, %eax
 ; AVX512VLDQ-NEXT:    testb $1, %al
 ; AVX512VLDQ-NEXT:    jne LBB11_1
 ; AVX512VLDQ-NEXT:    ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index e81a983c07018..ce8a34db498df 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -3047,8 +3047,7 @@ define <8 x i16> @expandload_v8i16_v8i16(ptr %base, <8 x i16> %src0, <8 x i16> %
 ; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX512VLDQ-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k0
-; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    vmovmskps %ymm1, %eax
 ; AVX512VLDQ-NEXT:    testb $1, %al
 ; AVX512VLDQ-NEXT:    jne LBB11_1
 ; AVX512VLDQ-NEXT:    ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 58adbb767ed87..cf49ac1e4886b 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -966,10 +966,9 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
 ; X86-SKX-LABEL: test17:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT:    vpmovq2m %xmm1, %k0
 ; X86-SKX-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB16_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -1255,8 +1254,7 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
 ; X64-SKX-LABEL: test20:
 ; X64-SKX:       # %bb.0:
 ; X64-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; X64-SKX-NEXT:    vpmovq2m %xmm2, %k0
-; X64-SKX-NEXT:    kmovw %k0, %eax
+; X64-SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; X64-SKX-NEXT:    testb $1, %al
 ; X64-SKX-NEXT:    jne .LBB19_1
 ; X64-SKX-NEXT:    # %bb.2: # %else
@@ -1277,8 +1275,7 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
 ; X86-SKX-LABEL: test20:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; X86-SKX-NEXT:    vpmovq2m %xmm2, %k0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB19_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -1352,8 +1349,7 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
 ; X64-SKX-LABEL: test21:
 ; X64-SKX:       # %bb.0:
 ; X64-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; X64-SKX-NEXT:    vpmovq2m %xmm2, %k0
-; X64-SKX-NEXT:    kmovw %k0, %eax
+; X64-SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; X64-SKX-NEXT:    testb $1, %al
 ; X64-SKX-NEXT:    jne .LBB20_1
 ; X64-SKX-NEXT:    # %bb.2: # %else
@@ -1374,8 +1370,7 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
 ; X86-SKX-LABEL: test21:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; X86-SKX-NEXT:    vpmovq2m %xmm2, %k0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB20_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -1494,10 +1489,9 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
 ; X86-SKX-LABEL: test22:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT:    vpmovq2m %xmm1, %k0
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB21_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -1617,11 +1611,10 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
 ; X86-SKX-LABEL: test22a:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT:    vpmovq2m %xmm1, %k0
 ; X86-SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB22_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -1741,10 +1734,9 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
 ; X86-SKX-LABEL: test23:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT:    vpmovq2m %xmm1, %k0
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB23_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -1860,11 +1852,10 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
 ; X86-SKX-LABEL: test23b:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT:    vpmovq2m %xmm1, %k0
 ; X86-SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB24_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -2034,10 +2025,9 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
 ; X86-SKX-LABEL: test25:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT:    vpmovq2m %xmm1, %k0
 ; X86-SKX-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB26_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
@@ -3762,10 +3752,9 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
 ; X86-SKX-LABEL: test_scatter_2i32_index:
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; X86-SKX-NEXT:    vpmovq2m %xmm2, %k0
 ; X86-SKX-NEXT:    vpslld $3, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB52_1
 ; X86-SKX-NEXT:    # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index aad1b44344850..5b5280601ea71 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -164,8 +164,7 @@ define <2 x i32> @test_gather_v2i32_data(<2 x ptr> %ptr, <2 x i1> %mask, <2 x i3
 ; WIDEN_SKX-LABEL: test_gather_v2i32_data:
 ; WIDEN_SKX:       # %bb.0:
 ; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
-; WIDEN_SKX-NEXT:    kmovw %k0, %eax
+; WIDEN_SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; WIDEN_SKX-NEXT:    testb $1, %al
 ; WIDEN_SKX-NEXT:    jne .LBB2_1
 ; WIDEN_SKX-NEXT:    # %bb.2: # %else
@@ -226,8 +225,7 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask
 ; WIDEN_SKX-LABEL: test_scatter_v2i32_data:
 ; WIDEN_SKX:       # %bb.0:
 ; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
-; WIDEN_SKX-NEXT:    kmovw %k0, %eax
+; WIDEN_SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; WIDEN_SKX-NEXT:    testb $1, %al
 ; WIDEN_SKX-NEXT:    jne .LBB3_1
 ; WIDEN_SKX-NEXT:    # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 8c4bab99a5b7b..fa8f34cea4638 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -3008,8 +3008,7 @@ define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %dst
 ; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLDQ-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
 ; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    vmovmskps %ymm0, %eax
 ; AVX512VLDQ-NEXT:    testb $1, %al
 ; AVX512VLDQ-NEXT:    jne LBB21_1
 ; AVX512VLDQ-NEXT:    ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7320275091c6..fbecfcb45f8e7 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -1829,8 +1829,7 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) no
 ; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    vmovmskps %ymm0, %eax
 ; AVX512VLDQ-NEXT:    testb $1, %al
 ; AVX512VLDQ-NEXT:    jne LBB13_1
 ; AVX512VLDQ-NEXT:    ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/pr174871.ll b/llvm/test/CodeGen/X86/pr174871.ll
new file mode 100644
index 0000000000000..9d671a9a1b8d2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr174871.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
+
+define <16 x i32> @pr174871(<16 x i32> %a, <16 x i1> %__mask) local_unnamed_addr {
+; CHECK-LABEL: pr174871:
+; CHECK:       # %bb.0: # %allocas
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovb2m %xmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl $65534, %eax # imm = 0xFFFE
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %for_loop.lr.ph
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm4
+; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm5
+; CHECK-NEXT:    vpsubd %zmm2, %zmm0, %zmm6
+; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm7
+; CHECK-NEXT:    vpaddd %zmm0, %zmm0, %zmm8
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; CHECK-NEXT:    movw $-2, %cx
+; CHECK-NEXT:    kmovd %ecx, %k1
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm11 = [4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284,4294967284]
+; CHECK-NEXT:    vpxor %xmm12, %xmm12, %xmm12
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_4: # %switch_done
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    vpsubd %zmm2, %zmm12, %zmm12
+; CHECK-NEXT:    vpcmpltud %zmm10, %zmm12, %k1 {%k1}
+; CHECK-NEXT:    kandw %k1, %k0, %k2
+; CHECK-NEXT:    kmovd %k2, %eax
+; CHECK-NEXT:    ktestw %k1, %k0
+; CHECK-NEXT:    je .LBB0_5
+; CHECK-NEXT:  .LBB0_3: # %for_loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vpcmpltud %zmm9, %zmm3, %k2 {%k1}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm1, %zmm1 {%k2}
+; CHECK-NEXT:    kandw %k2, %k0, %k3
+; CHECK-NEXT:    kmovd %k3, %ecx
+; CHECK-NEXT:    cmpw %cx, %ax
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  # %bb.6: # %not_all_continued_or_breaked
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    vpcmpltud %zmm11, %zmm4, %k3 {%k1}
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm1 {%k3}
+; CHECK-NEXT:    korw %k3, %k2, %k2
+; CHECK-NEXT:    kandw %k2, %k0, %k3
+; CHECK-NEXT:    kmovd %k3, %ecx
+; CHECK-NEXT:    cmpw %cx, %ax
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  # %bb.7: # %not_all_continued_or_breaked95
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    vpcmpltud %zmm9, %zmm5, %k3 {%k1}
+; CHECK-NEXT:    vpaddd %zmm6, %zmm1, %zmm1 {%k3}
+; CHECK-NEXT:    korw %k2, %k3, %k2
+; CHECK-NEXT:    kandw %k2, %k0, %k2
+; CHECK-NEXT:    kmovd %k2, %ecx
+; CHECK-NEXT:    cmpw %cx, %ax
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  # %bb.8: # %not_all_continued_or_breaked135
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    vpcmpltud %zmm9, %zmm7, %k2 {%k1}
+; CHECK-NEXT:    vpaddd %zmm8, %zmm1, %zmm1 {%k2}
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_5: # %for_exit
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+allocas:
+  %"internal_mask&function_mask7208" = and <16 x i1> %__mask, <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+  %mask_i16.i.i209 = bitcast <16 x i1> %"internal_mask&function_mask7208" to i16
+  %res.i.not210 = icmp eq i16 %mask_i16.i.i209, 0
+  br i1 %res.i.not210, label %for_exit, label %for_loop.lr.ph
+
+for_loop.lr.ph:
+  %0 = add <16 x i32> %a, splat (i32 -1)
+  %"entry_mask&case_match32185" = icmp ult <16 x i32> %0, splat (i32 4)
+  %1 = add <16 x i32> %a, splat (i32 -13)
+  %2 = icmp ult <16 x i32> %1, splat (i32 -12)
+  %3 = add <16 x i32> %a, splat (i32 -5)
+  %"entry_mask&case_match115188" = icmp ult <16 x i32> %3, splat (i32 4)
+  %add_a_load120_ = add nsw <16 x i32> %a, splat (i32 1)
+  %4 = add <16 x i32> %a, splat (i32 -9)
+  %"entry_mask&case_match155191" = icmp ult <16 x i32> %4, splat (i32 4)
+  %mul_a_load160_ = shl nsw <16 x i32> %a, splat (i32 1)
+  br label %for_loop
+
+for_loop:
+  %mask_i16.i.i214 = phi i16 [ %mask_i16.i.i209, %for_loop.lr.ph ], [ %mask_i16.i.i, %switch_done ]
+  %"oldMask&test213" = phi <16 x i1> [ <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %for_loop.lr.ph ], [ %"oldMask&test", %switch_done ]
+  %i.0212 = phi <16 x i32> [ zeroinitializer, %for_loop.lr.ph ], [ %i_load170_plus1, %switch_done ]
+  %res.0211 = phi <16 x i32> [ zeroinitializer, %for_loop.lr.ph ], [ %res.1, %switch_done ]
+  %"mask|case_match34" = and <16 x i1> %"entry_mask&case_match32185", %"oldMask&test213"
+  %add_res_load_sub_a_load37_ = select <16 x i1> %"mask|case_match34", <16 x i32> %0, <16 x i32> zeroinitializer
+  %5 = add nsw <16 x i32> %add_res_load_sub_a_load37_, %res.0211
+  %"finished&func" = and <16 x i1> %__mask, %"mask|case_match34"
+  %mask_i16.i.i192 = bitcast <16 x i1> %"finished&func" to i16
+  %"equal_finished&func_internal_mask&function_mask13" = icmp eq i16 %mask_i16.i.i214, %mask_i16.i.i192
+  br i1 %"equal_finished&func_internal_mask&function_mask13", label %switch_done, label %not_all_continued_or_breaked
+
+switch_done:
+  %res.1 = phi <16 x i32> [ %5, %for_loop ], [ %6, %not_all_continued_or_breaked ], [ %7, %not_all_continued_or_breaked95 ], [ %8, %not_all_continued_or_breaked135 ]
+  %i_load170_plus1 = add nuw nsw <16 x i32> %i.0212, splat (i32 1)
+  %less_i_load_ = icmp samesign ult <16 x i32> %i_load170_plus1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %"oldMask&test" = and <16 x i1> %"oldMask&test213", %less_i_load_
+  %"internal_mask&function_mask7" = and <16 x i1> %__mask, %"oldMask&test"
+  %mask_i16.i.i = bitcast <16 x i1> %"internal_mask&function_mask7" to i16
+  %res.i.not = icmp eq i16 %mask_i16.i.i, 0
+  br i1 %res.i.not, label %for_exit, label %for_loop
+
+for_exit:
+  %res.0.lcssa = phi <16 x i32> [ zeroinitializer, %allocas ], [ %res.1, %switch_done ]
+  ret <16 x i32> %res.0.lcssa
+
+not_all_continued_or_breaked:
+  %"default&~case_match76" = and <16 x i1> %2, %"oldMask&test213"
+  %add_res_load82_a_load80 = select <16 x i1> %"default&~case_match76", <16 x i32> %a, <16 x i32> zeroinitializer
+  %6 = add nsw <16 x i32> %5, %add_res_load82_a_load80
+  %"mask|break_mask86" = or <16 x i1> %"mask|case_match34", %"default&~case_match76"
+  %"finished&func92" = and <16 x i1> %__mask, %"mask|break_mask86"
+  %mask_i16.i.i196 = bitcast <16 x i1> %"finished&func92" to i16
+  %"equal_finished&func92_internal_mask&function_mask13" = icmp eq i16 %mask_i16.i.i214, %mask_i16.i.i196
+  br i1 %"equal_finished&func92_internal_mask&function_mask13", label %switch_done, label %not_all_continued_or_breaked95
+
+not_all_continued_or_breaked95:
+  %"mask|case_match117" = and <16 x i1> %"entry_mask&case_match115188", %"oldMask&test213"
+  %add_res_load122_add_a_load120_ = select <16 x i1> %"mask|case_match117", <16 x i32> %add_a_load120_, <16 x i32> zeroinitializer
+  %7 = add nsw <16 x i32> %6, %add_res_load122_add_a_load120_
+  %"mask|break_mask126" = or <16 x i1> %"mask|case_match117", %"mask|break_mask86"
+  %"finished&func132" = and <16 x i1> %__mask, %"mask|break_mask126"
+  %mask_i16.i.i198 = bitcast <16 x i1> %"finished&func132" to i16
%"equal_finished&func132_internal_mask&function_mask13" = icmp eq i16 %mask_i16.i.i214, %mask_i16.i.i198 + br i1 %"equal_finished&func132_internal_mask&function_mask13", label %switch_done, label %not_all_continued_or_breaked135 + +not_all_continued_or_breaked135: + %"mask|case_match157" = and <16 x i1> %"entry_mask&case_match155191", %"oldMask&test213" + %add_res_load162_mul_a_load160_ = select <16 x i1> %"mask|case_match157", <16 x i32> %mul_a_load160_, <16 x i32> zeroinitializer + %8 = add nsw <16 x i32> %7, %add_res_load162_mul_a_load160_ + br label %switch_done +} + diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll index 9c072e6f5e3fc..b03907d6c871f 100644 --- a/llvm/test/CodeGen/X86/pr77459.ll +++ b/llvm/test/CodeGen/X86/pr77459.ll @@ -100,8 +100,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) { ; AVX512-NEXT: vpmovm2d %k0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpmovd2m %ymm0, %k0 -; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: vmovmskps %ymm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -227,8 +226,7 @@ define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) { ; AVX512-V4-NEXT: vpmovm2b %k0, %ymm0 ; AVX512-V4-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16] ; AVX512-V4-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512-V4-NEXT: vpmovb2m %ymm0, %k0 -; AVX512-V4-NEXT: kmovd %k0, %eax +; AVX512-V4-NEXT: vpmovmskb %ymm0, %eax ; AVX512-V4-NEXT: vzeroupper ; AVX512-V4-NEXT: retq ; @@ -238,8 +236,7 @@ define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) { ; AVX512-VBMI-NEXT: vpmovm2b %k0, %ymm0 ; AVX512-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512-VBMI-NEXT: vpmovb2m %ymm0, %k0 -; AVX512-VBMI-NEXT: kmovd %k0, %eax +; AVX512-VBMI-NEXT: vpmovmskb %ymm0, %eax ; AVX512-VBMI-NEXT: vzeroupper ; AVX512-VBMI-NEXT: retq %cmp = icmp eq <32 x i8> %a0, %a1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index f434fc8c6cad8..116dcdc8c5907 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -1240,8 +1240,7 @@ define i8 @icmp0_v8i1(<8 x i8>) nounwind { ; AVX512VL-LABEL: icmp0_v8i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: vpmovmskb %xmm0, %eax ; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq @@ -1907,8 +1906,7 @@ define i8 @icmp1_v8i1(<8 x i8>) nounwind { ; AVX512VL-LABEL: icmp1_v8i1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: vpmovmskb %xmm0, %eax ; AVX512VL-NEXT: cmpb $-1, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 2b89590a0bb41..9645f7c524cb4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -573,8 +573,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = 
[8,2,10,3,3,2,2,3] ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 -; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: vmovmskps %ymm2, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -615,8 +614,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 -; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: vmovmskps %ymm0, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -661,8 +659,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 -; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: vmovmskps %ymm2, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -703,8 +700,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 -; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: vmovmskps %ymm0, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -746,8 +742,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7] -; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 -; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: vmovmskps %ymm0, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -796,8 +791,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 -; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 -; VL_BW_DQ-NEXT: kmovd %k0, %eax +; VL_BW_DQ-NEXT: vmovmskps %ymm2, %eax ; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq