diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 32ecf350db59cf..875738dad74ced 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1460,7 +1460,15 @@ bool SIFoldOperands::tryFoldFoldableCopy( return false; } - MachineOperand &OpToFold = MI.getOperand(1); + MachineOperand *OpToFoldPtr; + if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) { + // Folding when any src_modifiers are non-zero is unsupported + if (TII->hasAnyModifiersSet(MI)) + return false; + OpToFoldPtr = &MI.getOperand(2); + } else + OpToFoldPtr = &MI.getOperand(1); + MachineOperand &OpToFold = *OpToFoldPtr; bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); // FIXME: We could also be folding things like TargetIndexes. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 463737f645d459..6f8874daa9c172 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3369,6 +3369,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B16_t16_e32: + case AMDGPU::V_MOV_B16_t16_e64: case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: @@ -5635,7 +5637,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Size = RI.getRegSizeInBits(*RC); - unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; + unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO + : Size == 16 ? 
AMDGPU::V_MOV_B16_t16_e64 + : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 79bcf5e8cd30d4..155747551471e3 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -657,6 +657,7 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs( // although requirements match the pass placement and it reduces code size too. MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || MovT.getOpcode() == AMDGPU::COPY); Register T = MovT.getOperand(0).getReg(); @@ -668,7 +669,12 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { Register X = Xop.getReg(); unsigned Xsub = Xop.getSubReg(); - unsigned Size = TII->getOpSize(MovT, 0) / 4; + unsigned Size = TII->getOpSize(MovT, 0); + + // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers + // are not allocatable. 
+ if (Size == 2 && X.isVirtual()) + return nullptr; if (!TRI->isVGPR(*MRI, X)) return nullptr; @@ -684,9 +690,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { KilledT = MovY->killsRegister(T, TRI); if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && + MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && MovY->getOpcode() != AMDGPU::COPY) || - !MovY->getOperand(1).isReg() || - MovY->getOperand(1).getReg() != T || + !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T || MovY->getOperand(1).getSubReg() != Tsub) continue; @@ -714,6 +720,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { } if (MovX || (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && + I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && I->getOpcode() != AMDGPU::COPY) || I->getOperand(0).getReg() != X || I->getOperand(0).getSubReg() != Xsub) { @@ -721,7 +728,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { break; } - if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U))) + if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 
0U : 1U))) continue; MovX = &*I; @@ -730,23 +737,40 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { if (!MovX) continue; - LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY); + LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY); - for (unsigned I = 0; I < Size; ++I) { - TargetInstrInfo::RegSubRegPair X1, Y1; - X1 = getSubRegForIndex(X, Xsub, I); - Y1 = getSubRegForIndex(Y, Ysub, I); - MachineBasicBlock &MBB = *MovT.getParent(); + MachineBasicBlock &MBB = *MovT.getParent(); + SmallVector<MachineInstr *> Swaps; + if (Size == 2) { auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), - TII->get(AMDGPU::V_SWAP_B32)) - .addDef(X1.Reg, 0, X1.SubReg) - .addDef(Y1.Reg, 0, Y1.SubReg) - .addReg(Y1.Reg, 0, Y1.SubReg) - .addReg(X1.Reg, 0, X1.SubReg).getInstr(); - if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { - // Drop implicit EXEC. - MIB->removeOperand(MIB->getNumExplicitOperands()); - MIB->copyImplicitOps(*MBB.getParent(), *MovX); + TII->get(AMDGPU::V_SWAP_B16)) + .addDef(X) + .addDef(Y) + .addReg(Y) + .addReg(X) + .getInstr(); + Swaps.push_back(MIB); + } else { + assert(Size > 0 && Size % 4 == 0); + for (unsigned I = 0; I < Size / 4; ++I) { + TargetInstrInfo::RegSubRegPair X1, Y1; + X1 = getSubRegForIndex(X, Xsub, I); + Y1 = getSubRegForIndex(Y, Ysub, I); + auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), + TII->get(AMDGPU::V_SWAP_B32)) + .addDef(X1.Reg, 0, X1.SubReg) + .addDef(Y1.Reg, 0, Y1.SubReg) + .addReg(Y1.Reg, 0, Y1.SubReg) + .addReg(X1.Reg, 0, X1.SubReg) + .getInstr(); + Swaps.push_back(MIB); + } + } + // Drop implicit EXEC. 
+ if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { + for (MachineInstr *Swap : Swaps) { + Swap->removeOperand(Swap->getNumExplicitOperands()); + Swap->copyImplicitOps(*MBB.getParent(), *MovX); } } MovX->eraseFromParent(); @@ -833,6 +857,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || MI.getOpcode() == AMDGPU::COPY)) { if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); @@ -1023,7 +1048,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::NoVRegs)) continue; - if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && + if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && !shouldShrinkTrue16(MI)) continue; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 0a2e338b347871..34d12aa5e07835 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -751,7 +751,7 @@ let SubtargetPredicate = isGFX11Plus in { let IsInvalidSingleUseConsumer = 1; let IsInvalidSingleUseProducer = 1; } - defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16>; + defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index d732da1a67bc1f..970bb08e1838b2 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2131,26 +2131,14 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX10-NEXT: global_store_short v[2:3], v5, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: test_store_fpimm: -; 
GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v5, off -; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v4, off -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: test_store_fpimm: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80 -; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228 -; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off -; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: test_store_fpimm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX11-NEXT: global_store_b16 v[0:1], v4, off +; GFX11-NEXT: global_store_b16 v[2:3], v5, off +; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat 1.0, ptr addrspace(1) %ptr0 store bfloat 42.0, ptr addrspace(1) %ptr1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 7352fcdd071d5b..9fe7544003568c 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -246,9 +246,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; 
GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -264,9 +262,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -390,9 +386,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -408,9 +402,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll new file mode 100644 index 00000000000000..1f36f7a0d9616e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc 
-march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s + +define half @swap(half %a, half %b, i32 %i) { +; GFX11-TRUE16-LABEL: swap: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB0_1: ; %loop +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_swap_b16 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: swap: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 +; 
GFX11-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: swap: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB0_1: ; %loop +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: swap: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop +; GFX12-FAKE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 +; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +entry: + br label %loop + +loop: + %x = phi half [%a, %entry], [%y, %loop] + %y = phi half [%b, %entry], [%x, %loop] + %i2 = phi i32 [%i, %entry], [%i3, %loop] + + %i3 = sub i32 %i2, 1 + + %cmp = icmp eq i32 %i3, 0 + br i1 %cmp, label %ret, label %loop + +ret: + ret half %x +} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s index 7f99afe0192599..68442b01bf7d90 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s @@ -169,21 +169,3 @@ s_load_b96 s[20:22], s[2:3], s0 s_buffer_load_b96 s[20:22], s[4:7], s0 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU - -v_mov_b16 v0.l, s0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, ttmp0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, a0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, s0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, ttmp0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, a0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s 
b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s new file mode 100644 index 00000000000000..aa2309dd7d5d7c --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s @@ -0,0 +1,10 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s + +v_mov_b16 v0.l, s0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_mov_b16 v0.l, ttmp0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_mov_b16 v0.l, a0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction