Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 6 additions & 24 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,6 @@ class SIFoldOperandsImpl {
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
Expand Down Expand Up @@ -1567,24 +1566,6 @@ static unsigned getMovOpc(bool IsScalar) {
return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

// Returns the constant value that Op stands for, if one can be found:
//  - if Op is an immediate, its value;
//  - if Op is a virtual register whose definition is a move-immediate with an
//    immediate source, the result of extractSubregFromImm applied to that
//    source and Op's subregister index.
// Returns std::nullopt in every other case.
std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  // Only virtual registers are looked through.
  const bool IsVirtReg = Op.isReg() && Op.getReg().isVirtual();
  if (!IsVirtReg)
    return std::nullopt;

  const MachineInstr *MovImm = MRI->getVRegDef(Op.getReg());
  if (!MovImm || !MovImm->isMoveImmediate())
    return std::nullopt;

  const MachineOperand &MovSrc = MovImm->getOperand(1);
  if (!MovSrc.isImm())
    return std::nullopt;

  return TII->extractSubregFromImm(MovSrc.getImm(), Op.getSubReg());
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
Expand All @@ -1599,7 +1580,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
return false;

MachineOperand *Src0 = &MI->getOperand(Src0Idx);
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);

if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) &&
Expand All @@ -1615,7 +1596,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
return false;

MachineOperand *Src1 = &MI->getOperand(Src1Idx);
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);

if (!Src0Imm && !Src1Imm)
return false;
Expand Down Expand Up @@ -1706,11 +1687,11 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
if (!Src1Imm)
return false;

std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
if (!Src0Imm || *Src0Imm != *Src1Imm)
return false;
}
Expand Down Expand Up @@ -1744,7 +1725,8 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
MI.getOpcode() != AMDGPU::V_AND_B32_e32)
return false;

std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
std::optional<int64_t> Src0Imm =
TII->getImmOrMaterializedImm(MI.getOperand(1));
if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
return false;

Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 +1372,24 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
}
}

// Returns the constant value that Op stands for, if one can be found.
// An immediate operand yields its own value. A virtual-register operand whose
// definition is a move-immediate with an immediate source yields
// extractSubregFromImm(source immediate, Op's subregister index).
// Anything else yields std::nullopt.
//
// The register info is obtained through Op's parent instruction, so a register
// operand is expected to be attached to an instruction that lives in a
// function.
std::optional<int64_t>
SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!(Op.isReg() && Op.getReg().isVirtual()))
    return std::nullopt;

  const MachineRegisterInfo &RegInfo = Op.getParent()->getMF()->getRegInfo();
  const MachineInstr *DefMI = RegInfo.getVRegDef(Op.getReg());
  if (!DefMI || !DefMI->isMoveImmediate())
    return std::nullopt;

  const MachineOperand &MovedValue = DefMI->getOperand(1);
  if (MovedValue.isImm())
    return extractSubregFromImm(MovedValue.getImm(), Op.getSubReg());

  return std::nullopt;
}

unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {

if (RI.isAGPRClass(DstRC))
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
int64_t &ImmVal) const override;

std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;

unsigned getVectorRegSpillSaveOpcode(Register Reg,
const TargetRegisterClass *RC,
unsigned Size,
Expand Down Expand Up @@ -1019,6 +1021,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
}

// Returns true if the ASYNC_CNT bit is set in the TSFlags of MI's instruction
// descriptor.
static bool usesASYNC_CNT(const MachineInstr &MI) {
  const auto Flags = MI.getDesc().TSFlags;
  return (Flags & SIInstrFlags::ASYNC_CNT) != 0;
}

// Opcode-based variant: returns true if the ASYNC_CNT bit is set in the
// TSFlags of the descriptor that get(Opcode) returns.
bool usesASYNC_CNT(uint16_t Opcode) const {
  const auto Flags = get(Opcode).TSFlags;
  return (Flags & SIInstrFlags::ASYNC_CNT) != 0;
}

// Most sopk treat the immediate as a signed 16-bit, however some
// use it as unsigned.
static bool sopkIsZext(unsigned Opcode) {
Expand Down
136 changes: 110 additions & 26 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ class SILoadStoreOptimizer {

unsigned LoSubReg = 0;
unsigned HiSubReg = 0;
// True when using V_ADD_U64_e64 pattern
bool UseV64Pattern = false;
};

struct MemAddress {
Expand Down Expand Up @@ -279,9 +281,12 @@ class SILoadStoreOptimizer {

void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const;
Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
bool processBaseWithConstOffset64(MachineInstr *AddDef,
const MachineOperand &Base,
MemAddress &Addr) const;
void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
/// Promotes constant offset to the immediate by adjusting the base. It
/// tries to use a base from the nearby instructions that allows it to have
Expand Down Expand Up @@ -2165,6 +2170,33 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
MachineBasicBlock::iterator MBBI = MI.getIterator();
const DebugLoc &DL = MI.getDebugLoc();

LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");

// Use V_ADD_U64_e64 when the original pattern used it (gfx1250+)
if (Addr.Base.UseV64Pattern) {
Register FullDestReg = MRI->createVirtualRegister(
TII->getRegClass(TII->get(AMDGPU::V_ADD_U64_e64), 0));

// Load the 64-bit offset into an SGPR pair if needed
Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstr *MovOffset =
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
OffsetReg)
.addImm(Addr.Offset);
MachineInstr *Add64 =
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_U64_e64), FullDestReg)
.addReg(Addr.Base.LoReg)
.addReg(OffsetReg, RegState::Kill)
.addImm(0);
(void)MovOffset;
(void)Add64;
LLVM_DEBUG(dbgs() << " " << *MovOffset << "\n";
dbgs() << " " << *Add64 << "\n\n";);

return FullDestReg;
}

// Original carry-chain pattern (V_ADD_CO_U32 + V_ADDC_U32)
assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
Addr.Base.LoSubReg) &&
"Expected 32-bit Base-Register-Low!!");
Expand All @@ -2173,7 +2205,6 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
Addr.Base.HiSubReg) &&
"Expected 32-bit Base-Register-Hi!!");

LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
MachineOperand OffsetHi =
createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
Expand All @@ -2190,8 +2221,6 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
.addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg)
.add(OffsetLo)
.addImm(0); // clamp bit
(void)LoHalf;
LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

MachineInstr *HiHalf =
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
Expand All @@ -2200,8 +2229,6 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
.add(OffsetHi)
.addReg(CarryReg, RegState::Kill)
.addImm(0); // clamp bit
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
MachineInstr *FullBase =
Expand All @@ -2210,8 +2237,13 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);

(void)LoHalf;
(void)HiHalf;
(void)FullBase;
LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
LLVM_DEBUG(dbgs() << " " << *LoHalf << "\n";
dbgs() << " " << *HiHalf << "\n";
dbgs() << " " << *FullBase << "\n\n";);

return FullDestReg;
}
Expand All @@ -2226,20 +2258,35 @@ void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
if (Op.isImm())
return Op.getImm();
// Helper to extract a 64-bit constant offset from a V_ADD_U64_e64 instruction.
// Returns true if successful, populating Addr with base register info and
// offset.
bool SILoadStoreOptimizer::processBaseWithConstOffset64(
MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const {
if (!Base.isReg())
return false;

if (!Op.isReg())
return std::nullopt;
MachineOperand *Src0 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1);

MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
!Def->getOperand(1).isImm())
return std::nullopt;
const MachineOperand *BaseOp = nullptr;

auto Offset = TII->getImmOrMaterializedImm(*Src1);

if (Offset) {
BaseOp = Src0;
Addr.Offset = *Offset;
} else {
// src1 is neither an immediate nor a materialized immediate - can't handle
// this pattern
return false;
}

return Def->getOperand(1).getImm();
// Now extract the base register (which should be a 64-bit VGPR).
MachineInstr *BaseDef = MRI->getVRegDef(BaseOp->getReg());
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BaseDef is unused when building without asserts.

assert(BaseDef && "Expected definition for base register");
Addr.Base.LoReg = BaseOp->getReg();
Addr.Base.UseV64Pattern = true;
return true;
}

// Analyzes Base and extracts:
Expand All @@ -2252,14 +2299,27 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
// %Base:vreg_64 =
// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
//
// Also handles V_ADD_U64_e64 pattern (gfx1250+):
// %OFFSET:sreg_64 = S_MOV_B64_IMM_PSEUDO 256
// %Base:vreg_64 = V_ADD_U64_e64 %BASE:vreg_64, %OFFSET:sreg_64, 0
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
MemAddress &Addr) const {
if (!Base.isReg())
return;

MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
|| Def->getNumOperands() != 5)
if (!Def)
return;

// Try V_ADD_U64_e64 pattern first (simpler, used on gfx1250+)
if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
if (processBaseWithConstOffset64(Def, Base, Addr))
return;
}

// Fall through to REG_SEQUENCE + V_ADD_CO_U32 + V_ADDC_U32 pattern
if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5)
return;

MachineOperand BaseLo = Def->getOperand(1);
Expand All @@ -2274,14 +2334,14 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
!BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
return;

const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
MachineOperand *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

auto Offset0P = extractConstOffset(*Src0);
auto Offset0P = TII->getImmOrMaterializedImm(*Src0);
if (Offset0P)
BaseLo = *Src1;
else {
if (!(Offset0P = extractConstOffset(*Src1)))
if (!(Offset0P = TII->getImmOrMaterializedImm(*Src1)))
return;
BaseLo = *Src0;
}
Expand Down Expand Up @@ -2311,6 +2371,26 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

// Maintain the correct LDS address for async loads.
// promoteConstantOffsetToImm writes OffsetDiff into MI's immediate offset, but
// that offset is only meant for the src operand. For instructions that use
// ASYNC_CNT, compensate by replacing the vdst register with a fresh virtual
// register holding (old vdst - OffsetDiff), computed by a V_ADD_U32_e64
// inserted immediately before MI.
// Does nothing when MI does not use ASYNC_CNT or when OffsetDiff is zero.
void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
                                                 int32_t OffsetDiff) const {
  if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
    return;

  // Look the operand up by name once and use it for both the read and the
  // write, rather than assuming vdst sits at operand index 0.
  MachineOperand *VDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(VDst && VDst->isReg() && "Expected a register vdst operand");

  const Register OldVDst = VDst->getReg();
  const Register NewVDst =
      MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
          TII->get(AMDGPU::V_ADD_U32_e64), NewVDst)
      .addReg(OldVDst)
      .addImm(-OffsetDiff)
      .addImm(0);

  VDst->setReg(NewVDst);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineInstr &MI,
MemInfoMap &Visited,
Expand Down Expand Up @@ -2440,7 +2520,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
// Instead of moving up, just re-compute anchor-instruction's base address.
Register Base = computeBase(MI, AnchorAddr);

updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
updateBaseAndOffset(MI, Base, OffsetDiff);
updateAsyncLDSAddress(MI, OffsetDiff);
LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
Expand All @@ -2451,7 +2533,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
if (TLI->isLegalFlatAddressingMode(AM, AS)) {
LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
OtherMI->dump());
updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
updateBaseAndOffset(*OtherMI, Base, OtherOffsetDiff);
updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff);
LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
}
}
Expand Down
Loading