Wave Transform to generate SSA Exec mask manipulation instrs #789
Base: amd-feature/wave-transform
Commits: 10c69e3, 54b5f3b, 7a31f7f, 35f7d2c
ControlFlowRewriter changes:

```diff
@@ -1839,7 +1839,7 @@ void ControlFlowRewriter::rewrite() {
       Opcode = AMDGPU::S_CBRANCH_SCC1;
     } else {
       Register CondReg = Info.OrigCondition;
-      if (!LMA.isSubsetOfExec(CondReg, *Node->Block)) {
+      if (!LMA.isSubsetOfExec(CondReg, *Node->Block, Node->Block->end())) {
         CondReg = LMU.createLaneMaskReg();
         BuildMI(*Node->Block, Node->Block->end(), {}, TII.get(LMC.AndOpc),
                 CondReg)
```
```diff
@@ -1937,7 +1937,7 @@ void ControlFlowRewriter::rewrite() {
       }
     } else {
       CondReg = LaneOrigin.CondReg;
-      if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block)) {
+      if (!LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator())) {
         Register Prev = CondReg;
         CondReg = LMU.createLaneMaskReg();
         BuildMI(*LaneOrigin.Node->Block,
```
```diff
@@ -2033,28 +2033,33 @@ void ControlFlowRewriter::rewrite() {
         CFGNodeInfo &PredInfo = NodeInfo.find(Pred)->second;
         Register PrimaryExec = PredInfo.PrimarySuccessorExec;
 
-        MachineInstr *PrimaryExecDef;
-        for (;;) {
-          PrimaryExecDef = MRI.getVRegDef(PrimaryExec);
-          if (PrimaryExecDef->getOpcode() != AMDGPU::COPY)
-            break;
-          PrimaryExec = PrimaryExecDef->getOperand(1).getReg();
-        }
+        // Turning off this copy-chain optimization to retain the Accumulator
+        // as the PrimaryExec.
+        // MachineInstr *PrimaryExecDef;
+        // for (;;) {
+        //   PrimaryExecDef = MRI.getVRegDef(PrimaryExec);
+        //   if (PrimaryExecDef->getOpcode() != AMDGPU::COPY)
+        //     break;
+        //   PrimaryExec = PrimaryExecDef->getOperand(1).getReg();
+        // }
 
         // Rejoin = EXEC ^ PrimaryExec
         //
         // Fold immediately if PrimaryExec was obtained via XOR as well.
         Register Rejoin;
 
-        if (PrimaryExecDef->getParent() == Pred->Block &&
-            PrimaryExecDef->getOpcode() == LMC.XorOpc &&
-            PrimaryExecDef->getOperand(1).isReg() &&
-            PrimaryExecDef->getOperand(2).isReg()) {
-          if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg)
-            Rejoin = PrimaryExecDef->getOperand(2).getReg();
-          else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg)
-            Rejoin = PrimaryExecDef->getOperand(1).getReg();
-        }
+        // Turning off this XOR optimization since buildMergeLaneMasks() will
+        // not introduce an XOR instruction for creating the PrimaryExec.
+        // if (PrimaryExecDef->getParent() == Pred->Block &&
+        //     PrimaryExecDef->getOpcode() == LMC.XorOpc &&
+        //     PrimaryExecDef->getOperand(1).isReg() &&
+        //     PrimaryExecDef->getOperand(2).isReg()) {
+        //   if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg)
+        //     Rejoin = PrimaryExecDef->getOperand(2).getReg();
+        //   else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg)
+        //     Rejoin = PrimaryExecDef->getOperand(1).getReg();
+        // }
 
         if (!Rejoin) {
           // Try to find a previously generated XOR (or merely masked) value
```

Reviewer: Commented-out code won't look good; better to clean it all up. What is the significance of adding the comment above it? Are you planning to implement a similar optimization for the ACC-based non-SSA form? If yes, leave a strong note mentioning that (and still clean up the commented-out code); otherwise, remove the comment as well.
```diff
@@ -2091,7 +2096,7 @@ void ControlFlowRewriter::rewrite() {
     LLVM_DEBUG(Function.dump());
   }
 
+  Updater.insertAccumulatorResets();
   Updater.cleanup();
 }
```
GCN lane-mask utility changes (GCNLaneMaskUtils, GCNLaneMaskAnalysis, GCNLaneMaskUpdater):
```diff
@@ -31,13 +31,12 @@ bool GCNLaneMaskUtils::maybeLaneMask(Register Reg) const {
 /// Determine whether the lane-mask register \p Reg is a wave-wide constant.
 /// If so, the value is stored in \p Val.
-bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val) const {
+bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const {
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  const MachineInstr *MI;
   for (;;) {
-    MI = MRI.getVRegDef(Reg);
-    if (!MI) {
+    MI = MRI.getDomVRegDefInBasicBlock(Reg, MBB, MI);
+    if (MI == MBB.end()) {
       // This can happen when called from GCNLaneMaskUpdater, where Reg can
       // be a placeholder that has not yet been filled in.
       return false;
```
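Note: getDomVRegDefInBasicBlock() is not an upstream MachineRegisterInfo API, and its implementation is not part of this diff. Judging from the call sites, it returns the definition of Reg that reaches the given point inside the block, or MBB.end() if no such definition exists; that is what keeps the query well-defined once lane masks are no longer in SSA form. A minimal sketch of those assumed semantics (a hypothetical free-function form, not the PR's actual code):

```cpp
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"
using namespace llvm;

// Walk backwards from I within MBB and return the closest preceding
// instruction that defines Reg, or MBB.end() if Reg has no definition
// reaching the query point in this block. Sketch only, under the
// assumptions stated above.
static MachineBasicBlock::iterator
getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator I) {
  for (MachineBasicBlock::iterator It = I; It != MBB.begin();) {
    --It;
    if (It->definesRegister(Reg, /*TRI=*/nullptr))
      return It;
  }
  return MBB.end();
}
```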
```diff
@@ -100,18 +99,20 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const {
 /// properly masked, i.e. use PrevReg directly instead of
 /// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg
 /// beyond (CurReg & EXEC).
+/// \param isPrevZeroReg Indicates that PrevReg is a zero register.
 void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator I,
                                            const DebugLoc &DL, Register DstReg,
                                            Register PrevReg, Register CurReg,
                                            GCNLaneMaskAnalysis *LMA,
-                                           bool accumulating) const {
+                                           bool accumulating,
+                                           bool isPrevZeroReg) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool PrevVal = false;
-  bool PrevConstant = !PrevReg || isConstantLaneMask(PrevReg, PrevVal);
+  bool PrevConstant = !PrevReg || isPrevZeroReg;
   bool CurVal = false;
-  bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+  bool CurConstant = isConstantLaneMask(CurReg, CurVal, MBB, I);
 
   assert(PrevReg || !accumulating);
```
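For reference, the merge this helper materializes (per its doc comment) is DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC); the new isPrevZeroReg flag replaces the isConstantLaneMask() query on PrevReg, which is no longer reliable once the accumulator register has multiple in-block definitions. A hedged sketch of the general path with no constant folding, written in terms of the surrounding function's parameters (the LMC name follows the LaneMaskConstants usage seen elsewhere in this file):

```cpp
// Sketch of the unoptimized merge sequence; LMC.AndN2Opc/AndOpc/OrOpc map to
// S_ANDN2/S_AND/S_OR in the wave-size-appropriate width.
Register PrevMaskedReg = createLaneMaskReg();
Register CurMaskedReg = createLaneMaskReg();
BuildMI(MBB, I, DL, TII->get(LMC.AndN2Opc), PrevMaskedReg)
    .addReg(PrevReg)
    .addReg(LMC.ExecReg); // PrevReg & ~EXEC: keep the inactive lanes' bits
BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), CurMaskedReg)
    .addReg(CurReg)
    .addReg(LMC.ExecReg); // CurReg & EXEC: take the active lanes' bits
BuildMI(MBB, I, DL, TII->get(LMC.OrOpc), DstReg)
    .addReg(PrevMaskedReg)
    .addReg(CurMaskedReg); // combine into DstReg
```

The PrevConstant/CurConstant flags above let the helper skip one or both masking steps when an operand is a known constant or already a subset of EXEC.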
```diff
@@ -147,7 +148,7 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
   }
   if (!CurConstant) {
     if ((PrevConstant && PrevVal) ||
-        (LMA && LMA->isSubsetOfExec(CurReg, MBB))) {
+        (LMA && LMA->isSubsetOfExec(CurReg, MBB, I))) {
       CurMaskedReg = CurReg;
     } else {
       CurMaskedReg = createLaneMaskReg();
```
```diff
@@ -188,22 +189,26 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB,
 /// (Reg & EXEC) == Reg when used in \p UseBlock.
 bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg,
                                          MachineBasicBlock &UseBlock,
+                                         MachineBasicBlock::iterator I,
                                          unsigned RemainingDepth) {
   MachineRegisterInfo &MRI = LMU.function()->getRegInfo();
-  MachineInstr *DefInstr = nullptr;
+  MachineBasicBlock::iterator DefInstr = UseBlock.end();
   const AMDGPU::LaneMaskConstants &LMC = LMU.getLaneMaskConsts();
 
   for (;;) {
     if (!Register::isVirtualRegister(Reg)) {
       if (Reg == LMC.ExecReg &&
-          (!DefInstr || DefInstr->getParent() == &UseBlock))
+          (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock))
         return true;
       return false;
     }
 
-    DefInstr = MRI.getVRegDef(Reg);
+    DefInstr = MRI.getDomVRegDefInBasicBlock(Reg, UseBlock, I);
+    if (DefInstr == UseBlock.end())
+      return false;
     if (DefInstr->getOpcode() == AMDGPU::COPY) {
       Reg = DefInstr->getOperand(1).getReg();
+      I = DefInstr;
       continue;
     }
```
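Why the iterator now threads through the walk: in the accumulator-based (non-SSA) form a lane-mask register can be redefined within a block, so the analysis has to reason about the definition that actually reaches the query point. A hypothetical illustration (pseudo-MIR in comments; wave64 opcodes assumed):

```cpp
// Hypothetical non-SSA sequence:
//
//   %acc = S_MOV_B64 0               ; def #1: accumulator reset
//   ...
//   %acc = S_AND_B64 %cond, $exec    ; def #2: a subset of EXEC
//   <query point I>
//
// isSubsetOfExec(%acc, MBB, I) must inspect def #2, the definition reaching
// I; a plain MRI.getVRegDef(%acc) is ill-defined once %acc has multiple
// definitions. This is also why the recursive calls below pass DefInstr as
// the new query point for the operands.
```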
```diff
@@ -242,7 +247,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg,
   if ((LikeOr || IsAnd || IsAndN2) &&
       (DefInstr->getOperand(1).isReg() && DefInstr->getOperand(2).isReg())) {
     bool FirstIsSubset = isSubsetOfExec(DefInstr->getOperand(1).getReg(),
-                                        UseBlock, RemainingDepth);
+                                        UseBlock, DefInstr, RemainingDepth);
     if (!FirstIsSubset && (LikeOr || IsAndN2))
       return SubsetOfExec.try_emplace(Reg, false).first->second;
```
```diff
@@ -252,7 +257,7 @@ bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg,
   }
 
   bool SecondIsSubset = isSubsetOfExec(DefInstr->getOperand(2).getReg(),
-                                       UseBlock, RemainingDepth);
+                                       UseBlock, DefInstr, RemainingDepth);
   if (!SecondIsSubset)
     return SubsetOfExec.try_emplace(Reg, false).first->second;
```
```diff
@@ -268,14 +273,14 @@ void GCNLaneMaskUpdater::init(Register Reg) {
   Processed = false;
   Blocks.clear();
+  // SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC);
   SSAUpdater.Initialize(Reg);
   Accumulator = {};
 }
 
 /// Optional cleanup, may remove stray instructions.
 void GCNLaneMaskUpdater::cleanup() {
   Processed = false;
   Blocks.clear();
   Accumulator = {};
   MachineRegisterInfo &MRI = LMU.function()->getRegInfo();
 
   if (ZeroReg && MRI.use_empty(ZeroReg)) {
```
```diff
@@ -330,7 +335,7 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block,
 Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) {
   if (!Processed)
     process();
-  return SSAUpdater.GetValueInMiddleOfBlock(&Block);
+  return Accumulator;
 }
 
 /// Return the value at the end of the given block, i.e. after any change that
```
```diff
@@ -342,7 +347,7 @@ Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) {
 Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) {
   if (!Processed)
     process();
-  return SSAUpdater.GetValueAtEndOfBlock(&Block);
+  return Accumulator;
 }
 
 /// Return the value in \p Block after the value merge (if any).
```
```diff
@@ -352,15 +357,15 @@ Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) {
   auto BlockIt = findBlockInfo(Block);
   if (BlockIt != Blocks.end()) {
     if (BlockIt->Merged)
       return BlockIt->Merged;
+    if (BlockIt->Value)
+      return Accumulator;
     if (BlockIt->Flags & ResetInMiddle)
       return ZeroReg;
   }
 
   // We didn't merge anything in the block, but the block may still be
   // ResetAtEnd, in which case we need the pre-reset value.
-  return SSAUpdater.GetValueInMiddleOfBlock(&Block);
+  return Accumulator;
 }
 
 /// Determine whether \p MI defines and/or uses SCC.
```
```diff
@@ -422,22 +427,22 @@ void GCNLaneMaskUpdater::process() {
         .addImm(0);
   }
 
   // Add available values.
+  if (!Accumulator) {
+    Accumulator = LMU.createLaneMaskReg();
+    BuildMI(Entry, Entry.getFirstTerminator(), {},
+            TII->get(LMU.getLaneMaskConsts().MovOpc), Accumulator)
+        .addImm(0);
+  }
+
+  // Reset accumulator.
   for (BlockInfo &Info : Blocks) {
     assert(Accumulating || !Info.Flags);
     assert(Info.Flags || Info.Value);
 
     if (Info.Value)
       Info.Merged = LMU.createLaneMaskReg();
 
-    SSAUpdater.AddAvailableValue(
-        Info.Block,
-        (Info.Value && !(Info.Flags & ResetAtEnd)) ? Info.Merged : ZeroReg);
+    if (!Info.Value || (Info.Flags & ResetAtEnd))
+      AccumulatorResetBlocks[Info.Block].insert(Accumulator);
   }
 
   if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry))
     SSAUpdater.AddAvailableValue(&Entry, ZeroReg);
 
   // Once the SSA updater is ready, we can fill in all merge code, relying
   // on the SSA updater to insert required PHIs.
   for (BlockInfo &Info : Blocks) {
```
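AccumulatorResetBlocks itself is not declared anywhere in this diff; from its uses here and in insertAccumulatorResets() below, it presumably has a shape along these lines (an assumption, not visible in the patch):

```cpp
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"

// Presumed GCNLaneMaskUpdater member: for every block that needs one, the
// set of accumulator registers that must be re-zeroed at the end of that
// block. Consumed by insertAccumulatorResets().
DenseMap<MachineBasicBlock *, DenseSet<Register>> AccumulatorResetBlocks;
```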
```diff
@@ -448,11 +453,8 @@ void GCNLaneMaskUpdater::process() {
     Register Previous;
     if (Info.Block != &LMU.function()->front() &&
         !(Info.Flags & ResetInMiddle)) {
-      Previous = SSAUpdater.GetValueInMiddleOfBlock(Info.Block);
-      if (Accumulating) {
-        assert(!MRI.getVRegDef(Previous) ||
-               MRI.getVRegDef(Previous)->getOpcode() != AMDGPU::IMPLICIT_DEF);
-      } else {
+      Previous = Accumulator;
+      if (!Accumulating) {
         MachineInstr *PrevInstr = MRI.getVRegDef(Previous);
         if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) {
           PotentiallyDead.insert(PrevInstr);
```
```diff
@@ -466,18 +468,20 @@ void GCNLaneMaskUpdater::process() {
     // Insert merge logic.
     MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block);
-    LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Info.Merged, Previous,
-                            Info.Value, LMA, Accumulating);
-
-    if (Info.Flags & ResetAtEnd) {
-      MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged);
-      if (mergeInstr->getOpcode() == AMDGPU::COPY &&
-          mergeInstr->getOperand(1).getReg().isVirtual()) {
-        assert(MRI.use_empty(Info.Merged));
-        Info.Merged = mergeInstr->getOperand(1).getReg();
-        mergeInstr->eraseFromParent();
-      }
-    }
+    LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous,
+                            Info.Value, LMA, Accumulating, Previous == ZeroReg);
+
+    // Switching off this optimization, since Accumulator will always have a use.
+    // if (Info.Flags & ResetAtEnd) {
+    //   MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged);
+    //   if (mergeInstr->getOpcode() == AMDGPU::COPY &&
+    //       mergeInstr->getOperand(1).getReg().isVirtual()) {
+    //     assert(MRI.use_empty(Info.Merged));
+    //     Info.Merged = mergeInstr->getOperand(1).getReg();
+    //     mergeInstr->eraseFromParent();
+    //   }
+    // }
   }
 
   Processed = true;
```
```diff
@@ -489,3 +493,18 @@ GCNLaneMaskUpdater::findBlockInfo(MachineBasicBlock &Block) {
   return llvm::find_if(
       Blocks, [&](const auto &Entry) { return Entry.Block == &Block; });
 }
+
+void GCNLaneMaskUpdater::insertAccumulatorResets() {
+  const SIInstrInfo *TII = LMU.function()->getSubtarget<GCNSubtarget>().getInstrInfo();
+  for (auto &Entry : AccumulatorResetBlocks) {
+    MachineBasicBlock *B = Entry.first;
+    DenseSet<Register> &Accumulators = Entry.second;
+    for (Register ACC : Accumulators) {
+      // Get the first branch instruction.
+      MachineBasicBlock::iterator I = B->getFirstTerminator();
+      while (I != B->end() && !I->isBranch()) I++;
+      if (I == B->end()) I--;
+      BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC).addImm(0);
+    }
+  }
+}
```
Reviewer: Can't you insert all the resets one after another once you find the right place, rather than searching for the right insertion place for every accumulator to reset? That seems a bit expensive!

Author (lalaniket8): No, they need to be inserted at the end of the basic blocks, right before the first branch instruction.

Reviewer: Makes sense!

Reviewer: The idea of having the write to EXEC be a MovTermOpc breaks the new (non-SSA) flow, since we need to insert the accumulator reset at the end of the BB, before the actual terminator instructions. The better approach would be to delay the insertion of the EXEC write alongside the ACC reset routine. There could be challenges, since we might not reset the ACC every time; however, if we knew earlier about the need for an ACC reset in the block, we could handle those blocks specially, and it can still be done without introducing MovTermOpc.

Reviewer: I don't understand this part; writing to EXEC as a MovTermOpc seems independent from writing to the accumulator.

Author (lalaniket8): EXEC insertions happen in two stages: first for all divergent incoming BBs, then for secondary BBs (creating the rejoin masks). In the second stage, when we set EXEC to the computed rejoin masks, the insertion point is found by iterating from the first terminator (the MovTermOpc introduced by stage 1) in getSaluInsertionAtEnd().

Author (lalaniket8): That's true, but the order of instructions breaks the verifier, since it sees ACC reset instructions after a MovTermOpc.

Author (lalaniket8): Yes, this is a better approach; I will incorporate it.
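To make the verifier problem above concrete (a hedged illustration; wave64 opcodes assumed): insertAccumulatorResets() skips terminators up to the first branch, so the reset can land between the terminator-form EXEC write and the branch, which the MachineVerifier rejects as a non-terminator after the first terminator.

```cpp
// Ordering produced when the EXEC write is a MovTermOpc (pseudo-MIR):
//
//   $exec = S_MOV_B64_term %rejoin   ; terminator-form EXEC write (stage 1)
//   %acc  = S_MOV_B64 0              ; ACC reset: non-terminator after a
//                                    ; terminator -> verifier error
//   S_BRANCH %bb.join
//
// Delaying the EXEC write, as suggested above, lets the reset come first:
//
//   %acc  = S_MOV_B64 0
//   $exec = S_MOV_B64_term %rejoin
//   S_BRANCH %bb.join
```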