@@ -41,6 +41,7 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
+STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");
 
 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                    cl::init(20), cl::Hidden);
@@ -77,12 +78,13 @@ typedef struct LdStPairFlags {
 
 struct AArch64LoadStoreOpt : public MachineFunctionPass {
   static char ID;
-  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
   }
 
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool IsStrictAlign;
 
   // Scan the instructions looking for a load/store that can be combined
   // with the current instruction into a load/store pair.
@@ -122,6 +124,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  // Find and merge foldable ldr/str instructions.
+  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -151,6 +156,7 @@ static bool isUnscaledLdSt(unsigned Opc) {
   case AArch64::LDURWi:
   case AArch64::LDURXi:
   case AArch64::LDURSWi:
+  case AArch64::LDURHHi:
     return true;
   }
 }
@@ -159,6 +165,20 @@ static bool isUnscaledLdSt(MachineInstr *MI) {
   return isUnscaledLdSt(MI->getOpcode());
 }
 
+static bool isSmallTypeLdMerge(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+    return true;
+    // FIXME: Add other instructions (e.g., LDRBBui, LDURSHWi, LDRSHWui, etc.).
+  }
+}
+static bool isSmallTypeLdMerge(MachineInstr *MI) {
+  return isSmallTypeLdMerge(MI->getOpcode());
+}
+
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
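
Editor's note: isSmallTypeLdMerge() and the LDURHHi entry added to getMemScale() below define the new candidates: unsigned halfword loads, scaled or unscaled, each covering 2 bytes. A minimal sketch of what the merged sequence computes, assuming a little-endian target and that all four bytes are dereferenceable (the function and variable names here are illustrative, not from the patch):

    #include <cstdint>
    #include <cstring>

    // Two adjacent 16-bit loads become one 32-bit load plus two extracts.
    void mergedHalfwordLoads(const uint8_t *p, uint32_t &lo, uint32_t &hi) {
      uint32_t w;
      std::memcpy(&w, p, sizeof(w)); // single word load: ldr
      lo = w & 0xffffu;              // low half:  and  wd, wn, #0xffff
      hi = (w >> 16) & 0xffffu;      // high half: ubfx wd, wn, #16, #16
    }
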
@@ -168,6 +188,7 @@ static int getMemScale(MachineInstr *MI) {
   case AArch64::STRBBui:
     return 1;
   case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
   case AArch64::STRHHui:
     return 2;
   case AArch64::LDRSui:
@@ -238,6 +259,8 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STURSi:
   case AArch64::LDRSui:
   case AArch64::LDURSi:
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
@@ -283,6 +306,10 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
   case AArch64::LDRSWui:
   case AArch64::LDURSWi:
     return AArch64::LDPSWi;
+  case AArch64::LDRHHui:
+    return AArch64::LDRWui;
+  case AArch64::LDURHHi:
+    return AArch64::LDURWi;
   }
 }
 
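Editor's note: observe the asymmetry in getMatchingPairOpcode() — every other case maps to a true load/store-pair opcode, but the halfword loads map to a single 32-bit load (LDRWui or LDURWi); mergePairedInsns() then synthesizes bitfield extracts to recover the two halves. In the style of the pass's own comments (registers and offsets illustrative):

    // ldrh w0, [x2]      ; LDRHHui, scaled offset 0
    // ldrh w1, [x2, #2]  ; LDRHHui, scaled offset 1
    // ; getMatchingPairOpcode(LDRHHui) == LDRWui, so the merged access is
    // ldr  w0, [x2]      ; the scaled halfword offset is halved to a word offset
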
@@ -440,6 +467,21 @@ static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
   return MI->getOperand(Idx);
 }
 
+// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+                                   MachineInstr *Op1) {
+  assert(MI->memoperands_empty() && "expected a new machineinstr");
+  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
+                      (Op1->memoperands_end() - Op1->memoperands_begin());
+
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+  MachineSDNode::mmo_iterator MemEnd =
+      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+  MI->setMemRefs(MemBegin, MemEnd);
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                       MachineBasicBlock::iterator Paired,
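
Editor's note: concatenateMemOperands() exists so the merged load keeps both original MachineMemOperands — later consumers (e.g. scheduling alias analysis) then still see two precise 2-byte references rather than one opaque access. A sketch of the intended call pattern; Ld0, Ld1, MBB, InsertPt, DL, and WordOffset are hypothetical names for this illustration, not code from the patch:

    // Build the replacement 32-bit load, then attach both original memoperands.
    MachineInstr *NewLd =
        BuildMI(MBB, InsertPt, DL, TII->get(AArch64::LDRWui))
            .addOperand(getLdStRegOp(Ld0))  // destination register to reuse
            .addOperand(getLdStBaseOp(Ld0)) // shared base register
            .addImm(WordOffset);            // scaled halfword offset / 2
    concatenateMemOperands(NewLd, Ld0, Ld1); // alias info for both halves
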
@@ -484,8 +526,79 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     RtMI = I;
     Rt2MI = Paired;
   }
-  // Handle Unscaled
+
   int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+
+  if (isSmallTypeLdMerge(Opc)) {
+    // Change the scaled offset from small to large type.
+    if (!IsUnscaled)
+      OffsetImm /= 2;
+    MachineInstr *RtNewDest = MergeForward ? I : Paired;
+    // Construct the new load instruction.
+    // FIXME: currently we support only halfword unsigned load. We need to
+    // handle byte type, signed, and store instructions as well.
+    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
+    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                       TII->get(NewOpc))
+                   .addOperand(getLdStRegOp(RtNewDest))
+                   .addOperand(BaseRegOp)
+                   .addImm(OffsetImm);
+
+    // Copy MachineMemOperands from the original loads.
+    concatenateMemOperands(NewMemMI, I, Paired);
+
+    DEBUG(
+        dbgs()
+        << "Creating the new load and extract. Replacing instructions:\n    ");
+    DEBUG(I->print(dbgs()));
+    DEBUG(dbgs() << "    ");
+    DEBUG(Paired->print(dbgs()));
+    DEBUG(dbgs() << "  with instructions:\n    ");
+    DEBUG((NewMemMI)->print(dbgs()));
+
+    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+    if (ExtDestMI == Rt2MI) {
+      // Create the bitfield extract for high half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+      // Create the bitfield extract for low half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+    } else {
+      // Create the bitfield extract for low half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+      // Create the bitfield extract for high half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+    }
+    DEBUG(dbgs() << "    ");
+    DEBUG((BitExtMI1)->print(dbgs()));
+    DEBUG(dbgs() << "    ");
+    DEBUG((BitExtMI2)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+
+    // Erase the old instructions.
+    I->eraseFromParent();
+    Paired->eraseFromParent();
+    return NextI;
+  }
+
+  // Handle Unscaled
   if (IsUnscaled)
     OffsetImm /= OffsetStride;
 
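Editor's note, decoding the extracts built above: UBFMWri with immr=16, imms=31 is the alias ubfx wd, wn, #16, #16 (high half), and the ANDWri immediate 15 is the encoded 32-bit logical immediate for 0xffff (N=0, immr=0, imms=15), i.e. and wd, wn, #0xffff (low half). The two emission orders exist because one extract overwrites the register holding the merged word, so it must be emitted last. A worked little-endian example, assuming I sits at the lower offset and MergeForward is false (so the word loads into Paired's register):

    // ldrh w0, [x2]      ; I == RtMI (lower offset)
    // ldrh w1, [x2, #2]  ; Paired == Rt2MI
    // ; becomes
    // ldr  w1, [x2]            ; NewMemMI; scaled offset halved for LDRWui
    // and  w0, w1, #0xffff     ; low half first (ANDWri, encoded imm 15)
    // ubfx w1, w1, #16, #16    ; high half last: it clobbers the merged word in w1
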
@@ -622,8 +735,7 @@ static bool mayAlias(MachineInstr *MIa,
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
-                                      LdStPairFlags &Flags,
-                                      unsigned Limit) {
+                                      LdStPairFlags &Flags, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr *FirstMI = I;
@@ -645,7 +757,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
-  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+  if (!isSmallTypeLdMerge(Opc) &&
+      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return E;
 
   // Track which registers have been modified and used between the first insn
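
Editor's note: the inBoundsForPair() test is deliberately skipped for small-type candidates. It checks the narrow signed immediate range of a load/store-pair (scaled offsets roughly -64..63), while the merged form is a single 32-bit load that keeps the ordinary 12-bit unsigned scaled range. For example (offsets illustrative):

    // ldrh w0, [x2, #128]  ; scaled halfword offset 64: outside the LDP range,
    // ldrh w1, [x2, #130]  ; yet fine as "ldr w0, [x2, #128]" (word offset 32)
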
@@ -704,18 +817,32 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
       // If the resultant immediate offset of merging these instructions
       // is out of range for a pairwise instruction, bail and keep looking.
       bool MIIsUnscaled = isUnscaledLdSt(MI);
-      if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+      bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
+      if (!IsSmallTypeLd &&
+          !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
         trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
         MemInsns.push_back(MI);
         continue;
       }
-      // If the alignment requirements of the paired (scaled) instruction
-      // can't express the offset of the unscaled input, bail and keep
-      // looking.
-      if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
-        trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
-        MemInsns.push_back(MI);
-        continue;
+
+      if (IsSmallTypeLd) {
+        // If the alignment requirements of the larger type scaled load
+        // instruction can't express the scaled offset of the smaller type
+        // input, bail and keep looking.
+        if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          MemInsns.push_back(MI);
+          continue;
+        }
+      } else {
+        // If the alignment requirements of the paired (scaled) instruction
+        // can't express the offset of the unscaled input, bail and keep
+        // looking.
+        if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          MemInsns.push_back(MI);
+          continue;
+        }
       }
       // If the destination register of the loads is the same register, bail
       // and keep looking. A load-pair instruction with both destination
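
Editor's note: the new alignTo(MinOffset, 2) != MinOffset test requires the lower scaled halfword offset to be even, because mergePairedInsns() halves it to form the word-load offset and an odd value is not representable. For example:

    // ldrh w0, [x2, #2]  ; scaled offset 1 (odd) -> no legal word offset,
    // ldrh w1, [x2, #4]  ; scaled offset 2          so bail and keep looking
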
@@ -996,24 +1123,94 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  // If this is a volatile load/store, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (TII->isLdStPairSuppressed(MI))
+    return false;
+
+  // Look ahead up to ScanLimit instructions for a pairable instruction.
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+  if (Paired != E) {
+    if (isSmallTypeLdMerge(MI)) {
+      ++NumSmallTypeMerged;
+    } else {
+      ++NumPairCreated;
+      if (isUnscaledLdSt(MI))
+        ++NumUnscaledPairCreated;
+    }
+
+    // Merge the loads into a pair. Keeping the iterator straight is a
+    // pain, so we let the merge routine tell us what the next instruction
+    // is after it's done mucking about.
+    MBBI = mergePairedInsns(MBBI, Paired, Flags);
+    return true;
+  }
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
-  // Two transformations to do here:
-  // 1) Find loads and stores that can be merged into a single load or store
+  // Three transformations to do here:
+  // 1) Find halfword loads that can be merged into a single 32-bit word load
+  //    with bitfield extract instructions.
+  //      e.g.,
+  //        ldrh w0, [x2]
+  //        ldrh w1, [x2, #2]
+  //        ; becomes
+  //        ldr w0, [x2]
+  //        ubfx w1, w0, #16, #16
+  //        and w0, w0, #0xffff
+  // 2) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
   //      e.g.,
   //        ldr x0, [x2]
   //        ldr x1, [x2, #8]
   //        ; becomes
   //        ldp x0, x1, [x2]
-  // 2) Find base register updates that can be merged into the load or store
+  // 3) Find base register updates that can be merged into the load or store
   //    as a base-reg writeback.
   //      e.g.,
   //        ldr x0, [x2]
   //        add x2, x2, #4
   //        ; becomes
   //        ldr x0, [x2], #4
 
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       !IsStrictAlign && MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::LDRHHui:
+    // Unscaled instructions.
+    case AArch64::LDURHHi: {
+      if (tryToMergeLdStInst(MBBI)) {
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+      // FIXME: Do the other instructions.
+    }
+  }
+
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
        MBBI != E;) {
     MachineInstr *MI = MBBI;
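
Editor's note: the halfword merge runs as its own pre-pass over the block, and the loop header gates every iteration on !IsStrictAlign. The merged 32-bit load is only guaranteed the 2-byte alignment of the original halfwords, which is fine when the subtarget tolerates unaligned accesses but would fault under strict alignment (see the requiresStrictAlign() query in the final hunk). Illustration:

    // ldrh w0, [x2]     ; x2 may be only 2-byte aligned
    // ldrh w1, [x2, #2]
    // ; the merged form is a potentially unaligned 32-bit access:
    // ldr  w0, [x2]     ; illegal if the target requires strict alignment
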
@@ -1046,35 +1243,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDURWi:
     case AArch64::LDURXi:
     case AArch64::LDURSWi: {
-      // If this is a volatile load/store, don't mess with it.
-      if (MI->hasOrderedMemoryRef()) {
-        ++MBBI;
-        break;
-      }
-      // Make sure this is a reg+imm (as opposed to an address reloc).
-      if (!getLdStOffsetOp(MI).isImm()) {
-        ++MBBI;
-        break;
-      }
-      // Check if this load/store has a hint to avoid pair formation.
-      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
-      if (TII->isLdStPairSuppressed(MI)) {
-        ++MBBI;
-        break;
-      }
-      // Look ahead up to ScanLimit instructions for a pairable instruction.
-      LdStPairFlags Flags;
-      MachineBasicBlock::iterator Paired =
-          findMatchingInsn(MBBI, Flags, ScanLimit);
-      if (Paired != E) {
-        ++NumPairCreated;
-        if (isUnscaledLdSt(MI))
-          ++NumUnscaledPairCreated;
-
-        // Merge the loads into a pair. Keeping the iterator straight is a
-        // pain, so we let the merge routine tell us what the next instruction
-        // is after it's done mucking about.
-        MBBI = mergePairedInsns(MBBI, Paired, Flags);
+      if (tryToMergeLdStInst(MBBI)) {
         Modified = true;
         break;
       }
@@ -1206,6 +1375,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
   TRI = Fn.getSubtarget().getRegisterInfo();
+  IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
+                      .requiresStrictAlign();
 
   bool Modified = false;
   for (auto &MBB : Fn)