|
39 | 39 | #include "llvm/ADT/StringRef.h" |
40 | 40 | #include "llvm/Analysis/AliasAnalysis.h" |
41 | 41 | #include "llvm/Analysis/CmpInstAnalysis.h" |
| 42 | +#include "llvm/Analysis/HashRecognize.h" |
42 | 43 | #include "llvm/Analysis/LoopInfo.h" |
43 | 44 | #include "llvm/Analysis/LoopPass.h" |
44 | 45 | #include "llvm/Analysis/MemoryLocation.h" |
@@ -144,6 +145,14 @@ static cl::opt<bool, true> |
144 | 145 | cl::location(DisableLIRP::Wcslen), cl::init(false), |
145 | 146 | cl::ReallyHidden); |
146 | 147 |
|
| 148 | +bool DisableLIRP::HashRecognize; |
| 149 | +static cl::opt<bool, true> |
| 150 | + DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", |
| 151 | + cl::desc("Proceed with loop idiom recognize pass, " |
| 152 | + "but do not optimize CRC loops."), |
| 153 | + cl::location(DisableLIRP::HashRecognize), |
| 154 | + cl::init(false), cl::ReallyHidden); |
| 155 | + |
147 | 156 | static cl::opt<bool> UseLIRCodeSizeHeurs( |
148 | 157 | "use-lir-code-size-heurs", |
149 | 158 | cl::desc("Use loop idiom recognition code size heuristics when compiling " |
@@ -238,6 +247,7 @@ class LoopIdiomRecognize { |
238 | 247 | const SCEV *BECount); |
239 | 248 | bool avoidLIRForMultiBlockLoop(bool IsMemset = false, |
240 | 249 | bool IsLoopMemset = false); |
| 250 | + bool optimizeCRCLoop(const PolynomialInfo &Info); |
241 | 251 |
|
242 | 252 | /// @} |
243 | 253 | /// \name Noncountable Loop Idiom Handling |
@@ -283,6 +293,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, |
283 | 293 | // but ORE cannot be preserved (see comment before the pass definition). |
284 | 294 | OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); |
285 | 295 |
|
| 296 | + std::optional<PolynomialInfo> HR; |
| 297 | + |
286 | 298 | LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, |
287 | 299 | AR.MSSA, DL, ORE); |
288 | 300 | if (!LIR.runOnLoop(&L)) |
@@ -326,7 +338,7 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { |
326 | 338 | HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); |
327 | 339 | HasMemcpy = TLI->has(LibFunc_memcpy); |
328 | 340 |
|
329 | | - if (HasMemset || HasMemsetPattern || HasMemcpy) |
| 341 | + if (HasMemset || HasMemsetPattern || HasMemcpy || !DisableLIRP::HashRecognize) |
330 | 342 | if (SE->hasLoopInvariantBackedgeTakenCount(L)) |
331 | 343 | return runOnCountableLoop(); |
332 | 344 |
|
@@ -369,6 +381,12 @@ bool LoopIdiomRecognize::runOnCountableLoop() { |
369 | 381 |
|
370 | 382 | MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); |
371 | 383 | } |
| 384 | + |
| 385 | + // Optimize a CRC loop if HashRecognize found one. |
| 386 | + if (!DisableLIRP::HashRecognize) |
| 387 | + if (auto Res = HashRecognize(*CurLoop, *SE).getResult()) |
| 388 | + optimizeCRCLoop(*Res); |
| 389 | + |
372 | 390 | return MadeChange; |
373 | 391 | } |
374 | 392 |
|
@@ -1473,6 +1491,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, |
1473 | 1491 | return false; |
1474 | 1492 | } |
1475 | 1493 |
|
| 1494 | +bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { |
| 1495 | + // Rewrites the CRC loop described by Info into a byte-at-a-time Sarwate table-lookup loop. Returns false, leaving the IR untouched, when the target is Hexagon; returns true once the loop has been rewritten. FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using |
| 1496 | + // carry-less multiplication instructions, which is more efficient than our |
| 1497 | + // Sarwate table-lookup optimization. Hence, until we're able to emit |
| 1498 | + // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom, |
| 1499 | + // disable the optimization for Hexagon. |
| 1500 | + Module &M = *CurLoop->getHeader()->getModule(); |
| 1501 | + Triple TT(M.getTargetTriple()); |
| 1502 | + if (TT.getArch() == Triple::hexagon) |
| 1503 | + return false; |
| 1504 | + |
| 1505 | + // First, create a new GlobalVariable corresponding to the |
| 1506 | + // Sarwate-lookup-table: a private, constant, 256-entry array whose element type is the type of Info.LHS, named ".crctable". |
| 1507 | + Type *CRCTy = Info.LHS->getType(); |
| 1508 | + unsigned CRCBW = CRCTy->getIntegerBitWidth(); |
| 1509 | + std::array<Constant *, 256> CRCConstants; |
| 1510 | + transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped), |
| 1511 | + CRCConstants.begin(), |
| 1512 | + [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); }); |
| 1513 | + Constant *ConstArray = |
| 1514 | + ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants); |
| 1515 | + GlobalVariable *GV = |
| 1516 | + new GlobalVariable(M, ConstArray->getType(), true, |
| 1517 | + GlobalValue::PrivateLinkage, ConstArray, ".crctable"); |
| 1518 | + |
| 1519 | + PHINode *IV = CurLoop->getCanonicalInductionVariable(); |
| 1520 | + SmallVector<PHINode *, 2> Cleanup; |
| 1521 | + |
| 1522 | + // Next, mark all PHIs for removal except IV. Their uses are replaced with poison here; the PHIs themselves are only deleted in the cleanup step at the end of this function. |
| 1523 | + { |
| 1524 | + for (PHINode &PN : CurLoop->getHeader()->phis()) { |
| 1525 | + if (&PN == IV) |
| 1526 | + continue; |
| 1527 | + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); |
| 1528 | + Cleanup.push_back(&PN); |
| 1529 | + } |
| 1530 | + } |
| 1531 | + |
| 1532 | + // Next, fix up the trip count: the latch compare is replaced by a compare of IV against the new backedge-taken count, (Info.TripCount / 8) - 1. The predicate is NE when the latch branch's first successor is the latch block itself, and EQ otherwise. |
| 1533 | + { |
| 1534 | + unsigned NewBTC = (Info.TripCount / 8) - 1; |
| 1535 | + BasicBlock *LoopBlk = CurLoop->getLoopLatch(); |
| 1536 | + BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator()); |
| 1537 | + CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk |
| 1538 | + ? ICmpInst::Predicate::ICMP_NE |
| 1539 | + : ICmpInst::Predicate::ICMP_EQ; |
| 1540 | + Instruction *ExitCond = CurLoop->getLatchCmpInst(); |
| 1541 | + Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC); |
| 1542 | + IRBuilder<> Builder(ExitCond); |
| 1543 | + Value *NewExitCond = |
| 1544 | + Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond"); |
| 1545 | + ExitCond->replaceAllUsesWith(NewExitCond); |
| 1546 | + deleteDeadInstruction(ExitCond); |
| 1547 | + } |
| 1548 | + |
| 1549 | + // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all |
| 1550 | + // uses of ComputedValue. The LoByte helper keeps the low 8 bits of a value (a no-op when the value is 8 bits wide or narrower); the HiIdx helper shifts right by CRCBW - 8 (when CRCBW > 8) and then applies LoByte, i.e. it extracts bits [CRCBW - 8, CRCBW). |
| 1551 | + // |
| 1552 | + // Little-endian: |
| 1553 | + // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)] |
| 1554 | + // Big-Endian: |
| 1555 | + // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] |
| 1556 | + { |
| 1557 | + auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { |
| 1558 | + Type *OpTy = Op->getType(); |
| 1559 | + unsigned OpBW = OpTy->getIntegerBitWidth(); |
| 1560 | + return OpBW > 8 |
| 1561 | + ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) |
| 1562 | + : Op; |
| 1563 | + }; |
| 1564 | + auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, |
| 1565 | + const Twine &Name) { |
| 1566 | + Type *OpTy = Op->getType(); |
| 1567 | + |
| 1568 | + // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to |
| 1569 | + // use the CRC's bitwidth as the reference for shifting right. |
| 1570 | + return LoByte(Builder, |
| 1571 | + CRCBW > 8 ? Builder.CreateLShr( |
| 1572 | + Op, ConstantInt::get(OpTy, CRCBW - 8), Name) |
| 1573 | + : Op, |
| 1574 | + Name + ".lo.byte"); |
| 1575 | + }; |
| 1576 | + |
| 1577 | + IRBuilder<> Builder(CurLoop->getHeader(), |
| 1578 | + CurLoop->getHeader()->getFirstNonPHIIt()); |
| 1579 | + |
| 1580 | + // Create the CRC PHI, and initialize its incoming value to the initial |
| 1581 | + // value of CRC (Info.LHS, incoming from the loop preheader). The back-edge incoming value is added further below, once it has been built. |
| 1582 | + PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc"); |
| 1583 | + CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader()); |
| 1584 | + |
| 1585 | + // CRC is now an evolving variable, initialized to the PHI. |
| 1586 | + Value *CRC = CRCPhi; |
| 1587 | + |
| 1588 | + // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte |
| 1589 | + // of LHSAux), if LHSAux is non-nullptr. |
| 1590 | + Value *Indexer = CRC; |
| 1591 | + if (Value *Data = Info.LHSAux) { |
| 1592 | + Type *DataTy = Data->getType(); |
| 1593 | + |
| 1594 | + // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we |
| 1595 | + // shift right by that amount, and take the lo-byte (in the little-endian |
| 1596 | + // case), or shift left by that amount, and take the hi-idx (in the |
| 1597 | + // big-endian case). |
| 1598 | + Value *IVBits = Builder.CreateZExtOrTrunc( |
| 1599 | + Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer"); |
| 1600 | + Value *DataIndexer = |
| 1601 | + Info.ByteOrderSwapped |
| 1602 | + ? Builder.CreateShl(Data, IVBits, "data.indexer") |
| 1603 | + : Builder.CreateLShr(Data, IVBits, "data.indexer"); |
| 1604 | + Indexer = Builder.CreateXor( |
| 1605 | + DataIndexer, |
| 1606 | + Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"), |
| 1607 | + "crc.data.indexer"); |
| 1608 | + } |
| 1609 | + |
| 1610 | + Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi") |
| 1611 | + : LoByte(Builder, Indexer, "indexer.lo"); |
| 1612 | + |
| 1613 | + // Always index into a GEP using the index type. |
| 1614 | + Indexer = Builder.CreateZExt( |
| 1615 | + Indexer, SE->getDataLayout().getIndexType(GV->getType()), |
| 1616 | + "indexer.ext"); |
| 1617 | + |
| 1618 | + // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC]. |
| 1619 | + Value *CRCTableGEP = |
| 1620 | + Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd"); |
| 1621 | + Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld"); |
| 1622 | + |
| 1623 | + // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of |
| 1624 | + // CRC-8. |
| 1625 | + Value *CRCNext = CRCTableLd; |
| 1626 | + if (CRCBW > 8) { |
| 1627 | + Value *CRCShift = Info.ByteOrderSwapped |
| 1628 | + ? Builder.CreateShl(CRC, 8, "crc.be.shift") |
| 1629 | + : Builder.CreateLShr(CRC, 8, "crc.le.shift"); |
| 1630 | + CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next"); |
| 1631 | + } |
| 1632 | + |
| 1633 | + // Connect the back-edge for the loop, and RAUW the ComputedValue. Only uses of ComputedValue outside the latch block are redirected to CRCNext. |
| 1634 | + CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch()); |
| 1635 | + Info.ComputedValue->replaceUsesOutsideBlock(CRCNext, |
| 1636 | + CurLoop->getLoopLatch()); |
| 1637 | + } |
| 1638 | + |
| 1639 | + // Cleanup: delete the PHIs queued in Cleanup above, then have SE forget the loop now that it has been rewritten. |
| 1640 | + { |
| 1641 | + for (PHINode *PN : Cleanup) |
| 1642 | + RecursivelyDeleteDeadPHINode(PN); |
| 1643 | + SE->forgetLoop(CurLoop); |
| 1644 | + } |
| 1645 | + return true; |
| 1646 | +} |
| 1647 | + |
1476 | 1648 | bool LoopIdiomRecognize::runOnNoncountableLoop() { |
1477 | 1649 | LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" |
1478 | 1650 | << CurLoop->getHeader()->getParent()->getName() |
|
0 commit comments