-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[LoopIdiom] Use HashRecognize to optimize CRC #143208
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,7 @@ | |
| #include "llvm/ADT/StringRef.h" | ||
| #include "llvm/Analysis/AliasAnalysis.h" | ||
| #include "llvm/Analysis/CmpInstAnalysis.h" | ||
| #include "llvm/Analysis/HashRecognize.h" | ||
| #include "llvm/Analysis/LoopInfo.h" | ||
| #include "llvm/Analysis/LoopPass.h" | ||
| #include "llvm/Analysis/MemoryLocation.h" | ||
|
|
@@ -143,6 +144,14 @@ static cl::opt<bool, true> | |
| cl::location(DisableLIRP::Wcslen), cl::init(false), | ||
| cl::ReallyHidden); | ||
|
|
||
| bool DisableLIRP::HashRecognize; | ||
| static cl::opt<bool, true> | ||
| DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", | ||
| cl::desc("Proceed with loop idiom recognize pass, " | ||
| "but do not optimize CRC loops."), | ||
| cl::location(DisableLIRP::HashRecognize), | ||
| cl::init(false), cl::ReallyHidden); | ||
|
|
||
| static cl::opt<bool> UseLIRCodeSizeHeurs( | ||
| "use-lir-code-size-heurs", | ||
| cl::desc("Use loop idiom recognition code size heuristics when compiling " | ||
|
|
@@ -242,6 +251,7 @@ class LoopIdiomRecognize { | |
| const SCEV *BECount); | ||
| bool avoidLIRForMultiBlockLoop(bool IsMemset = false, | ||
| bool IsLoopMemset = false); | ||
| bool optimizeCRCLoop(const PolynomialInfo &Info); | ||
|
|
||
| /// @} | ||
| /// \name Noncountable Loop Idiom Handling | ||
|
|
@@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, | |
| // but ORE cannot be preserved (see comment before the pass definition). | ||
| OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); | ||
|
|
||
| std::optional<PolynomialInfo> HR; | ||
|
|
||
| LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, | ||
| AR.MSSA, DL, ORE); | ||
| if (!LIR.runOnLoop(&L)) | ||
|
|
@@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { | |
| HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); | ||
| HasMemcpy = TLI->has(LibFunc_memcpy); | ||
|
|
||
| if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) | ||
| if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || | ||
| HasMemcpy || !DisableLIRP::HashRecognize) | ||
| if (SE->hasLoopInvariantBackedgeTakenCount(L)) | ||
| return runOnCountableLoop(); | ||
|
|
||
|
|
@@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { | |
|
|
||
| MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); | ||
| } | ||
|
|
||
| // Optimize a CRC loop if HashRecognize found one, provided we're not | ||
| // optimizing for size. | ||
| if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics) | ||
| if (auto Res = HashRecognize(*CurLoop, *SE).getResult()) | ||
| optimizeCRCLoop(*Res); | ||
|
|
||
| return MadeChange; | ||
| } | ||
|
|
||
|
|
@@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, | |
| return false; | ||
| } | ||
|
|
||
| bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { | ||
| // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using | ||
artagnon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // carry-less multiplication instructions, which is more efficient than our | ||
| // Sarwate table-lookup optimization. Hence, until we're able to emit | ||
| // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom, | ||
| // disable the optimization for Hexagon. | ||
| Module &M = *CurLoop->getHeader()->getModule(); | ||
| Triple TT(M.getTargetTriple()); | ||
| if (TT.getArch() == Triple::hexagon) | ||
| return false; | ||
|
|
||
| // First, create a new GlobalVariable corresponding to the | ||
| // Sarwate-lookup-table. | ||
| Type *CRCTy = Info.LHS->getType(); | ||
| unsigned CRCBW = CRCTy->getIntegerBitWidth(); | ||
| std::array<Constant *, 256> CRCConstants; | ||
| transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped), | ||
| CRCConstants.begin(), | ||
| [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); }); | ||
| Constant *ConstArray = | ||
| ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants); | ||
| GlobalVariable *GV = | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we created two identical LUTs in two translation units, does ICF work during linking?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would expect Identical COMDAT folding to work on any two identical global variables, and I'm not sure what that has to do with the patch? Does it depend on some parameter of the GlobalVariable?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean by "identical" ? Constant arrays cannot be merged if the program compares array addresses. So I'd expect that's an option. Also, same-translation-unit case would be nice to test, in case someone implements CRC as a C macro.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, I'm not sure how we'd write a test for this: possibly an LLD test? |
||
| new GlobalVariable(M, ConstArray->getType(), true, | ||
| GlobalValue::PrivateLinkage, ConstArray, ".crctable"); | ||
|
|
||
| PHINode *IV = CurLoop->getCanonicalInductionVariable(); | ||
| SmallVector<PHINode *, 2> Cleanup; | ||
|
|
||
| // Next, mark all PHIs for removal except IV. | ||
| { | ||
| for (PHINode &PN : CurLoop->getHeader()->phis()) { | ||
| if (&PN == IV) | ||
| continue; | ||
| PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); | ||
| Cleanup.push_back(&PN); | ||
| } | ||
| } | ||
|
|
||
| // Next, fix up the trip count. | ||
| { | ||
| unsigned NewBTC = (Info.TripCount / 8) - 1; | ||
| BasicBlock *LoopBlk = CurLoop->getLoopLatch(); | ||
| BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator()); | ||
| CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk | ||
| ? ICmpInst::Predicate::ICMP_NE | ||
| : ICmpInst::Predicate::ICMP_EQ; | ||
| Instruction *ExitCond = CurLoop->getLatchCmpInst(); | ||
| Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC); | ||
| IRBuilder<> Builder(ExitCond); | ||
| Value *NewExitCond = | ||
| Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond"); | ||
| ExitCond->replaceAllUsesWith(NewExitCond); | ||
| deleteDeadInstruction(ExitCond); | ||
| } | ||
|
|
||
| // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all | ||
| // uses of ComputedValue. | ||
| // | ||
| // Little-endian: | ||
| // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)] | ||
| // Big-Endian: | ||
| // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] | ||
| { | ||
| auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { | ||
| Type *OpTy = Op->getType(); | ||
| unsigned OpBW = OpTy->getIntegerBitWidth(); | ||
| return OpBW > 8 | ||
| ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) | ||
| : Op; | ||
| }; | ||
| auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, | ||
| const Twine &Name) { | ||
| Type *OpTy = Op->getType(); | ||
|
|
||
| // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to | ||
| // use the CRC's bitwidth as the reference for shifting right. | ||
| return LoByte(Builder, | ||
| CRCBW > 8 ? Builder.CreateLShr( | ||
| Op, ConstantInt::get(OpTy, CRCBW - 8), Name) | ||
| : Op, | ||
| Name + ".lo.byte"); | ||
| }; | ||
|
|
||
| IRBuilder<> Builder(CurLoop->getHeader(), | ||
| CurLoop->getHeader()->getFirstNonPHIIt()); | ||
|
|
||
| // Create the CRC PHI, and initialize its incoming value to the initial | ||
| // value of CRC. | ||
| PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc"); | ||
| CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader()); | ||
|
|
||
| // CRC is now an evolving variable, initialized to the PHI. | ||
| Value *CRC = CRCPhi; | ||
|
|
||
| // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte | ||
| // of LHSAux), if LHSAux is non-nullptr. | ||
| Value *Indexer = CRC; | ||
| if (Value *Data = Info.LHSAux) { | ||
| Type *DataTy = Data->getType(); | ||
|
|
||
| // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we | ||
| // shift right by that amount, and take the lo-byte (in the little-endian | ||
| // case), or shift left by that amount, and take the hi-idx (in the | ||
| // big-endian case). | ||
| Value *IVBits = Builder.CreateZExtOrTrunc( | ||
| Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer"); | ||
| Value *DataIndexer = | ||
| Info.ByteOrderSwapped | ||
| ? Builder.CreateShl(Data, IVBits, "data.indexer") | ||
| : Builder.CreateLShr(Data, IVBits, "data.indexer"); | ||
| Indexer = Builder.CreateXor( | ||
| DataIndexer, | ||
| Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"), | ||
| "crc.data.indexer"); | ||
| } | ||
|
|
||
| Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi") | ||
| : LoByte(Builder, Indexer, "indexer.lo"); | ||
|
|
||
| // Always index into a GEP using the index type. | ||
| Indexer = Builder.CreateZExt( | ||
| Indexer, SE->getDataLayout().getIndexType(GV->getType()), | ||
| "indexer.ext"); | ||
|
|
||
| // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC]. | ||
| Value *CRCTableGEP = | ||
| Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd"); | ||
| Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld"); | ||
|
|
||
| // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of | ||
| // CRC-8. | ||
| Value *CRCNext = CRCTableLd; | ||
| if (CRCBW > 8) { | ||
| Value *CRCShift = Info.ByteOrderSwapped | ||
| ? Builder.CreateShl(CRC, 8, "crc.be.shift") | ||
| : Builder.CreateLShr(CRC, 8, "crc.le.shift"); | ||
| CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next"); | ||
| } | ||
|
|
||
| // Connect the back-edge for the loop, and RAUW the ComputedValue. | ||
| CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch()); | ||
| Info.ComputedValue->replaceUsesOutsideBlock(CRCNext, | ||
| CurLoop->getLoopLatch()); | ||
| } | ||
|
|
||
| // Cleanup. | ||
| { | ||
| for (PHINode *PN : Cleanup) | ||
| RecursivelyDeleteDeadPHINode(PN); | ||
| SE->forgetLoop(CurLoop); | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| bool LoopIdiomRecognize::runOnNoncountableLoop() { | ||
| LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" | ||
| << CurLoop->getHeader()->getParent()->getName() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alright, everything seems to be in order: will merge. Quick note for reviewers in case regressions are found: prefer flipping this cl::init to true while the regression is being investigated, over reverting the patch.