Skip to content
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions llvm/include/llvm/CodeGen/EVLIndVarSimplify.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//===- EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV-*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes a vectorized loop with canonical IV to using EVL-based
// IV if it was tail-folded by predicated EVL.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_EVLINDVARSIMPLIFY_H
#define LLVM_CODEGEN_EVLINDVARSIMPLIFY_H

#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/PassManager.h"

namespace llvm {
class Loop;
class LPMUpdater;
class Pass;

/// Turn vectorized loops with canonical induction variables into loops that
/// only use a single EVL-based induction variable.
struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> {
PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
};

Pass *createEVLIndVarSimplifyPass();
} // namespace llvm
#endif
1 change: 1 addition & 0 deletions llvm/include/llvm/InitializePasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ void initializeExpandReductionsPass(PassRegistry &);
void initializeExpandVariadicsPass(PassRegistry &);
void initializeExpandVectorPredicationPass(PassRegistry &);
void initializeExternalAAWrapperPassPass(PassRegistry &);
void initializeEVLIndVarSimplifyPass(PassRegistry &);
void initializeFEntryInserterPass(PassRegistry &);
void initializeFinalizeISelPass(PassRegistry &);
void initializeFinalizeMachineBundlesPass(PassRegistry &);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/CodeGen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ add_llvm_component_library(LLVMCodeGen
EarlyIfConversion.cpp
EdgeBundles.cpp
EHContGuardCatchret.cpp
EVLIndVarSimplify.cpp
ExecutionDomainFix.cpp
ExpandLargeDivRem.cpp
ExpandLargeFpConvert.cpp
Expand Down
271 changes: 271 additions & 0 deletions llvm/lib/CodeGen/EVLIndVarSimplify.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
//===------ EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes a vectorized loop with canonical IV to using EVL-based
// IV if it was tail-folded by predicated EVL.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"

#define DEBUG_TYPE "evl-iv-simplify"

using namespace llvm;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add an option to control that pass ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");

static cl::opt<bool> EnableEVLIndVarSimplify(
"enable-evl-indvar-simplify",
cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden,
cl::init(true));

namespace {
struct EVLIndVarSimplifyImpl {
ScalarEvolution &SE;

explicit EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR)
: SE(LAR.SE) {}

explicit EVLIndVarSimplifyImpl(ScalarEvolution &SE) : SE(SE) {}

// Returns true if modify the loop.
bool run(Loop &L);
};

struct EVLIndVarSimplify : public LoopPass {
static char ID;

EVLIndVarSimplify() : LoopPass(ID) {
initializeEVLIndVarSimplifyPass(*PassRegistry::getPassRegistry());
}

bool runOnLoop(Loop *L, LPPassManager &LPM) override;

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.setPreservesCFG();
}
};
} // anonymous namespace

static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
if (!Step)
return 0U;

// Looking for loops with IV step value in the form of `(<constant VF> x
// vscale)`.
if (auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
if (Mul->getNumOperands() == 2) {
const SCEV *LHS = Mul->getOperand(0);
const SCEV *RHS = Mul->getOperand(1);
if (auto *Const = dyn_cast<SCEVConstant>(LHS)) {
uint64_t V = Const->getAPInt().getLimitedValue();
if (isa<SCEVVScale>(RHS) && llvm::isUInt<32>(V))
return static_cast<uint32_t>(V);
}
}
}

// If not, see if the vscale_range of the parent function is a fixed value,
// which makes the step value to be replaced by a constant.
if (F.hasFnAttribute(Attribute::VScaleRange))
if (auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
APInt V = ConstStep->getAPInt().abs();
ConstantRange CR = llvm::getVScaleRange(&F, 64);
if (const APInt *Fixed = CR.getSingleElement()) {
V = V.zextOrTrunc(Fixed->getBitWidth());
uint64_t VF = V.udiv(*Fixed).getLimitedValue();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible the constant isn't evenly divisible by vscale?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a check here.

if (VF && llvm::isUInt<32>(VF) &&
// Make sure step is divisible by vscale.
V.urem(*Fixed).isZero())
return static_cast<uint32_t>(VF);
}
}

return 0U;
}

bool EVLIndVarSimplifyImpl::run(Loop &L) {
if (!EnableEVLIndVarSimplify)
return false;

InductionDescriptor IVD;
PHINode *IndVar = L.getInductionVariable(SE);
if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
<< "\n");
return false;
}

BasicBlock *InitBlock, *BackEdgeBlock;
if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
<< L.getName() << "\n");
return false;
}

// Retrieve the loop bounds.
std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using the getBounds API is a slightly weird choice here. getBounds doesn't use SCEV, and has a bunch of restrictions that SCEV does not. You're immediately turning around and converting the results to SCEV, so maybe just use that to begin with?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The SCEV of initial and final values are probably going to be gone, since they're only used in the ad-hoc check I wrote, which is subject to be replaced. Also, I don't quite understand what you mean by saying getBounds doesn't use SCEV, because judging from its code it's using ScalarEvolution to calculate the initial and final values. Could you elaborate a little more?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IVDescriptors has limited use of SCEV to identify addrecs. LoopBounds::getBounds works solely on IR, matching existing instructions. It doesn't produce the limit or start in terms of SCEV values.

if (!Bounds) {
LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
<< "\n");
return false;
}
Value *CanonicalIVInit = &Bounds->getInitialIVValue();
Value *CanonicalIVFinal = &Bounds->getFinalIVValue();

const SCEV *StepV = IVD.getStep();
uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
if (!VF) {
LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
<< "'\n");
return false;
}
LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName()
<< "\n");

// Try to find the EVL-based induction variable.
using namespace PatternMatch;
BasicBlock *BB = IndVar->getParent();

Value *EVLIndVar = nullptr;
Value *RemTC = nullptr;
auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
m_Value(RemTC), m_SpecificInt(VF),
/*Scalable=*/m_SpecificInt(1));
for (auto &PN : BB->phis()) {
if (&PN == IndVar)
continue;

// Check 1: it has to contain both incoming (init) & backedge blocks
// from IndVar.
if (PN.getBasicBlockIndex(InitBlock) < 0 ||
PN.getBasicBlockIndex(BackEdgeBlock) < 0)
continue;
// Check 2: EVL index is always increasing, thus its inital value has to be
// equal to either the initial IV value (when the canonical IV is also
// increasing) or the last IV value (when canonical IV is decreasing).
Value *Init = PN.getIncomingValueForBlock(InitBlock);
using Direction = Loop::LoopBounds::Direction;
switch (Bounds->getDirection()) {
case Direction::Increasing:
if (Init != CanonicalIVInit)
continue;
break;
case Direction::Decreasing:
if (Init != CanonicalIVFinal)
continue;
break;
case Direction::Unknown:
// To be more permissive and see if either the initial or final IV value
// matches PN's init value.
if (Init != CanonicalIVInit && Init != CanonicalIVFinal)
continue;
break;
}
Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
assert(RecValue);

LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
<< "\n");

// Check 3: Pattern match to find the EVL-based index.
if (match(RecValue,
m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
match(RemTC, m_Sub(m_Value(), m_Specific(&PN)))) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a nasty correctness problem lurking here.

Consider an EVL IV where the sub is M, and the canonical IV where BTC is in terms of N. N != M. In this case, the original loop would run a different number of iterations than the modified loop unless you can prove N and M are the same. (Well, actually that the BTC derived from both is the same, which is slightly different and maybe easier.)

Let's consider the case where M < N. This may or may not be correct. For a purely EVL vectorized loop, running the extra iterations had no effect. However, if there's any bit of scalar code in the loop with side effects (i.e. this isn't the output of the vectorizer), then we've dropped semantically visible side effects.

Two alternate approaches here (warning, these are quite different - we should probably brainstorm offline before you bother to implement either):

  • Proof of noop iterations - prove that once active.vector.length is zero, that the loop has no side effects and is not infinite. Thus trip count can be reduced. Unfortunately, this only proves an upper bound, you'd have to then somehow prove (via scev) that M >= N.
  • Directly teach SCEV how to compute the BTC of an EVL based IV, and then directly prove that the two BTCs are the same. This may be hard. (It can also subsume the option above.)

Honestly, at this point I'm leaning strongly towards just having the vectorizer do this transform during codegen. It knows by construction the two BTCs are equal and doesn't have to do this difficult proof.

Copy link
Member Author

@mshockwave mshockwave Nov 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

having the vectorizer do this transform during codegen. It knows by construction the two BTCs are equal

I actually have been testing an alternative solution that uses IR pattern matching to find the original trip count (similar to the very first version of this PR) but putting the Pass right after the LoopVectorizer. Based on the assumption that loop vectorizer will produce the desired "shape" of loops that we can easily extract the trip count / BTC without fear. But I guess even with this, we somehow need to make sure the LV produces the same BTCs for the two IVs just like you mentioned.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider an EVL IV where the sub is M, and the canonical IV where BTC is in terms of N. N != M. In this case, the original loop would run a different number of iterations than the modified loop unless you can prove N and M are the same

To my best understandings, due to how LoopVectorizer derives N and M from the original trip count, N and M will be the same in the code produced by LV. So with my alternative solution which run this Pass right after LV, I think we can assure that N and M as seen by this Pass will be the same.

EVLIndVar = RecValue;
break;
}
}

if (!EVLIndVar)
return false;

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check that TC is related to the original trip count calculation? Couldn't we construct an invalid loop that has a different trip count for the vsetvli than the countable trip count?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we can do this by comparing the TC's SCEV with canonical trip count's SCEV. But it turns out canonical trip count is not always SCEV-computable. So I only do such comparison when both TC and vscale are constants. Otherwise, I simply check if TC is used in the SCEV expressions of either the upper or lower bound of canonical IV.

const SCEV *BTC = SE.getBackedgeTakenCount(&L);
LLVM_DEBUG(dbgs() << "BTC: " << *BTC << "\n");
if (isa<SCEVCouldNotCompute>(BTC))
return false;

const SCEV *VFV = SE.getConstant(BTC->getType(), VF);
VFV = SE.getMulExpr(VFV, SE.getVScale(VFV->getType()));
const SCEV *ExitValV = SE.getMulExpr(BTC, VFV);
LLVM_DEBUG(dbgs() << "ExitVal: " << *ExitValV << "\n");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't ExitValV just CanonicalIVFinal - CanonicalIVInit - CanonicalIVStep by construction? Or alternatively, CanonicalIV->evaluateAtIteration(BTC)?

Note that the way you're computing it should be correct when combined with the matching code above, so this is purely a style/codegen point.


// Create an EVL-based comparison and replace the branch to use it as

This comment was marked as resolved.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the suggestion, I think IV->evaluateAtIteration(BTC) + EVLStep.evaluateAtIteration(BTC) is pretty neat.

I have one question here though:

First, terminology. EVLStep = min(VF, sub_nsw(EVL_TC, IV))

Is it supposed to be EVLStep = min(IV.Step, sub_nsw(EVL_TC, IV))?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it wasn't. The active_vector_length has the VF parameter. That may be the same as the step, but in principle doesn't have to be. Note that my (1) above (which your code already does) is proving VF=IV.Step, but that is a step you need to prove. It's not something you can assume.

Copy link
Member Author

@mshockwave mshockwave May 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there is another problem here: the backedge taken count cannot be computed when vscale is not constant. Namely, IV.Step is in the most commonly seen form of (VF x vscale). Which also causes a problem in another comment thread where you suggested avoid using getBounds, because I guess you were suggesting to get the final IV SCEV via something like IV.evaluateAtIteration(CanonicalTripCount), but CanonicalTripCount, just like backedge taken count, cannot be computed in this (i.e. non-constant vscale) case.

However, alternatively I think we can get exit value via this:

IV_BTC = IV_Final - IV.Step
Tail = TC - IV_BTC
Exit value = IV_BTC + Tail

IV_BTC as the name suggested, is the equivalence of IV->evaluateAtIteration(BTC) you suggested, but this time we calculate it from IV_Final, which is, well, final IV value returned from getBounds. Since it only uses IR pattern matching we don't have to worry about the aforementioned uncomputable problem.

What do you think?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

think there is another problem here: the backedge taken count cannot be computed when vscale is not constant. Namely, IV.Step is in the most commonly seen form of (VF x vscale).... just like backedge taken count, cannot be computed in this (i.e. non-constant vscale) case.

Ok, this seems weird. I don't know of any fundamental reason that SCEV shouldn't be able to figure out a trip count for a loop invariant step - even if non constant. Would you mind sharing a piece of IR which demonstrates this? I want to see if this is something easy to fix, or I'm forgetting some subtle interaction.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you mind sharing a piece of IR which demonstrates this? I want to see if this is something easy to fix, or I'm forgetting some subtle interaction.

I was using the first function (i.e. @simple) in the test for this PR (evl-iv-simplify.ll). I actually traced down to why it could not compute: the smoking gun is ScalarEvolution::howFarToZero, specifically, this line that actually bails out with SCEVCouldNotCompute. As the TODO comment right above it suggested, it couldn't handle non-constant step in AddRec for now.

Thanks!

That limitation actually looks pretty easy to at least partially remove. I'm currently a bit buried, but expect a patch in the next few days.

Thank you!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #94411

Though this is with two changes to your test:

  • adding vscale_range to the function
  • making the IV nuw

I think both should apply to your original (i.e. your test is slightly over-reduced), but please confirm.

Copy link
Member Author

@mshockwave mshockwave Jun 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #94411

Though this is with two changes to your test:

  • adding vscale_range to the function
  • making the IV nuw

I think both should apply to your original (i.e. your test is slightly over-reduced), but please confirm.

Thank you for the quick fix.
I gave it a try and found 1 minor and 1 major concern: the minor one is that vscale_range has to be set otherwise BTC cannot be computed. This is because when we're computing BTC, one of the preconditions is that the range of (VF x vscale) has to be non-zero, which depends on the range of vscale. Currently we use [1,0) ,a.k.a wrap around starting from 1, to represent the range of vscale when vscale_range is not set -- but this is an unsigned wrap around. However, when computing the multiplication range of (VF x vscale) this [1,0) is treated as signed wrap around, leading to (VF x vscale)'s range being full_set. Therefore when being asked the minimum unsigned value of (VF x vscale)'s range, it returns zero. I call this "minor" because I believe in practice vscale_range will mostly be set. But it's still a little annoyed to be that we cannot represent explicit unsigned range like [1, UINT_MAX] with ConstantRange.

The major concern being that the instructions for computing exit value expanded from the SCEV expressions we crafted is pretty verbose, long, a more importantly, contain division instructions. Let me start with the formula we have:

IV = the original canonical IV
EVL_TC = the trip count in the scalar loop
IV.Step = (VF x vscale)
EVLStep = umin(IV.Step, sub_nsw(EVL_TC, IV))
BTC = backedge taken count in the original canonical IV loop
ExitValue = IV.evaluateAtIteration(BTC) + EVLStep.evaluateAtIteration(BTC)

In your original formula EVLStep is min(VF, sub_nsw(EVL_TC, IV)), which I had a question on whether the first operand of min is VF or IV.Step. Because to my best understandings, VF is the same VF we put in the scalable vector type we use, (vscale x VF x ...), and also the VF used in computing step value in IV: (VF x vscale). So I think we should use IV.Step here. For the very least, if I use VF (as in my definition) here, the result will be pretty wrong.

Now, since only SCEVAddRec has evaluateAtIteration, we can't really do EVLStep.evaluateAtIteration. Instead, I rewrote EVLStep.evaluateAtIteration into:

umin(IV.Step, sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC))

Because IV.Step and EVL_TC are both loop invariant.
Using the @simple function in my test case + the same modifications you made in your comment above, as an example, this final IV.evaluateAtIteration(BTC) + umin(IV.Step, sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC)) looks like:

((4 * vscale * ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)) + (((-4 * vscale * ((-1 * vscale * (4 + (-4 * ((-1 + (4 * vscale)<nuw><nsw> + %N) /u (4 * vscale)<nuw><nsw>))<nsw>)<nsw>) /u (4 * vscale)<nuw><nsw>)) + %N) umin (4 * vscale)<nuw><nsw>))

And those divisions seen in the expressions will be expanded into division instructions. I'm concern whether we should spend so many instructions to compute something that will almost certain be EVL_TC.

I also run the same algorithm on the @fixed_iv_step_tc function in my test case, which has fixed IV.Step and fixed trip count. The generated exit value is correct, which means my methodology is probably not terribly off.

I've also tried to use sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC), which is the tail in the last iteration, in replacement of umin(IV.Step, sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC)). But the resulting Exit value, IV.evaluateAtIteration(BTC) + sub_nsw(EVL_TC, IV.evaluateAtIteration(BTC) is not really meaningful (it will certainly be EVL_TC though).

Another thing is that I'm still a little confused about how to do Step (1) you described: it's true that I already did such a check (line 212 ~ 223) -- but only on cases where both trip count and vscale are constant. For every other cases we only have SCEV expressions, which we cannot know it value during compile time. Even we expand the check into runtime check, what should we do if the check fails during runtime? Do we fall back the original canonical IV?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #94411
Though this is with two changes to your test:

  • adding vscale_range to the function
  • making the IV nuw

I think both should apply to your original (i.e. your test is slightly over-reduced), but please confirm.

Thank you for the quick fix. I gave it a try and found 1 minor and 1 major concern: the minor one is that vscale_range has to be set otherwise BTC cannot be computed. This is because when we're computing BTC, one of the preconditions is that the range of (VF x vscale) has to be non-zero, which depends on the range of vscale. Currently we use [1,0) ,a.k.a wrap around starting from 1, to represent the range of vscale when vscale_range is not set -- but this is an unsigned wrap around. However, when computing the multiplication range of (VF x vscale) this [1,0) is treated as signed wrap around, leading to (VF x vscale)'s range being full_set. Therefore when being asked the minimum unsigned value of (VF x vscale)'s range, it returns zero. I call this "minor" because I believe in practice vscale_range will mostly be set. But it's still a little annoyed to be that we cannot represent explicit unsigned range like [1, UINT_MAX] with ConstantRange.

Clang emits the vscale_range by default. vscale is bounded by the values of possible VLEN, and VLEN has a specific upper bound in the RISCV vector specification. So while annoying that SCEV can't just ask the backend for this, this is a non-problem for any real c/c++ input.

For the rest of your response, one quick comment - the divisions are by 4 * vscale. We know that vscale is a power of two by definition, so these are right shifts. You do need a popcount to know which power of two, but we could reasonable emit one popcount and a bunch of shifts. (Not saying we do so today - I'm saying it's possible.)

For the rest, I feel we are talking past each other - and frankly, I'm having trouble following the detail in github PR interface - can I suggest we setup a brief phone call to chat this through instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the rest, I feel we are talking past each other - and frankly, I'm having trouble following the detail in github PR interface - can I suggest we setup a brief phone call to chat this through instead?

That will be really helpful! I'll send you an email to your "public" email address to coordinate this.

// predicate.
ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
const DataLayout &DL = L.getHeader()->getDataLayout();
SCEVExpander Expander(SE, DL, "evl.iv.exitcondition");
if (!Expander.isSafeToExpandAt(ExitValV, OrigLatchCmp))
return false;

LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");

Value *ExitVal =
Expander.expandCodeFor(ExitValV, EVLIndVar->getType(), OrigLatchCmp);
IRBuilder<> Builder(OrigLatchCmp);
auto *NewPred = Builder.CreateICmp(ICmpInst::ICMP_UGT, EVLIndVar, ExitVal);
OrigLatchCmp->replaceAllUsesWith(NewPred);

// llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are
// not used outside the cycles. However, in this case the now-RAUW-ed
// OrigLatchCmp will be consied a use outside the cycle while in reality it's
// practically dead. Thus we need to remove it before calling
// RecursivelyDeleteDeadPHINode.
(void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp);
if (llvm::RecursivelyDeleteDeadPHINode(IndVar))
LLVM_DEBUG(dbgs() << "Removed original IndVar\n");

++NumEliminatedCanonicalIV;

return true;
}

PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
if (EVLIndVarSimplifyImpl(AR).run(L))
return PreservedAnalyses::allInSet<CFGAnalyses>();
return PreservedAnalyses::all();
}

char EVLIndVarSimplify::ID = 0;

INITIALIZE_PASS_BEGIN(EVLIndVarSimplify, DEBUG_TYPE,
"EVL-based Induction Variables Simplify", false, false)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(EVLIndVarSimplify, DEBUG_TYPE,
"EVL-based Induction Variables Simplify", false, false)

bool EVLIndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;

auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check skipLoop here for opt-bisect?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

return EVLIndVarSimplifyImpl(SE).run(*L);
}

Pass *llvm::createEVLIndVarSimplifyPass() { return new EVLIndVarSimplify(); }
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/DwarfEHPrepare.h"
#include "llvm/CodeGen/EarlyIfConversion.h"
#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/CodeGen/ExpandLargeDivRem.h"
#include "llvm/CodeGen/ExpandLargeFpConvert.h"
#include "llvm/CodeGen/ExpandMemCmp.h"
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Passes/OptimizationLevel.h"
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch",
#endif
LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
LOOP_PASS("dot-ddg", DDGDotPrinterPass())
LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass())
LOOP_PASS("guard-widening", GuardWideningPass())
LOOP_PASS("indvars", IndVarSimplifyPass())
LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/EVLIndVarSimplify.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
Expand Down Expand Up @@ -446,6 +447,9 @@ void RISCVPassConfig::addIRPasses() {
}

TargetPassConfig::addIRPasses();

if (getOptLevel() != CodeGenOptLevel::None)
addPass(createEVLIndVarSimplifyPass());
}

bool RISCVPassConfig::addPreISel() {
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/RISCV/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Loop Pass Manager
; CHECK-NEXT: EVL-based Induction Variables Simplify
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is possible to get this into the same loop pass manager as LSR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC in order to do that we need to put EVLIndVarSimplify side-by-side with LSR, but I feel like we can make good uses of canonical form loops even between LSR and CodeGenPrepare. Do you think that will be the case?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure. LSR itself already does some things that make loops harder to analyze. Though it doesn't make them not countable.

; CHECK-NEXT: Type Promotion
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
Expand Down
Loading
Loading