From c61d4f26dac28b98622d395f61e90ec1f4a87bf2 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 24 Nov 2025 16:09:35 +0800 Subject: [PATCH 1/4] Precommit tests --- .../InstCombine/get_vector_length.ll | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/get_vector_length.ll diff --git a/llvm/test/Transforms/InstCombine/get_vector_length.ll b/llvm/test/Transforms/InstCombine/get_vector_length.ll new file mode 100644 index 0000000000000..2925d6f556988 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/get_vector_length.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instcombine,verify -S | FileCheck %s + +define i32 @cnt_known_lt() { +; CHECK-LABEL: define i32 @cnt_known_lt() { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 1, i32 2, i1 false) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.experimental.get.vector.length(i32 1, i32 2, i1 false) + ret i32 %x +} + +define i32 @cnt_not_known_lt() { +; CHECK-LABEL: define i32 @cnt_not_known_lt() { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 false) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 false) + ret i32 %x +} + +define i32 @cnt_known_lt_scalable() vscale_range(2, 4) { +; CHECK-LABEL: define i32 @cnt_known_lt_scalable( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true) + ret i32 %x +} + +define i32 @cnt_not_known_lt_scalable() { +; CHECK-LABEL: define i32 @cnt_not_known_lt_scalable() { +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 
@llvm.experimental.get.vector.length(i32 2, i32 1, i1 true) + ret i32 %x +} + +define i32 @cnt_known_lt_runtime(i32 %x) { +; CHECK-LABEL: define i32 @cnt_known_lt_runtime( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[X]], 4 +; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]]) +; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 3, i1 false) +; CHECK-NEXT: ret i32 [[Y]] +; + %icmp = icmp ule i32 %x, 3 + call void @llvm.assume(i1 %icmp) + %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 3, i1 false) + ret i32 %y +} + +define i32 @cnt_known_lt_runtime_trunc(i64 %x) { +; CHECK-LABEL: define i32 @cnt_known_lt_runtime_trunc( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[X]], 4 +; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]]) +; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[X]], i32 3, i1 false) +; CHECK-NEXT: ret i32 [[Y]] +; + %icmp = icmp ule i64 %x, 3 + call void @llvm.assume(i1 %icmp) + %y = call i32 @llvm.experimental.get.vector.length(i64 %x, i32 3, i1 false) + ret i32 %y +} + +; FIXME: We should be able to deduce the constant range from AssumptionCache +; rather than relying on KnownBits, which in this case only knows x <= 3. 
+define i32 @cnt_known_lt_runtime_assumption(i32 %x) { +; CHECK-LABEL: define i32 @cnt_known_lt_runtime_assumption( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[X]], 3 +; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]]) +; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 2, i1 false) +; CHECK-NEXT: ret i32 [[Y]] +; + %icmp = icmp ule i32 %x, 2 + call void @llvm.assume(i1 %icmp) + %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 2, i1 false) + ret i32 %y +} From 4da2081d9743a9ae36ed3a93b04353930b1ad1ff Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 24 Nov 2025 16:16:01 +0800 Subject: [PATCH 2/4] [InstCombine] Fold @llvm.experimental.get.vector.length when cnt <= max_lanes On RISC-V, some loops that the loop vectorizer vectorizes pre-LTO may turn out to have the exact trip count exposed after LTO, see #164762. If the trip count is small enough we can fold away the @llvm.experimental.get.vector.length intrinsic based on this corollary from the LangRef: > If %cnt is less than or equal to %max_lanes, the return value is equal to %cnt. This on its own doesn't remove the @llvm.experimental.get.vector.length in #164762 since we also need to teach computeKnownBits about @llvm.experimental.get.vector.length and the sub recurrence, but this PR is a starting point. I've added this in InstCombine rather than InstSimplify since we may need to insert a truncation (@llvm.experimental.get.vector.length can take an i64 %cnt argument, but always truncates the result to i32). Note that there was something similar done in VPlan in #167647 for when the loop vectorizer knows the trip count. 
--- .../Transforms/InstCombine/InstCombineCalls.cpp | 16 ++++++++++++++++ .../Transforms/InstCombine/get_vector_length.ll | 11 ++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8e4edefec42fd..247f615ed0b54 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4005,6 +4005,22 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::experimental_get_vector_length: { + // get.vector.length(Cnt, MaxLanes) --> Cnt when Cnt <= MaxLanes + ConstantRange Cnt = computeConstantRangeIncludingKnownBits( + II->getArgOperand(0), false, SQ.getWithInstruction(II)); + ConstantRange MaxLanes = cast<ConstantInt>(II->getArgOperand(1)) + ->getValue() + .zext(Cnt.getBitWidth()); + if (cast<ConstantInt>(II->getArgOperand(2))->getZExtValue()) + MaxLanes = MaxLanes.multiply( + getVScaleRange(II->getFunction(), Cnt.getBitWidth())); + + if (Cnt.icmp(CmpInst::ICMP_ULE, MaxLanes)) + return replaceInstUsesWith( + *II, Builder.CreateTrunc(II->getArgOperand(0), II->getType())); + return nullptr; + } default: { // Handle target specific intrinsics std::optional V = targetInstCombineIntrinsic(*II); diff --git a/llvm/test/Transforms/InstCombine/get_vector_length.ll b/llvm/test/Transforms/InstCombine/get_vector_length.ll index 2925d6f556988..96a7f3058c43c 100644 --- a/llvm/test/Transforms/InstCombine/get_vector_length.ll +++ b/llvm/test/Transforms/InstCombine/get_vector_length.ll @@ -3,8 +3,7 @@ define i32 @cnt_known_lt() { ; CHECK-LABEL: define i32 @cnt_known_lt() { -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 1, i32 2, i1 false) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 1 ; %x = call i32 @llvm.experimental.get.vector.length(i32 1, i32 2, i1 false) ret i32 %x @@ -22,8 +21,7 @@ define i32 @cnt_not_known_lt() { define i32 
@cnt_known_lt_scalable() vscale_range(2, 4) { ; CHECK-LABEL: define i32 @cnt_known_lt_scalable( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true) -; CHECK-NEXT: ret i32 [[X]] +; CHECK-NEXT: ret i32 2 ; %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true) ret i32 %x @@ -43,8 +41,7 @@ define i32 @cnt_known_lt_runtime(i32 %x) { ; CHECK-SAME: i32 [[X:%.*]]) { ; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[X]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]]) -; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 3, i1 false) -; CHECK-NEXT: ret i32 [[Y]] +; CHECK-NEXT: ret i32 [[X]] ; %icmp = icmp ule i32 %x, 3 call void @llvm.assume(i1 %icmp) @@ -57,7 +54,7 @@ define i32 @cnt_known_lt_runtime_trunc(i64 %x) { ; CHECK-SAME: i64 [[X:%.*]]) { ; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[X]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[ICMP]]) -; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[X]], i32 3, i1 false) +; CHECK-NEXT: [[Y:%.*]] = trunc nuw nsw i64 [[X]] to i32 ; CHECK-NEXT: ret i32 [[Y]] ; %icmp = icmp ule i64 %x, 3 From 2f71e8a38242da1740c8820d659e0a4ea95e5f2f Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 25 Nov 2025 18:21:51 +0800 Subject: [PATCH 3/4] Use CreateZExtOrTrunc, use isOne() --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 247f615ed0b54..a576dc42f43b9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4012,13 +4012,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { ConstantRange MaxLanes = cast<ConstantInt>(II->getArgOperand(1)) ->getValue() .zext(Cnt.getBitWidth()); - if 
(cast<ConstantInt>(II->getArgOperand(2))->getZExtValue()) + if (cast<ConstantInt>(II->getArgOperand(2))->isOne()) MaxLanes = MaxLanes.multiply( getVScaleRange(II->getFunction(), Cnt.getBitWidth())); if (Cnt.icmp(CmpInst::ICMP_ULE, MaxLanes)) return replaceInstUsesWith( - *II, Builder.CreateTrunc(II->getArgOperand(0), II->getType())); + *II, Builder.CreateZExtOrTrunc(II->getArgOperand(0), II->getType())); return nullptr; } default: { From db24622342e6abf7e15672aca9fe7bb8658c81c5 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 26 Nov 2025 16:15:12 +0800 Subject: [PATCH 4/4] Perform in larger of two bitwidths --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 11 ++++++++--- llvm/test/Transforms/InstCombine/get_vector_length.ll | 9 +++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a576dc42f43b9..ad6407fba81da 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4007,11 +4007,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } case Intrinsic::experimental_get_vector_length: { // get.vector.length(Cnt, MaxLanes) --> Cnt when Cnt <= MaxLanes - ConstantRange Cnt = computeConstantRangeIncludingKnownBits( - II->getArgOperand(0), false, SQ.getWithInstruction(II)); + unsigned BitWidth = + std::max(II->getArgOperand(0)->getType()->getScalarSizeInBits(), + II->getType()->getScalarSizeInBits()); + ConstantRange Cnt = + computeConstantRangeIncludingKnownBits(II->getArgOperand(0), false, + SQ.getWithInstruction(II)) + .zextOrTrunc(BitWidth); ConstantRange MaxLanes = cast<ConstantInt>(II->getArgOperand(1)) ->getValue() - .zext(Cnt.getBitWidth()); + .zextOrTrunc(Cnt.getBitWidth()); if (cast<ConstantInt>(II->getArgOperand(2))->isOne()) MaxLanes = MaxLanes.multiply( getVScaleRange(II->getFunction(), Cnt.getBitWidth())); diff --git a/llvm/test/Transforms/InstCombine/get_vector_length.ll 
b/llvm/test/Transforms/InstCombine/get_vector_length.ll index 96a7f3058c43c..122beeae866f3 100644 --- a/llvm/test/Transforms/InstCombine/get_vector_length.ll +++ b/llvm/test/Transforms/InstCombine/get_vector_length.ll @@ -78,3 +78,12 @@ define i32 @cnt_known_lt_runtime_assumption(i32 %x) { %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 2, i1 false) ret i32 %y } + + +define i32 @cnt_known_lt_i16() { +; CHECK-LABEL: define i32 @cnt_known_lt_i16() { +; CHECK-NEXT: ret i32 1 +; + %x = call i32 @llvm.experimental.get.vector.length(i16 1, i32 2, i1 false) + ret i32 %x +}