From c61d4f26dac28b98622d395f61e90ec1f4a87bf2 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Mon, 24 Nov 2025 16:09:35 +0800
Subject: [PATCH 1/4] Precommit tests

---
 .../InstCombine/get_vector_length.ll          | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/get_vector_length.ll

diff --git a/llvm/test/Transforms/InstCombine/get_vector_length.ll b/llvm/test/Transforms/InstCombine/get_vector_length.ll
new file mode 100644
index 0000000000000..2925d6f556988
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/get_vector_length.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine,verify -S | FileCheck %s
+
+define i32 @cnt_known_lt() {
+; CHECK-LABEL: define i32 @cnt_known_lt() {
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 1, i32 2, i1 false)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.get.vector.length(i32 1, i32 2, i1 false)
+  ret i32 %x
+}
+
+define i32 @cnt_not_known_lt() {
+; CHECK-LABEL: define i32 @cnt_not_known_lt() {
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 false)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 false)
+  ret i32 %x
+}
+
+define i32 @cnt_known_lt_scalable() vscale_range(2, 4) {
+; CHECK-LABEL: define i32 @cnt_known_lt_scalable(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true)
+  ret i32 %x
+}
+
+define i32 @cnt_not_known_lt_scalable() {
+; CHECK-LABEL: define i32 @cnt_not_known_lt_scalable() {
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true)
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true)
+  ret i32 %x
+}
+
+define i32 @cnt_known_lt_runtime(i32 %x) {
+; CHECK-LABEL: define i32 @cnt_known_lt_runtime(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i32 [[X]], 4
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ICMP]])
+; CHECK-NEXT:    [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 3, i1 false)
+; CHECK-NEXT:    ret i32 [[Y]]
+;
+  %icmp = icmp ule i32 %x, 3
+  call void @llvm.assume(i1 %icmp)
+  %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 3, i1 false)
+  ret i32 %y
+}
+
+define i32 @cnt_known_lt_runtime_trunc(i64 %x) {
+; CHECK-LABEL: define i32 @cnt_known_lt_runtime_trunc(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[X]], 4
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ICMP]])
+; CHECK-NEXT:    [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[X]], i32 3, i1 false)
+; CHECK-NEXT:    ret i32 [[Y]]
+;
+  %icmp = icmp ule i64 %x, 3
+  call void @llvm.assume(i1 %icmp)
+  %y = call i32 @llvm.experimental.get.vector.length(i64 %x, i32 3, i1 false)
+  ret i32 %y
+}
+
+; FIXME: We should be able to deduce the constant range from AssumptionCache
+; rather than relying on KnownBits, which in this case only knows x <= 3.
+define i32 @cnt_known_lt_runtime_assumption(i32 %x) {
+; CHECK-LABEL: define i32 @cnt_known_lt_runtime_assumption(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i32 [[X]], 3
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ICMP]])
+; CHECK-NEXT:    [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 2, i1 false)
+; CHECK-NEXT:    ret i32 [[Y]]
+;
+  %icmp = icmp ule i32 %x, 2
+  call void @llvm.assume(i1 %icmp)
+  %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 2, i1 false)
+  ret i32 %y
+}

From 4da2081d9743a9ae36ed3a93b04353930b1ad1ff Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Mon, 24 Nov 2025 16:16:01 +0800
Subject: [PATCH 2/4] [InstCombine] Fold @llvm.experimental.get.vector.length
 when cnt <= max_lanes

On RISC-V, some loops that the loop vectorizer vectorizes pre-LTO may turn out to have the exact trip count exposed after LTO; see #164762.

If the trip count is small enough we can fold away the @llvm.experimental.get.vector.length intrinsic based on this corollary from the LangRef:

> If %cnt is less than or equal to %max_lanes, the return value is equal to %cnt.

This on its own doesn't remove the @llvm.experimental.get.vector.length in #164762 since we also need to teach computeKnownBits about @llvm.experimental.get.vector.length and the sub recurrence, but this PR is a starting point.

I've added this in InstCombine rather than InstSimplify since we may need to insert a truncation (@llvm.experimental.get.vector.length can take an i64 %cnt argument, but always truncates the result to i32).

Note that there was something similar done in VPlan in #167647 for when the loop vectorizer knows the trip count.
---
 .../Transforms/InstCombine/InstCombineCalls.cpp  | 16 ++++++++++++++++
 .../Transforms/InstCombine/get_vector_length.ll  | 11 ++++-------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8e4edefec42fd..247f615ed0b54 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4005,6 +4005,22 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    // get.vector.length(Cnt, MaxLanes) --> Cnt when Cnt <= MaxLanes
+    ConstantRange Cnt = computeConstantRangeIncludingKnownBits(
+        II->getArgOperand(0), false, SQ.getWithInstruction(II));
+    ConstantRange MaxLanes = cast<ConstantInt>(II->getArgOperand(1))
+                                 ->getValue()
+                                 .zext(Cnt.getBitWidth());
+    if (cast<ConstantInt>(II->getArgOperand(2))->getZExtValue())
+      MaxLanes = MaxLanes.multiply(
+          getVScaleRange(II->getFunction(), Cnt.getBitWidth()));
+
+    if (Cnt.icmp(CmpInst::ICMP_ULE, MaxLanes))
+      return replaceInstUsesWith(
+          *II, Builder.CreateTrunc(II->getArgOperand(0), II->getType()));
+    return nullptr;
+  }
   default: {
     // Handle target specific intrinsics
     std::optional<Instruction *> V = targetInstCombineIntrinsic(*II);
diff --git a/llvm/test/Transforms/InstCombine/get_vector_length.ll b/llvm/test/Transforms/InstCombine/get_vector_length.ll
index 2925d6f556988..96a7f3058c43c 100644
--- a/llvm/test/Transforms/InstCombine/get_vector_length.ll
+++ b/llvm/test/Transforms/InstCombine/get_vector_length.ll
@@ -3,8 +3,7 @@
 
 define i32 @cnt_known_lt() {
 ; CHECK-LABEL: define i32 @cnt_known_lt() {
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 1, i32 2, i1 false)
-; CHECK-NEXT:    ret i32 [[X]]
+; CHECK-NEXT:    ret i32 1
 ;
   %x = call i32 @llvm.experimental.get.vector.length(i32 1, i32 2, i1 false)
   ret i32 %x
@@ -22,8 +21,7 @@ define i32 @cnt_not_known_lt() {
 define i32 @cnt_known_lt_scalable() vscale_range(2, 4) {
 ; CHECK-LABEL: define i32 @cnt_known_lt_scalable(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 2, i32 1, i1 true)
-; CHECK-NEXT:    ret i32 [[X]]
+; CHECK-NEXT:    ret i32 2
 ;
   %x = call i32 @llvm.experimental.get.vector.length(i32 2, i32 1, i1 true)
   ret i32 %x
@@ -43,8 +41,7 @@ define i32 @cnt_known_lt_runtime(i32 %x) {
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i32 [[X]], 4
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ICMP]])
-; CHECK-NEXT:    [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[X]], i32 3, i1 false)
-; CHECK-NEXT:    ret i32 [[Y]]
+; CHECK-NEXT:    ret i32 [[X]]
 ;
   %icmp = icmp ule i32 %x, 3
   call void @llvm.assume(i1 %icmp)
@@ -57,7 +54,7 @@ define i32 @cnt_known_lt_runtime_trunc(i64 %x) {
 ; CHECK-SAME: i64 [[X:%.*]]) {
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[X]], 4
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[ICMP]])
-; CHECK-NEXT:    [[Y:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[X]], i32 3, i1 false)
+; CHECK-NEXT:    [[Y:%.*]] = trunc nuw nsw i64 [[X]] to i32
 ; CHECK-NEXT:    ret i32 [[Y]]
 ;
   %icmp = icmp ule i64 %x, 3

From 2f71e8a38242da1740c8820d659e0a4ea95e5f2f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Tue, 25 Nov 2025 18:21:51 +0800
Subject: [PATCH 3/4] Use CreateZExtOrTrunc, use isOne()

---
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 247f615ed0b54..a576dc42f43b9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4012,13 +4012,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     ConstantRange MaxLanes = cast<ConstantInt>(II->getArgOperand(1))
                                  ->getValue()
                                  .zext(Cnt.getBitWidth());
-    if (cast<ConstantInt>(II->getArgOperand(2))->getZExtValue())
+    if (cast<ConstantInt>(II->getArgOperand(2))->isOne())
       MaxLanes = MaxLanes.multiply(
           getVScaleRange(II->getFunction(), Cnt.getBitWidth()));
 
     if (Cnt.icmp(CmpInst::ICMP_ULE, MaxLanes))
       return replaceInstUsesWith(
-          *II, Builder.CreateTrunc(II->getArgOperand(0), II->getType()));
+          *II, Builder.CreateZExtOrTrunc(II->getArgOperand(0), II->getType()));
     return nullptr;
   }
   default: {

From db24622342e6abf7e15672aca9fe7bb8658c81c5 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 26 Nov 2025 16:15:12 +0800
Subject: [PATCH 4/4] Perform comparison in the larger of the two bitwidths

---
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp  | 11 ++++++++---
 llvm/test/Transforms/InstCombine/get_vector_length.ll |  9 +++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a576dc42f43b9..ad6407fba81da 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4007,11 +4007,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   }
   case Intrinsic::experimental_get_vector_length: {
     // get.vector.length(Cnt, MaxLanes) --> Cnt when Cnt <= MaxLanes
-    ConstantRange Cnt = computeConstantRangeIncludingKnownBits(
-        II->getArgOperand(0), false, SQ.getWithInstruction(II));
+    unsigned BitWidth =
+        std::max(II->getArgOperand(0)->getType()->getScalarSizeInBits(),
+                 II->getType()->getScalarSizeInBits());
+    ConstantRange Cnt =
+        computeConstantRangeIncludingKnownBits(II->getArgOperand(0), false,
+                                               SQ.getWithInstruction(II))
+            .zextOrTrunc(BitWidth);
     ConstantRange MaxLanes = cast<ConstantInt>(II->getArgOperand(1))
                                  ->getValue()
-                                 .zext(Cnt.getBitWidth());
+                                 .zextOrTrunc(Cnt.getBitWidth());
     if (cast<ConstantInt>(II->getArgOperand(2))->isOne())
       MaxLanes = MaxLanes.multiply(
           getVScaleRange(II->getFunction(), Cnt.getBitWidth()));
diff --git a/llvm/test/Transforms/InstCombine/get_vector_length.ll b/llvm/test/Transforms/InstCombine/get_vector_length.ll
index 96a7f3058c43c..122beeae866f3 100644
--- a/llvm/test/Transforms/InstCombine/get_vector_length.ll
+++ b/llvm/test/Transforms/InstCombine/get_vector_length.ll
@@ -78,3 +78,12 @@ define i32 @cnt_known_lt_runtime_assumption(i32 %x) {
   %y = call i32 @llvm.experimental.get.vector.length(i32 %x, i32 2, i1 false)
   ret i32 %y
 }
+
+
+define i32 @cnt_known_lt_i16() {
+; CHECK-LABEL: define i32 @cnt_known_lt_i16() {
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.get.vector.length(i16 1, i32 2, i1 false)
+  ret i32 %x
+}