From 1fd7c1bbaea97d59452233e583d0d3c10298ffe6 Mon Sep 17 00:00:00 2001 From: Steve Suzuki Date: Mon, 24 Nov 2025 10:07:59 +0000 Subject: [PATCH 1/6] Add helpers for shuffle operations of scalable vector --- src/CodeGen_LLVM.cpp | 156 +++++++++++++++++++++++++++++++++++++++++++ src/CodeGen_LLVM.h | 17 +++++ 2 files changed, 173 insertions(+) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4a5b45475533..746559e9b3a6 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4868,6 +4868,10 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { + if (effective_vscale > 0 && is_scalable_vector(vec)) { + return slice_scalable_vector(vec, start, size); + } + // Force the arg to be an actual vector if (!vec->getType()->isVectorTy()) { vec = create_broadcast(vec, 1); @@ -4931,6 +4935,10 @@ Value *CodeGen_LLVM::concat_vectors(const vector &v) { internal_assert(!v.empty()); + if (effective_vscale > 0 && is_scalable_vector(v[0])) { + return concat_scalable_vectors(v); + } + vector vecs = v; // Force them all to be actual vectors @@ -4990,6 +4998,147 @@ Value *CodeGen_LLVM::concat_vectors(const vector &v) { return vecs[0]; } +Value *CodeGen_LLVM::concat_scalable_vectors(const vector &vecs) { + internal_assert(effective_vscale > 0 && is_scalable_vector(vecs[0])); + int total_lanes = 0; + for (auto* v: vecs) { + total_lanes += get_vector_num_elements(v->getType()); + } + + llvm::Type *concat_type = get_vector_type(get_vector_element_type(vecs[0]->getType()), total_lanes); + Value *ret = UndefValue::get(concat_type); + int insert_index = 0; + for (auto* v: vecs) { + ret = insert_scalable_vector(ret, v, insert_index); + insert_index += get_vector_num_elements(v->getType()); + } + return ret; +} + +Value *CodeGen_LLVM::slice_scalable_vector(llvm::Value *vec, int start, int slice_size) { + const int vec_lanes = get_vector_num_elements(vec->getType()); + if (slice_size == 1) { + return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true)); + } else if (start == 0) { + if (vec_lanes == slice_size) { + return vec; + } else if (vec_lanes < slice_size) { + return insert_scalable_vector(UndefValue::get(get_vector_type(vec, slice_size)), vec, 0); + } else { + auto *dst_type = get_vector_type(vec, slice_size); + Value *val_index = ConstantInt::get(i64_t, 0, true); + return builder->CreateExtractVector(dst_type, vec, val_index); + } + } else { + const int extract_size = std::min(vec_lanes - start, slice_size); + Value *extracted = extract_scalable_vector(vec, start, extract_size); + if (slice_size == extract_size) { + return extracted; + } else { + Value *sliced = UndefValue::get(get_vector_type(vec, slice_size)); + sliced = insert_scalable_vector(sliced, extracted, 0); + return sliced; + } + } +} + +Value *CodeGen_LLVM::extract_scalable_vector(Value *vec, int start, int extract_size) { + internal_assert(is_scalable_vector(vec) && effective_vscale); + internal_assert(start + extract_size <= get_vector_num_elements(vec->getType())); // No overrun + + if (extract_size == 1) { + return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true)); + } else { + // To follow the requirement of ‘llvm.experimental.vector.extract’ intrinsic that + // idx must be a constant multiple of the known-minimum vector length of the result type, + // the extraction is performed as multiple sub-extraction, where the worst case is extraction of scalar. 
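// --- Illustrative aside, not part of the patch ------------------------------
// Minimal standalone sketch of the greedy split performed by the loop below:
// at each position, pick the largest chunk whose starting lane is a multiple
// of the chunk size, mirroring the index-alignment requirement of the extract
// intrinsic noted above (vscale normalization omitted). The helper name
// split_extract is hypothetical.
#include <cstdio>
#include <utility>
#include <vector>

std::vector<std::pair<int, int>> split_extract(int start, int extract_size) {
    std::vector<std::pair<int, int>> chunks;  // (lane position, chunk size)
    int i = 0;
    while (i < extract_size) {
        const int pos = start + i;
        for (int size = extract_size - i; size > 0; --size) {
            if (pos % size == 0) {  // size 1 always qualifies, so this terminates
                chunks.emplace_back(pos, size);
                i += size;
                break;
            }
        }
    }
    return chunks;
}

int main() {
    // Extracting 7 lanes starting at lane 6 splits into a 6-lane extract at
    // lane 6 followed by a scalar extract at lane 12.
    for (auto [pos, size] : split_extract(6, 7)) {
        printf("extract %d lane(s) at lane %d\n", size, pos);
    }
}
// -----------------------------------------------------------------------------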
+ std::vector sub_slices; + int i = 0; + while (i < extract_size) { + int sub_extract_pos = start + i; + for (int sub_extract_size = extract_size - i; sub_extract_size > 0; --sub_extract_size) { + if (sub_extract_pos % sub_extract_size == 0) { + internal_assert(sub_extract_pos % effective_vscale == 0); + Value *sub_extracted; + if (sub_extract_size == 1) { + sub_extracted = builder->CreateExtractElement(vec, sub_extract_pos); + } else { + // In vector operation, index needs to be normalized by vscale + Value *idx_val = ConstantInt::get(i64_t, sub_extract_pos / effective_vscale, true); + llvm::Type *sub_extract_type = get_vector_type(vec, sub_extract_size); + sub_extracted = builder->CreateExtractVector(sub_extract_type, vec, idx_val); + } + sub_slices.push_back(sub_extracted); + + i += sub_extract_size; + break; + } + } + } + Value *extracted = concat_vectors(sub_slices); + return extracted; + } +} + +Value *CodeGen_LLVM::insert_scalable_vector(Value *base_vec, Value *new_vec, int start) { + // To follow the requirement of ‘llvm.experimental.vector.insert’ intrinsic that + // idx must be a constant multiple of subvec’s known minimum vector length, + // insertion is performed in multiple sub slices. + + const int base_lanes = get_vector_num_elements(base_vec->getType()); + const int new_vec_lanes = get_vector_num_elements(new_vec->getType()); + llvm::Type *element_type = get_vector_element_type(base_vec->getType()); + + internal_assert(start + new_vec_lanes <= base_lanes); + + if (base_lanes == 1 && new_vec_lanes == 1) { + return new_vec; + } + + internal_assert(is_scalable_vector(base_vec) && effective_vscale); + if (!new_vec->getType()->isVectorTy()) { + return builder->CreateInsertElement(base_vec, new_vec, start); + } else if (start % new_vec_lanes == 0) { + // Most of the ordinal use cases are this pattern + // In vector operation, index needs to be normalized by vscale + Value *val_start_index = ConstantInt::get(i64_t, start / effective_vscale, true); + return builder->CreateInsertVector(base_vec->getType(), base_vec, new_vec, val_start_index); + } + + Value *ret = base_vec; + int extract_index = 0; + int insert_index = start; + int sub_slice_size = std::min(start, new_vec_lanes); + + while (extract_index < new_vec_lanes) { + if (extract_index + sub_slice_size <= new_vec_lanes && // Condition to not overrun + extract_index % sub_slice_size == 0 && // Requirement of LLVM intrinsic + insert_index % sub_slice_size == 0) { // Requirement of LLVM intrinsic + + internal_assert(extract_index % effective_vscale == 0); + internal_assert(insert_index % effective_vscale == 0); + + if (sub_slice_size == 1) { + Value *sub_slice = builder->CreateExtractElement(new_vec, extract_index); + ret = builder->CreateInsertElement(ret, sub_slice, insert_index); + } else { + // In vector operation, index needs to be normalized by vscale + Value *val_extract_index = ConstantInt::get(i64_t, extract_index / effective_vscale, true); + Value *val_insert_index = ConstantInt::get(i64_t, insert_index / effective_vscale, true); + llvm::Type *sub_sliced_type = get_vector_type(element_type, sub_slice_size); + Value *sub_slice = builder->CreateExtractVector(sub_sliced_type, new_vec, val_extract_index); + ret = builder->CreateInsertVector(base_vec->getType(), ret, sub_slice, val_insert_index); + } + insert_index += sub_slice_size; + extract_index += sub_slice_size; + } else { + // move on to next candidate + --sub_slice_size; + } + } + return ret; +} + Value *CodeGen_LLVM::reverse_vector(llvm::Value *vec) { if 
(effective_vscale > 0) { return builder->CreateVectorReverse(vec); @@ -5396,6 +5545,13 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, return VectorType::get(t, n, scalable); } +llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Value *vec_or_scalar, int n, + VectorTypeConstraint type_constraint) const { + llvm::Type *t = vec_or_scalar->getType(); + llvm::Type *elt = t->isVectorTy() ? get_vector_element_type(t) : t; + return get_vector_type(elt, n, type_constraint); +} + llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint) const { bool scalable = false; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 183463d5fdb6..b30a44640019 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -512,6 +512,20 @@ class CodeGen_LLVM : public IRVisitor { /** Concatenate a bunch of llvm vectors. Must be of the same type. */ virtual llvm::Value *concat_vectors(const std::vector &); + /** concat_vectors, specialized for scalable vector */ + virtual llvm::Value *concat_scalable_vectors(const std::vector &); + + /** Equivalent of slice_vector, specialized for scalable vector */ + virtual llvm::Value *slice_scalable_vector(llvm::Value *vec, int start, int extent); + + /** Extract a sub vector from a vector, all the elements in the sub vector must be in the src vector. + * Specialized for scalable vector */ + llvm::Value *extract_scalable_vector(llvm::Value *vec, int start, int extract_size); + + /** Insert a vector into the "start" position of a base vector. + * Specialized for scalable vector */ + llvm::Value *insert_scalable_vector(llvm::Value *base_vec, llvm::Value *new_vec, int start); + /** Reverse elements in a vector */ llvm::Value *reverse_vector(llvm::Value *vec); @@ -606,6 +620,9 @@ class CodeGen_LLVM : public IRVisitor { }; llvm::Type *get_vector_type(llvm::Type *, int n, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; + + llvm::Type *get_vector_type(llvm::Value *vec_or_scalar, int n, + VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; // @} llvm::Constant *get_splat(int lanes, llvm::Constant *value, From 9d8fe1184fd89e01b90602a2d9473cf64b6a4cc8 Mon Sep 17 00:00:00 2001 From: Steve Suzuki Date: Fri, 28 Nov 2025 09:11:37 +0000 Subject: [PATCH 2/6] Move helpers for shuffle scalable vectors to CodeGen_ARM Theoretically, these are llvm common and not ARM specific, but for now, keep it for ARM only to avoid any affect to other targets. --- src/CodeGen_ARM.cpp | 173 +++++++++++++++++++++++++++++++++++++++++++ src/CodeGen_LLVM.cpp | 156 -------------------------------------- src/CodeGen_LLVM.h | 17 ----- 3 files changed, 173 insertions(+), 173 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 800f587a4262..c9d3c2a3e2e2 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -202,6 +202,19 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; + + llvm::Type *get_vector_type_from_value(llvm::Value *vec_or_scalar, int n); + Value *concat_vectors(const std::vector &) override; + Value *slice_vector(Value *vec, int start, int extent) override; + + /** Extract a sub vector from a vector, all the elements in the sub vector must be in the src vector. + * Specialized for scalable vector */ + Value *extract_scalable_vector(Value *vec, int start, int extract_size); + + /** Insert a vector into the "start" position of a base vector. 
+ * Specialized for scalable vector */ + Value *insert_scalable_vector(Value *base_vec, Value *new_vec, int start); + Value *interleave_vectors(const std::vector &) override; Value *shuffle_vectors(Value *a, Value *b, const std::vector &indices) override; void codegen_vector_reduce(const VectorReduce *, const Expr &) override; @@ -1973,6 +1986,166 @@ void CodeGen_ARM::visit(const Shuffle *op) { } } +llvm::Type *CodeGen_ARM::get_vector_type_from_value(Value *vec_or_scalar, int n) { + llvm::Type *t = vec_or_scalar->getType(); + llvm::Type *elt = t->isVectorTy() ? get_vector_element_type(t) : t; + return CodeGen_Posix::get_vector_type(elt, n); +} + +Value *CodeGen_ARM::concat_vectors(const vector &vecs) { + // Override only for scalable vector which includes + // the case where scalars are concatenated into scalable vector. + if (target_vscale() == 0 || + vecs.size() <= 1 || + isa(vecs[0]->getType())) { + return CodeGen_Posix::concat_vectors(vecs); + } + + int total_lanes = 0; + for (auto *v : vecs) { + total_lanes += get_vector_num_elements(v->getType()); + } + + llvm::Type *concat_type = get_vector_type(get_vector_element_type(vecs[0]->getType()), total_lanes); + Value *ret = UndefValue::get(concat_type); + int insert_index = 0; + for (auto *v : vecs) { + ret = insert_scalable_vector(ret, v, insert_index); + insert_index += get_vector_num_elements(v->getType()); + } + return ret; +} + +Value *CodeGen_ARM::slice_vector(llvm::Value *vec, int start, int slice_size) { + // Override only for scalable vector + if (target_vscale() == 0 || + !is_scalable_vector(vec)) { + return CodeGen_Posix::slice_vector(vec, start, slice_size); + } + + const int vec_lanes = get_vector_num_elements(vec->getType()); + if (slice_size == 1) { + return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true)); + } else if (start == 0) { + if (vec_lanes == slice_size) { + return vec; + } else if (vec_lanes < slice_size) { + return insert_scalable_vector(UndefValue::get(get_vector_type_from_value(vec, slice_size)), vec, 0); + } else { + auto *dst_type = get_vector_type_from_value(vec, slice_size); + Value *val_index = ConstantInt::get(i64_t, 0, true); + return builder->CreateExtractVector(dst_type, vec, val_index); + } + } else { + const int extract_size = std::min(vec_lanes - start, slice_size); + Value *extracted = extract_scalable_vector(vec, start, extract_size); + if (slice_size == extract_size) { + return extracted; + } else { + Value *sliced = UndefValue::get(get_vector_type_from_value(vec, slice_size)); + sliced = insert_scalable_vector(sliced, extracted, 0); + return sliced; + } + } +} + +Value *CodeGen_ARM::extract_scalable_vector(Value *vec, int start, int extract_size) { + internal_assert(target_vscale() > 0 && is_scalable_vector(vec)); + internal_assert(start + extract_size <= get_vector_num_elements(vec->getType())); // No overrun + + if (extract_size == 1) { + return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true)); + } else { + // To follow the requirement of ‘llvm.vector.extract’ intrinsic that + // idx must be a constant multiple of the known-minimum vector length of the result type, + // the extraction is performed as multiple sub-extraction, where the worst case is extraction of scalar. 
+ std::vector sub_slices; + int i = 0; + while (i < extract_size) { + int sub_extract_pos = start + i; + for (int sub_extract_size = extract_size - i; sub_extract_size > 0; --sub_extract_size) { + if (sub_extract_pos % sub_extract_size == 0) { + internal_assert(sub_extract_pos % target_vscale() == 0); + Value *sub_extracted; + if (sub_extract_size == 1) { + sub_extracted = builder->CreateExtractElement(vec, sub_extract_pos); + } else { + // In vector operation, index needs to be normalized by vscale + Value *idx_val = ConstantInt::get(i64_t, sub_extract_pos / target_vscale(), true); + llvm::Type *sub_extract_type = get_vector_type_from_value(vec, sub_extract_size); + sub_extracted = builder->CreateExtractVector(sub_extract_type, vec, idx_val); + } + sub_slices.push_back(sub_extracted); + + i += sub_extract_size; + break; + } + } + } + Value *extracted = concat_vectors(sub_slices); + return extracted; + } +} + +Value *CodeGen_ARM::insert_scalable_vector(Value *base_vec, Value *new_vec, int start) { + const int base_lanes = get_vector_num_elements(base_vec->getType()); + const int new_vec_lanes = get_vector_num_elements(new_vec->getType()); + llvm::Type *element_type = get_vector_element_type(base_vec->getType()); + + internal_assert(start + new_vec_lanes <= base_lanes); + + if (base_lanes == 1 && new_vec_lanes == 1) { + return new_vec; + } + + internal_assert(target_vscale() > 0 && is_scalable_vector(base_vec)); + + if (!new_vec->getType()->isVectorTy()) { + return builder->CreateInsertElement(base_vec, new_vec, start); + } else if (start % new_vec_lanes == 0) { + // Most of the ordinal use cases are this pattern + // In vector operation, index needs to be normalized by vscale + Value *val_start_index = ConstantInt::get(i64_t, start / target_vscale(), true); + return builder->CreateInsertVector(base_vec->getType(), base_vec, new_vec, val_start_index); + } + + // To follow the requirement of ‘llvm.vector.insert’ intrinsic that + // idx must be a constant multiple of subvec’s known minimum vector length, + // insertion is performed in multiple sub slices. 
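// --- Illustrative aside, not part of the patch ------------------------------
// Standalone model of how the loop below plans an unaligned insert as a series
// of sub-slices whose extract and insert offsets are both multiples of the
// sub-slice size (the constraint llvm.vector.insert imposes). This path is only
// reached when "start" is not already a multiple of the inserted vector's
// length; the names plan_insert and Step are hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Step { int from, to, size; };

std::vector<Step> plan_insert(int start, int new_vec_lanes) {
    std::vector<Step> steps;
    int extract_index = 0, insert_index = start;
    int size = std::min(start, new_vec_lanes);
    while (extract_index < new_vec_lanes) {
        if (extract_index + size <= new_vec_lanes &&
            extract_index % size == 0 &&
            insert_index % size == 0) {
            steps.push_back({extract_index, insert_index, size});
            extract_index += size;
            insert_index += size;
        } else {
            --size;  // try the next smaller size; size 1 always succeeds
        }
    }
    return steps;
}

int main() {
    // Inserting an 8-lane vector at lane 3 becomes sub-slice copies of
    // sizes 3, 3, 1, 1 with offsets aligned on both sides.
    for (const Step &s : plan_insert(3, 8)) {
        printf("copy %d lane(s) from lane %d to lane %d\n", s.size, s.from, s.to);
    }
}
// -----------------------------------------------------------------------------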
+ Value *ret = base_vec; + int extract_index = 0; + int insert_index = start; + int sub_slice_size = std::min(start, new_vec_lanes); + + while (extract_index < new_vec_lanes) { + if (extract_index + sub_slice_size <= new_vec_lanes && // Condition to not overrun + extract_index % sub_slice_size == 0 && // Requirement of LLVM intrinsic + insert_index % sub_slice_size == 0) { // Requirement of LLVM intrinsic + + internal_assert(extract_index % target_vscale() == 0); + internal_assert(insert_index % target_vscale() == 0); + + if (sub_slice_size == 1) { + Value *sub_slice = builder->CreateExtractElement(new_vec, extract_index); + ret = builder->CreateInsertElement(ret, sub_slice, insert_index); + } else { + // In vector operation, index needs to be normalized by vscale + Value *val_extract_index = ConstantInt::get(i64_t, extract_index / target_vscale(), true); + Value *val_insert_index = ConstantInt::get(i64_t, insert_index / target_vscale(), true); + llvm::Type *sub_sliced_type = get_vector_type(element_type, sub_slice_size); + Value *sub_slice = builder->CreateExtractVector(sub_sliced_type, new_vec, val_extract_index); + ret = builder->CreateInsertVector(base_vec->getType(), ret, sub_slice, val_insert_index); + } + insert_index += sub_slice_size; + extract_index += sub_slice_size; + } else { + // move on to next candidate + --sub_slice_size; + } + } + return ret; +} + Value *CodeGen_ARM::interleave_vectors(const std::vector &vecs) { if (simd_intrinsics_disabled() || target_vscale() == 0 || vecs.size() < 2 || diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 746559e9b3a6..4a5b45475533 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4868,10 +4868,6 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes } Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { - if (effective_vscale > 0 && is_scalable_vector(vec)) { - return slice_scalable_vector(vec, start, size); - } - // Force the arg to be an actual vector if (!vec->getType()->isVectorTy()) { vec = create_broadcast(vec, 1); @@ -4935,10 +4931,6 @@ Value *CodeGen_LLVM::concat_vectors(const vector &v) { internal_assert(!v.empty()); - if (effective_vscale > 0 && is_scalable_vector(v[0])) { - return concat_scalable_vectors(v); - } - vector vecs = v; // Force them all to be actual vectors @@ -4998,147 +4990,6 @@ Value *CodeGen_LLVM::concat_vectors(const vector &v) { return vecs[0]; } -Value *CodeGen_LLVM::concat_scalable_vectors(const vector &vecs) { - internal_assert(effective_vscale > 0 && is_scalable_vector(vecs[0])); - int total_lanes = 0; - for (auto* v: vecs) { - total_lanes += get_vector_num_elements(v->getType()); - } - - llvm::Type *concat_type = get_vector_type(get_vector_element_type(vecs[0]->getType()), total_lanes); - Value *ret = UndefValue::get(concat_type); - int insert_index = 0; - for (auto* v: vecs) { - ret = insert_scalable_vector(ret, v, insert_index); - insert_index += get_vector_num_elements(v->getType()); - } - return ret; -} - -Value *CodeGen_LLVM::slice_scalable_vector(llvm::Value *vec, int start, int slice_size) { - const int vec_lanes = get_vector_num_elements(vec->getType()); - if (slice_size == 1) { - return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true)); - } else if (start == 0) { - if (vec_lanes == slice_size) { - return vec; - } else if (vec_lanes < slice_size) { - return insert_scalable_vector(UndefValue::get(get_vector_type(vec, slice_size)), vec, 0); - } else { - auto *dst_type = get_vector_type(vec, slice_size); - 
Value *val_index = ConstantInt::get(i64_t, 0, true); - return builder->CreateExtractVector(dst_type, vec, val_index); - } - } else { - const int extract_size = std::min(vec_lanes - start, slice_size); - Value *extracted = extract_scalable_vector(vec, start, extract_size); - if (slice_size == extract_size) { - return extracted; - } else { - Value *sliced = UndefValue::get(get_vector_type(vec, slice_size)); - sliced = insert_scalable_vector(sliced, extracted, 0); - return sliced; - } - } -} - -Value *CodeGen_LLVM::extract_scalable_vector(Value *vec, int start, int extract_size) { - internal_assert(is_scalable_vector(vec) && effective_vscale); - internal_assert(start + extract_size <= get_vector_num_elements(vec->getType())); // No overrun - - if (extract_size == 1) { - return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true)); - } else { - // To follow the requirement of ‘llvm.experimental.vector.extract’ intrinsic that - // idx must be a constant multiple of the known-minimum vector length of the result type, - // the extraction is performed as multiple sub-extraction, where the worst case is extraction of scalar. - std::vector sub_slices; - int i = 0; - while (i < extract_size) { - int sub_extract_pos = start + i; - for (int sub_extract_size = extract_size - i; sub_extract_size > 0; --sub_extract_size) { - if (sub_extract_pos % sub_extract_size == 0) { - internal_assert(sub_extract_pos % effective_vscale == 0); - Value *sub_extracted; - if (sub_extract_size == 1) { - sub_extracted = builder->CreateExtractElement(vec, sub_extract_pos); - } else { - // In vector operation, index needs to be normalized by vscale - Value *idx_val = ConstantInt::get(i64_t, sub_extract_pos / effective_vscale, true); - llvm::Type *sub_extract_type = get_vector_type(vec, sub_extract_size); - sub_extracted = builder->CreateExtractVector(sub_extract_type, vec, idx_val); - } - sub_slices.push_back(sub_extracted); - - i += sub_extract_size; - break; - } - } - } - Value *extracted = concat_vectors(sub_slices); - return extracted; - } -} - -Value *CodeGen_LLVM::insert_scalable_vector(Value *base_vec, Value *new_vec, int start) { - // To follow the requirement of ‘llvm.experimental.vector.insert’ intrinsic that - // idx must be a constant multiple of subvec’s known minimum vector length, - // insertion is performed in multiple sub slices. 
- - const int base_lanes = get_vector_num_elements(base_vec->getType()); - const int new_vec_lanes = get_vector_num_elements(new_vec->getType()); - llvm::Type *element_type = get_vector_element_type(base_vec->getType()); - - internal_assert(start + new_vec_lanes <= base_lanes); - - if (base_lanes == 1 && new_vec_lanes == 1) { - return new_vec; - } - - internal_assert(is_scalable_vector(base_vec) && effective_vscale); - if (!new_vec->getType()->isVectorTy()) { - return builder->CreateInsertElement(base_vec, new_vec, start); - } else if (start % new_vec_lanes == 0) { - // Most of the ordinal use cases are this pattern - // In vector operation, index needs to be normalized by vscale - Value *val_start_index = ConstantInt::get(i64_t, start / effective_vscale, true); - return builder->CreateInsertVector(base_vec->getType(), base_vec, new_vec, val_start_index); - } - - Value *ret = base_vec; - int extract_index = 0; - int insert_index = start; - int sub_slice_size = std::min(start, new_vec_lanes); - - while (extract_index < new_vec_lanes) { - if (extract_index + sub_slice_size <= new_vec_lanes && // Condition to not overrun - extract_index % sub_slice_size == 0 && // Requirement of LLVM intrinsic - insert_index % sub_slice_size == 0) { // Requirement of LLVM intrinsic - - internal_assert(extract_index % effective_vscale == 0); - internal_assert(insert_index % effective_vscale == 0); - - if (sub_slice_size == 1) { - Value *sub_slice = builder->CreateExtractElement(new_vec, extract_index); - ret = builder->CreateInsertElement(ret, sub_slice, insert_index); - } else { - // In vector operation, index needs to be normalized by vscale - Value *val_extract_index = ConstantInt::get(i64_t, extract_index / effective_vscale, true); - Value *val_insert_index = ConstantInt::get(i64_t, insert_index / effective_vscale, true); - llvm::Type *sub_sliced_type = get_vector_type(element_type, sub_slice_size); - Value *sub_slice = builder->CreateExtractVector(sub_sliced_type, new_vec, val_extract_index); - ret = builder->CreateInsertVector(base_vec->getType(), ret, sub_slice, val_insert_index); - } - insert_index += sub_slice_size; - extract_index += sub_slice_size; - } else { - // move on to next candidate - --sub_slice_size; - } - } - return ret; -} - Value *CodeGen_LLVM::reverse_vector(llvm::Value *vec) { if (effective_vscale > 0) { return builder->CreateVectorReverse(vec); @@ -5545,13 +5396,6 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, return VectorType::get(t, n, scalable); } -llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Value *vec_or_scalar, int n, - VectorTypeConstraint type_constraint) const { - llvm::Type *t = vec_or_scalar->getType(); - llvm::Type *elt = t->isVectorTy() ? get_vector_element_type(t) : t; - return get_vector_type(elt, n, type_constraint); -} - llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint) const { bool scalable = false; diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index b30a44640019..183463d5fdb6 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -512,20 +512,6 @@ class CodeGen_LLVM : public IRVisitor { /** Concatenate a bunch of llvm vectors. Must be of the same type. 
*/ virtual llvm::Value *concat_vectors(const std::vector &); - /** concat_vectors, specialized for scalable vector */ - virtual llvm::Value *concat_scalable_vectors(const std::vector &); - - /** Equivalent of slice_vector, specialized for scalable vector */ - virtual llvm::Value *slice_scalable_vector(llvm::Value *vec, int start, int extent); - - /** Extract a sub vector from a vector, all the elements in the sub vector must be in the src vector. - * Specialized for scalable vector */ - llvm::Value *extract_scalable_vector(llvm::Value *vec, int start, int extract_size); - - /** Insert a vector into the "start" position of a base vector. - * Specialized for scalable vector */ - llvm::Value *insert_scalable_vector(llvm::Value *base_vec, llvm::Value *new_vec, int start); - /** Reverse elements in a vector */ llvm::Value *reverse_vector(llvm::Value *vec); @@ -620,9 +606,6 @@ class CodeGen_LLVM : public IRVisitor { }; llvm::Type *get_vector_type(llvm::Type *, int n, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; - - llvm::Type *get_vector_type(llvm::Value *vec_or_scalar, int n, - VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; // @} llvm::Constant *get_splat(int lanes, llvm::Constant *value, From bfd9535e3b66f2d6ba1c74ba7d4583f9f1b71a0c Mon Sep 17 00:00:00 2001 From: Steve Suzuki Date: Mon, 24 Nov 2025 10:07:05 +0000 Subject: [PATCH 3/6] Modify workaround of using FixedVector for ScalableVector The workaround of checking wide_enough in get_vector_type() was causing the issue of mixing FixedVector and ScalableVector in generating a intrinsic instruction in SVE2 codegen. By this change, we select scalable vector for most of the cases. Note the workaround for vscale > 1 case will be addressed in a separate commit. --- src/CodeGen_LLVM.cpp | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 4a5b45475533..617a40dbe973 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5360,23 +5360,7 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, switch (type_constraint) { case VectorTypeConstraint::None: if (effective_vscale > 0) { - bool wide_enough = true; - // TODO(https://github.com/halide/Halide/issues/8119): Architecture - // specific code should not go here. Ideally part of this can go - // away via LLVM fixes and modifying intrinsic selection to handle - // scalable vs. fixed vectors. Making this method virtual is - // possibly expensive. - if (target.arch == Target::ARM) { - if (!target.has_feature(Target::NoNEON)) { - // force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this. - int bit_size = std::max((int)t->getScalarSizeInBits(), 8); - wide_enough = (bit_size * n) > 128; - } else { - // TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1. - wide_enough = (n / effective_vscale) > 1; - } - } - scalable = wide_enough && ((n % effective_vscale) == 0); + scalable = (n % effective_vscale) == 0; if (scalable) { n = n / effective_vscale; } From cf1619f56850c2c4e2c3a272ddc3860d175c8a85 Mon Sep 17 00:00:00 2001 From: Steve Suzuki Date: Mon, 13 Oct 2025 12:14:06 +0000 Subject: [PATCH 4/6] Shuffle scalable vector in CodeGen_ARM By design, LLVM shufflevector doesn't accept scalable vectors. So, we try to use llvm.vector.xx intrinsic where possible. However, those are not enough to cover wide usage of shuffles in Halide. 
To handle arbitrary index pattern, we decompose a shuffle operation to a sequence of multiple native shuffles, which are lowered to Arm SVE2 intrinsic TBL or TBL2. Another approach could be to perform shuffle in fixed sized vector by adding conversion between scalable vector and fixed vector. However, it seems to be only possible via load/store memory, which would presumably be poor performance. This change also includes: - Peep-hole the particular predicate pattern to emit WHILELT instruction - Shuffle 1bit type scalable vectors as 8bit with type casts - Peep-hole concat_vectors for padding to align up vector - Fix redundant broadcast in CodeGen_LLVM --- src/CMakeLists.txt | 1 + src/CodeGen_ARM.cpp | 261 ++++++++++++++++++++++++++++----- src/CodeGen_LLVM.cpp | 4 +- src/DecomposeVectorShuffle.cpp | 188 ++++++++++++++++++++++++ src/DecomposeVectorShuffle.h | 216 +++++++++++++++++++++++++++ test/internal.cpp | 2 + 6 files changed, 636 insertions(+), 36 deletions(-) create mode 100644 src/DecomposeVectorShuffle.cpp create mode 100644 src/DecomposeVectorShuffle.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af419323b24e..7b6ca6daa69b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -276,6 +276,7 @@ target_sources( Debug.cpp DebugArguments.cpp DebugToFile.cpp + DecomposeVectorShuffle.cpp Definition.cpp Deinterleave.cpp Derivative.cpp diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index c9d3c2a3e2e2..486f9e9e290f 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -6,6 +6,7 @@ #include "CodeGen_Posix.h" #include "ConciseCasts.h" #include "Debug.h" +#include "DecomposeVectorShuffle.h" #include "DistributeShifts.h" #include "IREquality.h" #include "IRMatch.h" @@ -20,6 +21,7 @@ namespace Halide { namespace Internal { +using std::optional; using std::ostringstream; using std::pair; using std::string; @@ -217,6 +219,9 @@ class CodeGen_ARM : public CodeGen_Posix { Value *interleave_vectors(const std::vector &) override; Value *shuffle_vectors(Value *a, Value *b, const std::vector &indices) override; + Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector &indices); + Value *codegen_shuffle_indices(int bits, const std::vector &indices); + Value *codegen_whilelt(int total_lanes, int start, int end); void codegen_vector_reduce(const VectorReduce *, const Expr &) override; bool codegen_dot_product_vector_reduce(const VectorReduce *, const Expr &); bool codegen_pairwise_vector_reduce(const VectorReduce *, const Expr &); @@ -237,6 +242,7 @@ class CodeGen_ARM : public CodeGen_Posix { }; vector casts, calls, negations; + int natural_vector_size(const Halide::Type &t) const; string mcpu_target() const override; string mcpu_tune() const override; string mattrs() const override; @@ -267,6 +273,37 @@ class CodeGen_ARM : public CodeGen_Posix { return Shuffle::make_concat({const_true(true_lanes), const_false(false_lanes)}); } } + + /** Handle general shuffle of vectors. 
See DecomposeVectorShuffle.h about how it works */ + struct VectorShuffler : public DecomposeVectorShuffle { + VectorShuffler(Value *src_a, Value *src_b, const vector &indices, int vl, CodeGen_ARM &codegen) + : DecomposeVectorShuffle(src_a, src_b, indices, vl), codegen(codegen) { + } + + int get_vec_length(Value *v) { + return codegen.get_vector_num_elements(v->getType()); + } + + Value *align_up_vector(Value *v, int align) { + size_t org_len = get_vec_length(v); + return codegen.slice_vector(v, 0, align_up(org_len, align)); + } + + Value *slice_vec(Value *v, int start, size_t lanes) { + return codegen.slice_vector(v, start, lanes); + } + + Value *concat_vecs(const vector &vecs) { + return codegen.concat_vectors(vecs); + } + + Value *shuffle_vl_aligned(Value *a, optional &b, const vector &indices, int vl) { + return codegen.shuffle_scalable_vectors_general(a, b.value_or(nullptr), indices); + } + + private: + CodeGen_ARM &codegen; + }; }; CodeGen_ARM::CodeGen_ARM(const Target &target) @@ -1981,9 +2018,71 @@ void CodeGen_ARM::visit(const Shuffle *op) { value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false); value = CodeGen_Posix::shuffle_vectors(value, op->indices); - } else { + return; + } + + if (target_vscale() == 0) { CodeGen_Posix::visit(op); + return; } + + const int total_lanes = op->type.lanes(); + if (op->type.bits() == 1) { + // Peep-hole pattern that matches SVE "whilelt" which represents particular pattern of + // vector predicate. e.g. 11100000 (active_lanes=3, all_lanes=8) + if (op->is_concat() && op->vectors.size() == 2 && + op->type.is_int_or_uint() && + is_power_of_two(total_lanes) && + total_lanes >= 2 * target_vscale() && total_lanes <= 16 * target_vscale() && + is_const_one(op->vectors[0]) && is_const_zero(op->vectors[1])) { + + int active_lanes = op->vectors[0].type().lanes(); + value = codegen_whilelt(op->type.lanes(), 0, active_lanes); + return; + } else { + // Rewrite to process 1bit type vector as 8 bit vector, and then cast back + std::vector vecs_i8; + vecs_i8.reserve(op->vectors.size()); + for (const auto &vec_i1 : op->vectors) { + Type upgraded_type = vec_i1.type().with_bits(8); + vecs_i8.emplace_back(Cast::make(upgraded_type, vec_i1)); + } + Expr equiv = Shuffle::make(vecs_i8, op->indices); + equiv = Cast::make(op->type, equiv); + equiv = common_subexpression_elimination(equiv); + value = codegen(equiv); + return; + } + } else if (op->is_concat() && op->vectors.size() == 2) { + // Here, we deal with some specific patterns of concat(a, b). + // Others are decomposed by CodeGen_LLVM at first, + // which in turn calles CodeGen_ARM::concat_vectors(). + + if (const Broadcast *bc_1 = op->vectors[1].as()) { + // Common pattern where padding is appended to align lanes. + // Create broadcast of padding with dst lanes, then insert vec[0] at lane 0. 
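// --- Illustrative aside, not part of the patch ------------------------------
// Toy model of the peephole handled below: concat(v, broadcast(pad, k)) gives
// the same lanes as splatting the padding value across the full output width
// and then writing v into the low lanes. The helper name concat_with_padding
// is hypothetical.
#include <cstdio>
#include <vector>

std::vector<int> concat_with_padding(const std::vector<int> &v, int pad, int total_lanes) {
    std::vector<int> out(total_lanes, pad);  // broadcast the padding value
    for (size_t i = 0; i < v.size(); i++) {
        out[i] = v[i];                       // insert v at lane 0
    }
    return out;
}

int main() {
    // Padding a 6-lane vector up to 8 lanes with zeros prints: 1 2 3 4 5 6 0 0
    for (int x : concat_with_padding({1, 2, 3, 4, 5, 6}, 0, 8)) {
        printf("%d ", x);
    }
    printf("\n");
}
// -----------------------------------------------------------------------------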
+ Value *val_0 = codegen(op->vectors[0]); + Value *val_1_scalar = codegen(bc_1->value); + Value *padding = builder->CreateVectorSplat(llvm::ElementCount::getScalable(total_lanes / target_vscale()), val_1_scalar); + value = insert_scalable_vector(padding, val_0, 0); + return; + } + } else if (op->is_broadcast()) { + // Undo simplification to avoid arbitrary-indexed shuffle + Expr equiv; + for (int f = 0; f < op->broadcast_factor(); ++f) { + if (equiv.defined()) { + equiv = Shuffle::make_concat({equiv, op->vectors[0]}); + } else { + equiv = op->vectors[0]; + } + } + equiv = common_subexpression_elimination(equiv); + value = codegen(equiv); + return; + } + + CodeGen_Posix::visit(op); } llvm::Type *CodeGen_ARM::get_vector_type_from_value(Value *vec_or_scalar, int n) { @@ -2186,52 +2285,139 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector & } internal_assert(a->getType() == b->getType()); + llvm::Type *src_type = a->getType(); + llvm::Type *elt = get_vector_element_type(src_type); + const int bits = elt->getScalarSizeInBits(); + // note: lanes are multiplied by vscale + const int natural_lanes = natural_vector_size(Int(bits)); + const int src_lanes = get_vector_num_elements(src_type); + const int dst_lanes = indices.size(); + + if (src_type->isVectorTy()) { + // i1 -> shuffle with i8 -> i1 + if (src_type->getScalarSizeInBits() == 1) { + internal_assert(src_type->isIntegerTy()) << "1 bit floating point type is unexpected\n"; + a = builder->CreateIntCast(a, VectorType::get(i8_t, dyn_cast(src_type)), false); + b = builder->CreateIntCast(b, VectorType::get(i8_t, dyn_cast(src_type)), false); + Value *v = shuffle_vectors(a, b, indices); + return builder->CreateIntCast(v, VectorType::get(i1_t, dyn_cast(v->getType())), false); + } + + // Check if deinterleaved slice + { + // Get the stride of slice + int slice_stride = 0; + const int start_index = indices[0]; + if (dst_lanes > 1) { + const int stride = indices[1] - start_index; + bool stride_equal = true; + for (int i = 2; i < dst_lanes; ++i) { + stride_equal &= (indices[i] == start_index + i * stride); + } + slice_stride = stride_equal ? 
stride : 0; + } + // Lower slice with stride into llvm.vector.deinterleave intrinsic + const std::set supported_strides{2, 3, 4, 8}; + if (supported_strides.find(slice_stride) != supported_strides.end() && + dst_lanes * slice_stride == src_lanes && + indices.front() < slice_stride && // Start position cannot be larger than stride + is_power_of_two(dst_lanes) && + dst_lanes % target_vscale() == 0 && + dst_lanes / target_vscale() > 1) { + + std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType())); + + // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable + llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale); + StructType *sret_type = StructType::get(*context, std::vector(slice_stride, dst_type)); + std::vector arg_types{a->getType()}; + llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + + CallInst *deinterleave = builder->CreateCall(fn, {a}); + // extract one element out of the returned struct + Value *extracted = builder->CreateExtractValue(deinterleave, indices.front()); + + return extracted; + } + } + } + + // Perform vector shuffle by decomposing the operation to multiple native shuffle steps + // which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 instruction + VectorShuffler shuffler(a, b, indices, natural_lanes, *this); + Value *v = shuffler.shuffle(); + return v; +} + +Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector &indices) { llvm::Type *elt = get_vector_element_type(a->getType()); + const int bits = elt->getScalarSizeInBits(); + const int natural_lanes = natural_vector_size(Int(bits)); const int src_lanes = get_vector_num_elements(a->getType()); const int dst_lanes = indices.size(); + llvm::Type *dst_type = get_vector_type(elt, dst_lanes); - // Check if deinterleaved slice - { - // Get the stride of slice - int slice_stride = 0; - const int start_index = indices[0]; - if (dst_lanes > 1) { - const int stride = indices[1] - start_index; - bool stride_equal = true; - for (int i = 2; i < dst_lanes; ++i) { - stride_equal &= (indices[i] == start_index + i * stride); - } - slice_stride = stride_equal ? stride : 0; - } + internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n"; + internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes) + << "Only deal with vector with natural_lanes\n"; - // Lower slice with stride into llvm.vector.deinterleave intrinsic - const std::set supported_strides{2, 3, 4, 8}; - if (supported_strides.find(slice_stride) != supported_strides.end() && - dst_lanes * slice_stride == src_lanes && - indices.front() < slice_stride && // Start position cannot be larger than stride - is_power_of_two(dst_lanes) && - dst_lanes % target_vscale() == 0 && - dst_lanes / target_vscale() > 1) { + // We select TBL or TBL2 intrinsic depending on indices range + bool use_tbl = *std::max_element(indices.begin(), indices.end()) < src_lanes; + internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n"; - std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType())); + auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? 
"tbl" : "tbl2", mangle_llvm_type(dst_type)); - // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable - llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale); - StructType *sret_type = StructType::get(*context, std::vector(slice_stride, dst_type)); - std::vector arg_types{a->getType()}; - llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false); - FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + Value *val_indices = codegen_shuffle_indices(bits, indices); + llvm::Type *vt_natural = get_vector_type(elt, natural_lanes); + std::vector llvm_arg_types; + std::vector llvm_arg_vals; + if (use_tbl) { + llvm_arg_types = {vt_natural, val_indices->getType()}; + llvm_arg_vals = {a, val_indices}; + } else { + llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()}; + llvm_arg_vals = {a, b, val_indices}; + } + llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); - CallInst *deinterleave = builder->CreateCall(fn, {a}); - // extract one element out of the returned struct - Value *extracted = builder->CreateExtractValue(deinterleave, indices.front()); + Value *v = builder->CreateCall(fn, llvm_arg_vals); + return v; +} - return extracted; - } +Value *CodeGen_ARM::codegen_shuffle_indices(int bits, const std::vector &indices) { + const int lanes = indices.size(); + llvm::Type *index_type = IntegerType::get(module->getContext(), bits); + llvm::Type *index_vec_type = get_vector_type(index_type, lanes); + + std::vector llvm_indices(lanes); + for (int i = 0; i < lanes; i++) { + int idx = indices[i]; + llvm_indices[i] = idx >= 0 ? ConstantInt::get(index_type, idx) : UndefValue::get(index_type); } - return CodeGen_Posix::shuffle_vectors(a, b, indices); + Value *v = ConstantVector::get(llvm_indices); + v = builder->CreateInsertVector(index_vec_type, UndefValue::get(index_vec_type), + v, ConstantInt::get(i64_t, 0)); + return v; +} + +Value *CodeGen_ARM::codegen_whilelt(int total_lanes, int start, int end) { + // Generates SVE "whilelt" instruction which represents vector predicate pattern of + // e.g. 
11100000 (total_lanes = 8 , start = 0, end = 3) + // -> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 3) + internal_assert(target_vscale() > 0); + internal_assert(total_lanes % target_vscale() == 0); + std::string instr = concat_strings("llvm.aarch64.sve.whilelt.nxv", total_lanes / target_vscale(), "i1.i32"); + + llvm::Type *pred_type = get_vector_type(llvm_type_of(Int(1)), total_lanes); + llvm::FunctionType *fn_type = FunctionType::get(pred_type, {i32_t, i32_t}, false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + + value = builder->CreateCall(fn, {ConstantInt::get(i32_t, start), ConstantInt::get(i32_t, end)}); + return value; } void CodeGen_ARM::visit(const Ramp *op) { @@ -2659,6 +2845,11 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const { return CodeGen_Posix::upgrade_type_for_storage(t); } +int CodeGen_ARM::natural_vector_size(const Halide::Type &t) const { + internal_assert(t.bits() > 1) << "natural_vector_size requested with 1 bits\n"; + return native_vector_bits() / t.bits(); +} + string CodeGen_ARM::mcpu_target() const { if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 617a40dbe973..6af1cc54286a 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4115,7 +4115,9 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } else { internal_assert(op->indices[0] == 0); } - value = create_broadcast(value, op->indices.size()); + if (op->indices.size() > 1) { + value = create_broadcast(value, op->indices.size()); + } return; } } diff --git a/src/DecomposeVectorShuffle.cpp b/src/DecomposeVectorShuffle.cpp new file mode 100644 index 000000000000..5ebfd28dea99 --- /dev/null +++ b/src/DecomposeVectorShuffle.cpp @@ -0,0 +1,188 @@ +#include "DecomposeVectorShuffle.h" +#include +#include +#include +#include +#include + +namespace Halide { +namespace Internal { +namespace { + +using namespace std; +// #define DVS_DEBUG + +void print_v(const std::string &preamble, const vector &v, const std::string &epilogue = "\n") { +#ifdef DVS_DEBUG + cout << preamble << "["; + for (const auto &e : v) { + cout << e << ", "; + } + cout << "]" << epilogue; +#endif +}; + +//----------------------------- +// Test for DecomposeVectorShuffle using std::vector as VecTy +//----------------------------- +struct STLVectorShuffler : public DecomposeVectorShuffle> { + + STLVectorShuffler(const vector &src_a, const vector &src_b, const vector &indices, int vl) + : DecomposeVectorShuffle(src_a, src_b, indices, vl) { + } + + int get_vec_length(vector &v) { + return static_cast(v.size()); + } + + vector align_up_vector(vector &v, int align) { + size_t org_len = v.size(); + v.resize(align_up(org_len, align), 0); + return v; + } + + vector slice_vec(const vector &v, int start, size_t lanes) { + assert(start + lanes <= v.size()); + return vector(v.begin() + start, v.begin() + start + lanes); + } + + vector concat_vecs(const vector> &vecs) { + vector out; + for (const auto &v : vecs) { + out.insert(out.end(), v.begin(), v.end()); + } + return out; + } + + vector shuffle_vl_aligned(const vector &a, const optional> &b, const vector &indices, int vl) { + if (b.has_value()) { + assert(a.size() == b->size()); + } + assert(a.size() == indices.size()); + assert(indices.size() % vl == 0); + + auto result = shuffle_without_divided(a, b.value_or(vector{}), indices); + + print_v("slice a:", a, ", "); + print_v("slice b:", b.value_or(vector{}), ", "); + print_v("indices:", indices); + print_v(" => slice output:", 
result); + return result; + } + + // Naive implementation of shuffle + vector shuffle_without_divided(const vector &a, const vector &b, const vector &indices) { + int src_lanes = static_cast(a.size()); + vector dst(indices.size(), 0xdeadbeaf); + for (size_t i = 0; i < indices.size(); ++i) { + int idx = indices[i]; + if (idx < 0) { + continue; + } else if (idx < src_lanes) { + dst[i] = a[idx]; + } else { + int idx_b = idx - src_lanes; + assert(idx_b < static_cast(b.size())); + dst[i] = b[idx_b]; + } + } + return dst; + } +}; + +void generate_data(int src_lanes, int dst_lanes, + vector &a, vector &b, vector &indices) { + a.resize(src_lanes); + b.resize(src_lanes); + for (int i = 0; i < src_lanes; ++i) { + a[i] = i * 10; + b[i] = (i + src_lanes) * 10; + } + random_device rd; + mt19937 gen(rd()); + uniform_int_distribution<> dist(0, src_lanes * 2 - 1); + indices.resize(dst_lanes); + for (int i = 0; i < dst_lanes; ++i) { + indices[i] = dist(gen); + } + + print_v("input a: ", a); + print_v("input b: ", b); + print_v("indices: ", indices, "\n\n"); +} + +bool compare_vectors(const vector &ref, const vector &tar) { + print_v("\noutput: ", tar, "\n\n"); + + if (ref.size() != tar.size()) { + cerr << "Vector sizes are different\n"; + return false; + } + for (size_t i = 0; i < ref.size(); ++i) { + if (ref[i] != tar[i]) { + cerr << "Mismatch at index " << i + << ": expected " << ref[i] << ", got " << tar[i] << "\n"; + return false; + } + } + return true; +} + +void run_single_test(int src_lanes, int dst_lanes, int vl) { + vector a, b, indices; + generate_data(src_lanes, dst_lanes, a, b, indices); + + STLVectorShuffler shuffler(a, b, indices, vl); + auto ref = shuffler.shuffle_without_divided(a, b, indices); + auto tar = shuffler.shuffle(); + assert(compare_vectors(ref, tar)); +} + +void run_test(int src_lanes, int dst_lanes, int vl, int repeat) { +#ifdef DVS_DEBUG + cout << "\nRunning " << repeat << " tests for\n src_lanes: " << src_lanes + << ", dst_lanes: " << dst_lanes + << ", vl: " << vl << "\n"; +#endif + + for (int t = 0; t < repeat; ++t) { + run_single_test(src_lanes, dst_lanes, vl); + } +} + +} // namespace + +void decompose_vector_shuffle_test() { + int repeat = 100; + run_test(8, 8, 4, repeat); + run_test(19, 9, 4, repeat); + run_test(5, 3, 8, repeat); + cout << "test_decompose_vector_shuffle passed\n"; +} + +} // namespace Internal +} // namespace Halide + +// #define CLI_TEST_DECOMPOSE_TO_NATIVE_SHUFFLES +#ifdef CLI_TEST_DECOMPOSE_TO_NATIVE_SHUFFLES +int main(int argc, char *argv[]) { + int src_lanes = 19; + int dst_lanes = 9; + int vl = 4; + int repeat = 100; + + if (argc >= 3) { + src_lanes = stoi(argv[1]); + dst_lanes = stoi(argv[2]); + } + if (argc >= 4) { + vl = stoi(argv[3]); + assert(__popcount(vl) == 1 && vl > 1); // power of 2 only + } + + Halide::Internal::run_test(src_lanes, dst_lanes, vl, 100); + cout << "All tests passed\n"; + return 0; +} + +#endif // CLI_TEST_DECOMPOSE_TO_NATIVE_SHUFFLES diff --git a/src/DecomposeVectorShuffle.h b/src/DecomposeVectorShuffle.h new file mode 100644 index 000000000000..2b21320d6a84 --- /dev/null +++ b/src/DecomposeVectorShuffle.h @@ -0,0 +1,216 @@ +#ifndef HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H +#define HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H + +/** \file + * + * Perform vector shuffle by decomposing the operation to + * a sequence of the sub shuffle steps where each step is a shuffle of: + * - One or two slices as input (slice_a and slice_b) + * - Produce one slice (dst slice) + * - All the slices have the same length as target native vector (vl) + * + * 
The structure of the sequence of steps consists of: + * 1. Outer loop to iterate the slices of dst vector. + * 2. Inner loop to iterate the native shuffle steps to complete a single dst slice. + * This can be multiple steps because a single native shuffle can take + * only 2 slices (native vector length x 2) at most, while we may need + * to fetch from wider location in the src vector. + * + * The following example, log of test code, illustrates how it works. + * + * src_lanes: 17, dst_lanes: 7, vl: 4 + * input a: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, ] + * input b: [170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, ] + * indices: [6, 13, 24, 14, 7, 11, 5, ] + * + * slice a:[40, 50, 60, 70, ], slice b:[120, 130, 140, 150, ], indices:[2, 5, -1, 6, ] + * => slice output:[60, 130, -559038801, 140, ] + * slice a:[60, 130, -559038801, 140, ], slice b:[210, 220, 230, 240, ], indices:[0, 1, 7, 3, ] + * => slice output:[60, 130, 240, 140, ] + * slice a:[40, 50, 60, 70, ], slice b:[80, 90, 100, 110, ], indices:[3, 7, 1, -1, ] + * => slice output:[70, 110, 50, -559038801, ] + * + * output: [60, 130, 240, 140, 70, 110, 50, ] + * + */ +#include "Util.h" +#include +#include +#include + +namespace Halide { +namespace Internal { + +/** Base class for the algorithm logic of shuffle decomposition which is implemented + * independently from the type of vector and the implementation of primitive vector operations. + * Therefore, the concrete class must provide the following member functions + * for the specific vector type it handles. + * - get_vec_length + * - align_up_vector + * - slice_vec + * - concat_vecs + * - shuffle_vl_aligned + * */ +template +struct DecomposeVectorShuffle { + + struct NativeShuffle { + int slice_a; + int slice_b; + std::vector lane_map; + + NativeShuffle(int vl, int a, int b) + : slice_a(a), slice_b(b) { + lane_map.resize(vl, -1); + } + }; + + /** Enum to represent the special cases of slice index */ + enum { + SLICE_INDEX_NONE = -1, + SLICE_INDEX_CARRY_PREV_RESULT = -2, + }; + + DecomposeVectorShuffle(const VecTy &src_a, const VecTy &src_b, const std::vector &indices, int vl) + : src_a(src_a), src_b(src_b), indices(indices), vl(vl) { + } + + VecTy shuffle() { + src_lanes = derived.get_vec_length(src_a); + dst_lanes = static_cast(indices.size()); + src_lanes_aligned = align_up(src_lanes, vl); + + std::vector> all_steps = decompose_to_native_shuffles(); + + src_a = derived.align_up_vector(src_a, vl); + src_b = derived.align_up_vector(src_b, vl); + + // process each block divided by vl + std::vector shuffled_dst_slices; + shuffled_dst_slices.reserve(all_steps.size()); + + for (auto &steps_for_dst_slice : all_steps) { + VecTy dst_slice; + + for (const auto &step : steps_for_dst_slice) { + // Obtain 1st slice a + VecTy a; + if (step.slice_a == SLICE_INDEX_CARRY_PREV_RESULT) { + a = dst_slice; + } else { + a = get_vl_slice(step.slice_a); + } + // Obtain 2nd slice b + std::optional b; + if (step.slice_b == SLICE_INDEX_NONE) { + b = std::nullopt; + } else { + b = std::optional(get_vl_slice(step.slice_b)); + } + // Perform shuffle where vector length is aligned + dst_slice = derived.shuffle_vl_aligned(a, b, step.lane_map, vl); + } + + shuffled_dst_slices.push_back(dst_slice); + } + + return derived.slice_vec(derived.concat_vecs(shuffled_dst_slices), 0, dst_lanes); + } + +private: + std::vector> decompose_to_native_shuffles() { + + // Adjust indices so that src vectors are aligned up to multiple of vl + std::vector 
aligned_indices = indices; + for (int &idx : aligned_indices) { + if (idx >= src_lanes) { + idx += src_lanes_aligned - src_lanes; + } + } + + const int num_dst_slices = align_up(dst_lanes, vl) / vl; + std::vector> all_steps(num_dst_slices); + + for (int dst_slice = 0; dst_slice < num_dst_slices; dst_slice++) { + std::unordered_map slice_to_step; + auto &steps = all_steps[dst_slice]; + const int dst_start = dst_slice * vl; + + for (int dst_index = dst_start; dst_index < dst_start + vl && dst_index < dst_lanes; ++dst_index) { + const int src_index = aligned_indices[dst_index]; + const int src_slice = src_index / vl; + const int lane_in_src_slice = src_index % vl; + const int lane_in_dst_slice = dst_index - dst_start; + if (src_index < 0) { + continue; + + } else if (steps.empty()) { + // first slice in this block + slice_to_step[src_slice] = 0; + steps.emplace_back(vl, src_slice, SLICE_INDEX_NONE); + steps.back().lane_map[lane_in_dst_slice] = lane_in_src_slice; + + } else if (auto itr = slice_to_step.find(src_slice); itr != slice_to_step.end()) { + // slice already seen + NativeShuffle &step = steps[itr->second]; + bool is_a = (step.slice_a != SLICE_INDEX_CARRY_PREV_RESULT && step.slice_a == src_slice); + int offset = is_a ? 0 : vl; + step.lane_map[lane_in_dst_slice] = lane_in_src_slice + offset; + + } else if (steps[0].slice_b == SLICE_INDEX_NONE) { + // add as 'b' of first step if b is unused + slice_to_step[src_slice] = 0; + steps[0].slice_b = src_slice; + steps[0].lane_map[lane_in_dst_slice] = lane_in_src_slice + vl; + + } else { + // otherwise chain a new step + slice_to_step[src_slice] = static_cast(steps.size()); + // new step uses previous result as 'a', so we use 'b' for this one + steps.emplace_back(vl, SLICE_INDEX_CARRY_PREV_RESULT, src_slice); + + // Except for the first step, we need to arrange indices + // so that the output carried from the previous step is kept + auto &lane_map = steps.back().lane_map; + // initialize lane_map as identical copy + for (size_t lane_idx = 0; lane_idx < lane_map.size(); ++lane_idx) { + lane_map[lane_idx] = lane_idx; + } + // update for this index + lane_map[lane_in_dst_slice] = lane_in_src_slice + vl; + } + } + } + + return all_steps; + } + + // Helper to extract slice with lanes=vl + VecTy get_vl_slice(int slice_index) { + const int num_slices_a = src_lanes_aligned / vl; + int start_index = slice_index * vl; + if (slice_index < num_slices_a) { + return derived.slice_vec(src_a, start_index, vl); + } else { + start_index -= src_lanes_aligned; + return derived.slice_vec(src_b, start_index, vl); + } + } + + T &derived = static_cast(*this); + VecTy src_a; + VecTy src_b; + std::vector indices; + int vl; + int src_lanes; + int src_lanes_aligned; + int dst_lanes; +}; + +// Test called by test/internal.cpp +void decompose_vector_shuffle_test(); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/test/internal.cpp b/test/internal.cpp index 08283fa9cf54..22cdc7c8c6fb 100644 --- a/test/internal.cpp +++ b/test/internal.cpp @@ -4,6 +4,7 @@ #include "CPlusPlusMangle.h" #include "CSE.h" #include "CodeGen_C.h" +#include "DecomposeVectorShuffle.h" #include "Deinterleave.h" #include "Func.h" #include "Generator.h" @@ -41,6 +42,7 @@ int main(int argc, const char **argv) { propagate_estimate_test(); uniquify_variable_names_test(); spirv_ir_test(); + decompose_vector_shuffle_test(); printf("Success!\n"); return 0; From a7bc84b67fbca6137fc7c503849ee83fece2fd12 Mon Sep 17 00:00:00 2001 From: Steve Suzuki Date: Thu, 11 Dec 2025 20:06:38 +0000 
Subject: [PATCH 5/6] Add DecomposeVectorShuffle to Makefile --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 54c61a622ae8..f8d4968bbc8a 100644 --- a/Makefile +++ b/Makefile @@ -491,6 +491,7 @@ SOURCE_FILES = \ Debug.cpp \ DebugArguments.cpp \ DebugToFile.cpp \ + DecomposeVectorShuffle.cpp \ Definition.cpp \ Deinterleave.cpp \ Derivative.cpp \ From 9c9e6213c03b011278b2c4ac3cf7defc6b6c162b Mon Sep 17 00:00:00 2001 From: Steve Suzuki Date: Sat, 13 Dec 2025 12:06:13 +0000 Subject: [PATCH 6/6] Improve performance of vector broadcast in SVE2 Modified codegen of vector broadcast in SVE2 to emit TBL ARM intrin instead of llvm.vector.insert. Fix performance test failure of nested_vectorization_gemm --- src/CodeGen_ARM.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 486f9e9e290f..da8f61390f0c 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -2067,19 +2067,6 @@ void CodeGen_ARM::visit(const Shuffle *op) { value = insert_scalable_vector(padding, val_0, 0); return; } - } else if (op->is_broadcast()) { - // Undo simplification to avoid arbitrary-indexed shuffle - Expr equiv; - for (int f = 0; f < op->broadcast_factor(); ++f) { - if (equiv.defined()) { - equiv = Shuffle::make_concat({equiv, op->vectors[0]}); - } else { - equiv = op->vectors[0]; - } - } - equiv = common_subexpression_elimination(equiv); - value = codegen(equiv); - return; } CodeGen_Posix::visit(op);
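With the special case above removed, broadcasts reach the generic shuffle path and are lowered through the SVE TBL/TBL2 table-lookup intrinsics emitted by shuffle_scalable_vectors_general(). The following is a rough standalone model of that lookup, not the actual LLVM lowering: the helper name tbl2 is hypothetical, TBL yields zero for out-of-range indices, and the undef indices emitted for don't-care lanes are modeled here as negative values.

#include <cstdio>
#include <vector>

// Toy model of a TBL2-style lookup over two concatenated source vectors.
std::vector<int> tbl2(const std::vector<int> &a, const std::vector<int> &b,
                      const std::vector<int> &idx) {
    std::vector<int> table(a);
    table.insert(table.end(), b.begin(), b.end());
    std::vector<int> out(idx.size(), 0);
    for (size_t i = 0; i < idx.size(); i++) {
        if (idx[i] >= 0 && idx[i] < (int)table.size()) {
            out[i] = table[idx[i]];  // in-range index: read from the table
        }                            // otherwise keep 0 (out-of-range / don't care)
    }
    return out;
}

int main() {
    std::vector<int> a{10, 11, 12, 13}, b{20, 21, 22, 23};
    // An arbitrary two-input shuffle: prints 13 22 10 21
    for (int x : tbl2(a, b, {3, 6, 0, 5})) printf("%d ", x);
    printf("\n");
    // A broadcast of lane 0 is just a constant all-zero index vector: 10 10 10 10
    for (int x : tbl2(a, b, {0, 0, 0, 0})) printf("%d ", x);
    printf("\n");
}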