From 96128eda8297e36bdf847d1c87bc6fac7aadc3c1 Mon Sep 17 00:00:00 2001 From: Freebase6912 Date: Wed, 10 Sep 2025 16:21:30 +0800 Subject: [PATCH 1/5] Fix bug 0905: vectorize with broadcasted value --- src/transform/loop_vectorize.cc | 196 ++++++++++++++------------------ src/transform/loop_vectorize.h | 4 + 2 files changed, 91 insertions(+), 109 deletions(-) diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 2731a2e4f..2b7035d39 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -56,15 +56,9 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { return vector_size_; } - bool GetDynamic() { return dynamic_; } - - PrimExpr GetCondition() { return condition_; } - private: void VisitStmt_(const ForNode *node) final { inner_for_ = node; - iter_map_.Set(node->loop_var, Range(node->min, node->extent)); - arith::IRVisitorWithAnalyzer::VisitStmt_(node); } @@ -117,72 +111,48 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { if (!extent_ptr) return; - const DataType &access_type = buffer->dtype; - // i // 2, i % 8 can also be vectorized as factor 16 - int max_vector_size = vector_load_bits_max_ / access_type.bits(); - // so we should disable this GCD optimization - max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value); - auto last_dim = buffer->shape.back(); - auto mod_set = analyzer_.modular_set(last_dim); - // when dynamic shape like [m, k]: coeff=1, base=0, GCD will block - // conditionally tail vectorize - if (buffer->shape.back().as()) { - max_vector_size = arith::ZeroAwareGCD(max_vector_size, mod_set->coeff); - auto gcd_base = arith::ZeroAwareGCD(max_vector_size, mod_set->base); - // If gcd_base is equal to the last dimension, - // we should analyze the second-to-last dimension - // in relation to the last dimension. 
- if (gcd_base < Downcast(last_dim)->value) { - max_vector_size = gcd_base; - } - vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_); - - // Generate strides if not existed - auto strides = buffer->strides; - if (buffer->strides.empty()) { - PrimExpr stride = 1; - for (int i = indices.size() - 1; i >= 0; --i) { - strides.push_back(stride); - stride = stride * buffer->shape[i]; - } - strides = Array{strides.rbegin(), strides.rend()}; + // 1. Compute raw element offset + auto strides = buffer->strides; + if (buffer->strides.empty()) { + PrimExpr stride = 1; + for (int i = indices.size() - 1; i >= 0; --i) { + strides.push_back(stride); + stride = stride * buffer->shape[i]; } + strides = Array{strides.rbegin(), strides.rend()}; + } + PrimExpr elem_offset = 0; + for (int i = 0; i < indices.size(); ++i) { + elem_offset += indices[i] * strides[i]; + } - // Generate and check element offset expression - ICHECK(indices.size() == strides.size()) << "Invalid indices and strides"; - PrimExpr elem_offset = 0; - for (int i = 0; i < indices.size(); ++i) { - elem_offset += indices[i] * strides[i]; - } - while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var, - inner_for_->extent, vector_size_, - &analyzer_)) { - vector_size_ /= 2; - } - } else if (vector_size_ <= vector_load_bits_max_ / buffer->dtype.bits()) { - // dynamic shape load: get the vectorization condition - dynamic_ = true; - PrimExpr offset = buffer.OffsetOf(indices).back(); - condition_ = (FloorMod(offset, vector_size_) == 0); + // 2. If element offset is independent with loop_var, ignore it + if (CanProveIndependent(elem_offset, inner_for_->loop_var, &analyzer_)) { + return; + } + + // 3. Tight vectorize bound + int max_vector_size = vector_load_bits_max_ / buffer->dtype.bits(); + max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value); + vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_); + + // 4. 
Try to vectorize buffer load + while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var, + inner_for_->extent, vector_size_, &analyzer_)) { + vector_size_ /= 2; } } const int vector_load_bits_max_ = 128; const ForNode *inner_for_{}; - Map iter_map_; bool has_nonlocal_memory_access_ = false; int vector_size_ = 128; - // conditionally vectorize - bool dynamic_ = false; - PrimExpr condition_; }; class VectorizeRewriter : public StmtExprMutator { public: - VectorizeRewriter(const VectorizePlanResult &plan) - : vector_size_(plan.vector_size), condition_(plan.condition), - dynamic_(plan.dynamic) {} + VectorizeRewriter(int vector_size) : vector_size_(vector_size) {} private: Stmt VisitStmt_(const ForNode *node) final { @@ -197,23 +167,19 @@ class VectorizeRewriter : public StmtExprMutator { ICHECK(extent % vector_size_ == 0) << "extent: " << extent << " vector_size_: " << vector_size_; ICHECK(is_zero(fnode->min)); - if (!dynamic_) { // check dynamic shape - if (extent == vector_size_) { - fnode.CopyOnWrite()->kind = ForKind::kVectorized; - return fnode; - } else { - Var inner_var = Var("vec"); - Var outer_var = Var(old_var->name_hint); - Map vmap; - vmap.Set(fnode->loop_var, outer_var * vector_size_ + inner_var); - Stmt body = Substitute(fnode->body, vmap); - body = For(inner_var, 0, vector_size_, ForKind::kVectorized, body); - body = For(outer_var, 0, extent / vector_size_, fnode->kind, body, - fnode->thread_binding, fnode->annotations, fnode->span); - return body; - } - } else { + if (extent == vector_size_) { + fnode.CopyOnWrite()->kind = ForKind::kVectorized; return fnode; + } else { + Var inner_var = Var("vec"); + Var outer_var = Var(old_var->name_hint); + Map vmap; + vmap.Set(fnode->loop_var, outer_var * vector_size_ + inner_var); + Stmt body = Substitute(fnode->body, vmap); + body = For(inner_var, 0, vector_size_, ForKind::kVectorized, body); + body = For(outer_var, 0, extent / vector_size_, fnode->kind, body, + fnode->thread_binding, fnode->annotations, 
fnode->span); + return body; } } else { return ret; @@ -222,35 +188,48 @@ class VectorizeRewriter : public StmtExprMutator { const ForNode *inner_for_{}; const int vector_size_; - const PrimExpr condition_; - const bool dynamic_; }; int GetVectorizeSize(const For &loop) { return VectorizePlanner().Plan(loop); } -VectorizePlanResult GetVectorizePlanResult(const For &loop) { - VectorizePlanner planner; - int vector_size = planner.Plan(loop); - bool dynamic = planner.GetDynamic(); - PrimExpr condition = planner.GetCondition(); - return {vector_size, dynamic, condition}; +bool CanProveIndependent(const PrimExpr &expr, Var var, + arith::Analyzer *analyzer) { + // 1. if var doesn't exist, it is independent + struct FindVarVisitor : ExprVisitor { + Var target; + bool found = false; + FindVarVisitor(Var target) : target(std::move(target)) {} + void run(const PrimExpr &expr) { this->VisitExpr(expr); } + void VisitExpr_(const VarNode *node) final { + if (node == target.get()) { + found = true; + } + } + }; + FindVarVisitor visitor(var); + visitor.run(expr); + if (!visitor.found) + return true; + // 2. 
if \forall v_1, v_2, f(v_1) == f(v_2), f is independent with v + Var var_1("_t", var.dtype()); + auto expr_1 = Substitute(expr, {{var, var_1}}); + if (analyzer->CanProveEqual(expr, expr_1)) { + return true; + } + return false; } bool IndiceCanVectorize(const PrimExpr &expr, Var var, const PrimExpr &iter_var_size, - int target_vectorized_size, arith::Analyzer *analyzer) { + int target_vectorized_size, + arith::Analyzer *analyzer) { ICHECK(target_vectorized_size >= 1); if (target_vectorized_size == 1) return true; // Extent must be divisible if (!analyzer->CanProveEqual(FloorMod(iter_var_size, target_vectorized_size), - 0)) - return false; - - // The base offset must be divisible - if (!analyzer->CanProveEqual( - FloorMod(Substitute(expr, {{var, 0}}), target_vectorized_size), 0)) { + 0)) { return false; } @@ -259,35 +238,34 @@ bool IndiceCanVectorize(const PrimExpr &expr, Var var, analyzer->Bind(v0, Range(0, target_vectorized_size)); analyzer->Bind(v1, Range(0, analyzer->Simplify(FloorDiv( iter_var_size, target_vectorized_size)))); - PrimExpr expr_transformed = analyzer->Simplify( + PrimExpr access_pos = analyzer->Simplify( Substitute(expr, {{var, v0 + v1 * target_vectorized_size}})); - Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size)); - PrimExpr expr_vectorized = vectorizer.VisitExpr(expr_transformed); - - // This simplify is necessary for thread region specified - // optimizations. 
- expr_vectorized = analyzer->Simplify(expr_vectorized); - auto ramp_node = expr_vectorized.as(); - if (!ramp_node) { - // Broadcast value - if (expr_vectorized.dtype().lanes() == 1) - return true; - else - return false; - } else { - return is_one(ramp_node->stride); + // for (int ph_v = target_vectorized_size; ph_v > 1; ph_v /= 2) { + // ph_v: physical load/store vectorized size + // TODO: allow a more generalized vectorize: B[i] = A[i // 2] + auto ph_v = target_vectorized_size; + auto group = target_vectorized_size / ph_v; + // Check if access_pos is contingentous: ap === v0 // group (mod ph_v) + auto is_contingous = analyzer->CanProveEqual(FloorMod(access_pos, ph_v), + FloorDiv(v0, group)); + // Check if access is aligned + auto is_aligned = analyzer->CanProveEqual( + FloorMod(Substitute(expr, {{var, 0}}), ph_v), 0); + if (is_contingous && is_aligned) { + return true; } + // } + return false; } For VectorizeLoop(const For &loop, int vectorize_hint) { - VectorizePlanResult res{128, false, 0}; if (vectorize_hint <= 0) { - res = GetVectorizePlanResult(loop); - vectorize_hint = res.vector_size; + VectorizePlanner planner; + vectorize_hint = planner.Plan(loop); } if (vectorize_hint == 1) return loop; - auto rewriter = VectorizeRewriter(res); + auto rewriter = VectorizeRewriter(vectorize_hint); return Downcast(rewriter(loop)); } diff --git a/src/transform/loop_vectorize.h b/src/transform/loop_vectorize.h index 253461e8a..4ab20c668 100644 --- a/src/transform/loop_vectorize.h +++ b/src/transform/loop_vectorize.h @@ -37,6 +37,10 @@ int GetVectorizeSize(const For &loop); For VectorizeLoop(const For &loop, int vectorize_hint = -1); +// Can prove expr is independent with var, i.e. 
the value of expr doesn't change +// when var changes +bool CanProveIndependent(const PrimExpr &expr, Var var, + arith::Analyzer *analyzer); bool IndiceCanVectorize(const PrimExpr &expr, Var var, const PrimExpr &iter_var_size, int target_vectorized_size, arith::Analyzer *analyzer); From 1fb176ab0df057f97f27e70b14f150b40609bd3d Mon Sep 17 00:00:00 2001 From: Freebase6912 Date: Wed, 10 Sep 2025 16:21:54 +0800 Subject: [PATCH 2/5] fix lint error --- src/transform/loop_vectorize.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 2b7035d39..29417bd8f 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -221,8 +221,7 @@ bool CanProveIndependent(const PrimExpr &expr, Var var, bool IndiceCanVectorize(const PrimExpr &expr, Var var, const PrimExpr &iter_var_size, - int target_vectorized_size, - arith::Analyzer *analyzer) { + int target_vectorized_size, arith::Analyzer *analyzer) { ICHECK(target_vectorized_size >= 1); if (target_vectorized_size == 1) return true; @@ -246,11 +245,11 @@ bool IndiceCanVectorize(const PrimExpr &expr, Var var, auto ph_v = target_vectorized_size; auto group = target_vectorized_size / ph_v; // Check if access_pos is contingentous: ap === v0 // group (mod ph_v) - auto is_contingous = analyzer->CanProveEqual(FloorMod(access_pos, ph_v), - FloorDiv(v0, group)); + auto is_contingous = + analyzer->CanProveEqual(FloorMod(access_pos, ph_v), FloorDiv(v0, group)); // Check if access is aligned - auto is_aligned = analyzer->CanProveEqual( - FloorMod(Substitute(expr, {{var, 0}}), ph_v), 0); + auto is_aligned = + analyzer->CanProveEqual(FloorMod(Substitute(expr, {{var, 0}}), ph_v), 0); if (is_contingous && is_aligned) { return true; } From c08b815555933e08b6bc2a29b3018b2c09f55012 Mon Sep 17 00:00:00 2001 From: Freebase6912 Date: Fri, 12 Sep 2025 11:04:43 +0800 Subject: [PATCH 3/5] [Refactor] Use `tvm::tir::UsesVar` and use 
Vectorizer --- src/transform/loop_vectorize.cc | 59 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 29417bd8f..3c72d44e9 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -35,6 +35,8 @@ #include "arith/int_operator.h" #include "arith/ir_visitor_with_analyzer.h" #include "common/loop_vectorization_utils.h" +#include "tvm/tir/analysis.h" +#include "tvm/tir/var.h" namespace tvm { namespace tl { @@ -195,21 +197,11 @@ int GetVectorizeSize(const For &loop) { return VectorizePlanner().Plan(loop); } bool CanProveIndependent(const PrimExpr &expr, Var var, arith::Analyzer *analyzer) { // 1. if var doesn't exist, it is independent - struct FindVarVisitor : ExprVisitor { - Var target; - bool found = false; - FindVarVisitor(Var target) : target(std::move(target)) {} - void run(const PrimExpr &expr) { this->VisitExpr(expr); } - void VisitExpr_(const VarNode *node) final { - if (node == target.get()) { - found = true; - } - } - }; - FindVarVisitor visitor(var); - visitor.run(expr); - if (!visitor.found) + bool used_var = UsesVar( + expr, [&](const VarNode *v) { return GetRef(v).same_as(var); }); + if (!used_var) { return true; + } // 2. 
if \forall v_1, v_2, f(v_1) == f(v_2), f is independent with v Var var_1("_t", var.dtype()); auto expr_1 = Substitute(expr, {{var, var_1}}); @@ -228,7 +220,12 @@ bool IndiceCanVectorize(const PrimExpr &expr, Var var, // Extent must be divisible if (!analyzer->CanProveEqual(FloorMod(iter_var_size, target_vectorized_size), - 0)) { + 0)) + return false; + + // The base offset must be divisible + if (!analyzer->CanProveEqual( + FloorMod(Substitute(expr, {{var, 0}}), target_vectorized_size), 0)) { return false; } @@ -237,24 +234,24 @@ bool IndiceCanVectorize(const PrimExpr &expr, Var var, analyzer->Bind(v0, Range(0, target_vectorized_size)); analyzer->Bind(v1, Range(0, analyzer->Simplify(FloorDiv( iter_var_size, target_vectorized_size)))); - PrimExpr access_pos = analyzer->Simplify( + PrimExpr expr_transformed = analyzer->Simplify( Substitute(expr, {{var, v0 + v1 * target_vectorized_size}})); - // for (int ph_v = target_vectorized_size; ph_v > 1; ph_v /= 2) { - // ph_v: physical load/store vectorized size - // TODO: allow a more generalized vectorize: B[i] = A[i // 2] - auto ph_v = target_vectorized_size; - auto group = target_vectorized_size / ph_v; - // Check if access_pos is contingentous: ap === v0 // group (mod ph_v) - auto is_contingous = - analyzer->CanProveEqual(FloorMod(access_pos, ph_v), FloorDiv(v0, group)); - // Check if access is aligned - auto is_aligned = - analyzer->CanProveEqual(FloorMod(Substitute(expr, {{var, 0}}), ph_v), 0); - if (is_contingous && is_aligned) { - return true; + Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size)); + PrimExpr expr_vectorized = vectorizer.VisitExpr(expr_transformed); + + // This simplify is necessary for thread region specified + // optimizations. 
+ expr_vectorized = analyzer->Simplify(expr_vectorized); + auto ramp_node = expr_vectorized.as(); + if (!ramp_node) { + // Broadcast value + if (expr_vectorized.dtype().lanes() == 1) + return true; + else + return false; + } else { + return is_one(ramp_node->stride); } - // } - return false; } For VectorizeLoop(const For &loop, int vectorize_hint) { From cbf7b3cd39149ba2bf30de01947ca7022eb05fc0 Mon Sep 17 00:00:00 2001 From: Freebase6912 Date: Mon, 15 Sep 2025 11:20:48 +0800 Subject: [PATCH 4/5] Add loop size check in vectorize planner --- src/transform/loop_vectorize.cc | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 3c72d44e9..6ac9c6386 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -24,19 +24,14 @@ #include "loop_vectorize.h" -#include -#include -#include - -#include - -#include "../layout/layout.h" -#include "../layout/utils.h" #include "arith/int_operator.h" #include "arith/ir_visitor_with_analyzer.h" #include "common/loop_vectorization_utils.h" #include "tvm/tir/analysis.h" #include "tvm/tir/var.h" +#include +#include +#include namespace tvm { namespace tl { @@ -61,6 +56,14 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { private: void VisitStmt_(const ForNode *node) final { inner_for_ = node; + auto extent_ptr = as_const_int(node->extent); + // Here I disable dynamic shape completely, + // In order to do it, the Planner should accept an analyzer with arithmetic info outside to prove the dividiblity of vector size + if(!extent_ptr) { + vector_size_ = 1; + return; + } + vector_size_ = arith::ZeroAwareGCD(vector_size_, *extent_ptr); arith::IRVisitorWithAnalyzer::VisitStmt_(node); } @@ -109,10 +112,6 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { void UpdateVectorSize(const Array &indices, const Buffer &buffer) { if (!inner_for_) return; - auto extent_ptr = 
inner_for_->extent.as(); - if (!extent_ptr) - return; - // 1. Compute raw element offset auto strides = buffer->strides; if (buffer->strides.empty()) { @@ -134,9 +133,8 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { } // 3. Tight vectorize bound - int max_vector_size = vector_load_bits_max_ / buffer->dtype.bits(); - max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value); - vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_); + vector_size_ = arith::ZeroAwareGCD(vector_size_, vector_load_bits_max_ / + buffer->dtype.bits()); // 4. Try to vectorize buffer load while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var, From d8ec4620a90c1f5e4f8f81da25d8ee2fa4f28bd9 Mon Sep 17 00:00:00 2001 From: Freebase6912 Date: Mon, 15 Sep 2025 11:21:36 +0800 Subject: [PATCH 5/5] fix lint error --- src/transform/loop_vectorize.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 6ac9c6386..3b33fa985 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -57,9 +57,10 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer { void VisitStmt_(const ForNode *node) final { inner_for_ = node; auto extent_ptr = as_const_int(node->extent); - // Here I disable dynamic shape completely, - // In order to do it, the Planner should accept an analyzer with arithmetic info outside to prove the dividiblity of vector size - if(!extent_ptr) { + // Here I disable dynamic shape completely, + // In order to do it, the Planner should accept an analyzer with + // arithmetic info outside to prove the divisibility of vector size + if (!extent_ptr) { vector_size_ = 1; return; }