src/op/utils.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -47,8 +47,13 @@ inline bool IsGlobalBuffer(const Buffer &buffer) { @@
       return buffer.defined() && buffer.scope() == "global";
     }
-    inline bool IsLocalBuffer(const Buffer &buffer) {
-      return buffer.defined() && buffer.scope() == "local";
+    inline bool IsLocalBuffer(const Buffer &buffer, bool allow_var = false) {
+      if (allow_var) {
+        return buffer.defined() &&
+               (buffer.scope() == "local" || buffer.scope() == "local.var");
+      } else {
+        return buffer.defined() && buffer.scope() == "local";
+      }
     }
     inline bool IsLocalVarBuffer(const Buffer &buffer) {
@@ Expand Down @@

src/transform/loop_vectorize.cc

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -129,15 +129,15 @@ class VectorizePlanner : public arith::IRMutatorWithAnalyzer {
  
            return arith::IRMutatorWithAnalyzer::VisitExpr_(node);

          }

        }

        UpdateVectorSize(node->indices, node->buffer);

        UpdateVectorSize(node->indices, node->buffer, false);

        return arith::IRMutatorWithAnalyzer::VisitExpr_(node);

      }

      Stmt VisitStmt_(const BufferStoreNode *node) final {

        if (node->buffer.scope() == "shared" || node->buffer.scope() == "global" ||

            node->buffer.scope() == "shared.dyn")

          has_nonlocal_memory_access_ = true;

        UpdateVectorSize(node->indices, node->buffer);

        UpdateVectorSize(node->indices, node->buffer, true);

        return arith::IRMutatorWithAnalyzer::VisitStmt_(node);

      }

    @@ -170,7 +170,8 @@ class VectorizePlanner : public arith::IRMutatorWithAnalyzer {
  
        return arith::IRMutatorWithAnalyzer::VisitExpr_(node);

      }

      void UpdateVectorSize(const Array<PrimExpr> indices, const Buffer &buffer) {

      void UpdateVectorSize(const Array<PrimExpr> indices, const Buffer &buffer,

                            bool is_store) {

        if (!inner_for_)

          return;

        // 1. Compute raw element offset

    @@ -187,8 +188,12 @@ class VectorizePlanner : public arith::IRMutatorWithAnalyzer {
  
        for (int i = 0; i < indices.size(); ++i) {

          elem_offset += indices[i] * strides[i];

        }

        // 2. If element offset is independent with loop_var, ignore it

        // 2. If element offset is independent with loop_var, ignore it.

        if (CanProveIndependent(elem_offset, inner_for_->loop_var, analyzer_)) {

          // Specially, if it's a BufferStore, we should not vectorize it.

          if (is_store) {

            vector_size_ = 1;

          }

          return;

        }

        // 3. Check if current vector_size_ works with invariant boundary check

    @@ -198,6 +203,10 @@ class VectorizePlanner : public arith::IRMutatorWithAnalyzer {
  
          vector_size_ = arith::ZeroAwareGCD(

              vector_size_, vector_load_bits_max_ /

                                (buffer->dtype.bits() * buffer->dtype.lanes()));

        } else if (is_store) {

          // If the indices is invariant for BufferStore, we should also not

          // vectorize it.

          vector_size_ = 1;

        }

        // 4. Try to vectorize buffer load

        while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var,

src/transform/lower_tile_op.cc

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -735,7 +735,7 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer {
  
        bool store_into_local = false;

        PostOrderVisit(root, [&](const ObjectRef &obj) {

          if (const auto *store = obj.as<BufferStoreNode>()) {

            if (IsLocalBuffer(store->buffer)) {

            if (IsLocalBuffer(store->buffer, true)) {

              store_into_local = true;

            }

          }

    @@ -748,11 +748,11 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer {
  
        bool local_register_only = true;

        PostOrderVisit(root, [&](const ObjectRef &obj) {

          if (const auto *store = obj.as<BufferStoreNode>()) {

            if (!IsLocalBuffer(store->buffer)) {

            if (!IsLocalBuffer(store->buffer, true)) {

              local_register_only = false;

            }

          } else if (const auto *load = obj.as<BufferLoadNode>()) {

            if (!IsLocalBuffer(load->buffer)) {

            if (!IsLocalBuffer(load->buffer, true)) {

              local_register_only = false;

            }

          }

    @@ -766,11 +766,13 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer {
  
        bool has_non_local = false;

        PostOrderVisit(for_node->body, [&](const ObjectRef &obj) {

          if (const auto *load = obj.as<BufferLoadNode>()) {

            if (!IsLocalBuffer(load->buffer) && !IsFragmentBuffer(load->buffer)) {

            if (!IsLocalBuffer(load->buffer, true) &&

                !IsFragmentBuffer(load->buffer)) {

              has_non_local = true;

            }

          } else if (const auto *store = obj.as<BufferStoreNode>()) {

            if (!IsLocalBuffer(store->buffer) && !IsFragmentBuffer(store->buffer)) {

            if (!IsLocalBuffer(store->buffer, true) &&

                !IsFragmentBuffer(store->buffer)) {

              has_non_local = true;

            }

          }

testing/python/issue/test_tilelang_issue_1549.py

-Original file line number
+Diff line change
@@ -0,0 +1,34 @@
+    import tilelang as tl
+    import tilelang.testing
+    import tilelang.language as T
+    import torch
+    def test_issue_1549_strange_var_vectorization():
+        @tl.jit
+        def get_wrong_kernel(M: int = 4096):
+            dtype = "int32"
+            num_threads = 64
+            @T.prim_func
+            def main(
+                Data: T.Tensor((M,), dtype),
+            ):
+                with T.Kernel(1, threads=num_threads) as _:
+                    # Pre-allocated scalar variables (causes issue in 0.1.7.post1)
+                    idx = T.alloc_var(T.int32)
+                    for i in T.Parallel(M):
+                        idx = i
+                        Data[i] = idx
+            return main
+        kernel = get_wrong_kernel()
+        M = 2048
+        kernel = get_wrong_kernel(M)
+        data = torch.randint(0, 100, (M,), dtype=torch.int32, device="cuda")
+        kernel(data)
+    if __name__ == "__main__":
+        tilelang.testing.main()

[BugFix] Fix some bugs in lowering ParallelOp and VectorizeLoop #1607

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

LeiWang1999 merged 3 commits into tile-ai:main from SiriusNEO:chaofan/vectorize_1229

Jan 6, 2026

-Original file line number
+Diff line change
@@ Expand Up / @@ -47,8 +47,13 @@ inline bool IsGlobalBuffer(const Buffer &buffer) { @@
       return buffer.defined() && buffer.scope() == "global";
     }
-    inline bool IsLocalBuffer(const Buffer &buffer) {
-      return buffer.defined() && buffer.scope() == "local";
+    inline bool IsLocalBuffer(const Buffer &buffer, bool allow_var = false) {
+      if (allow_var) {
+        return buffer.defined() &&
+               (buffer.scope() == "local" || buffer.scope() == "local.var");
+      } else {
+        return buffer.defined() && buffer.scope() == "local";
+      }
     }
     inline bool IsLocalVarBuffer(const Buffer &buffer) {
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,34 @@
+    import tilelang as tl
+    import tilelang.testing
+    import tilelang.language as T
+    import torch
+    def test_issue_1549_strange_var_vectorization():
+        @tl.jit
+        def get_wrong_kernel(M: int = 4096):
+            dtype = "int32"
+            num_threads = 64
+            @T.prim_func
+            def main(
+                Data: T.Tensor((M,), dtype),
+            ):
+                with T.Kernel(1, threads=num_threads) as _:
+                    # Pre-allocated scalar variables (causes issue in 0.1.7.post1)
+                    idx = T.alloc_var(T.int32)
+                    for i in T.Parallel(M):
+                        idx = i
+                        Data[i] = idx
+            return main
+        kernel = get_wrong_kernel()
+        M = 2048
+        kernel = get_wrong_kernel(M)
+        data = torch.randint(0, 100, (M,), dtype=torch.int32, device="cuda")
+        kernel(data)
+    if __name__ == "__main__":
+        tilelang.testing.main()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[BugFix] Fix some bugs in lowering ParallelOp and VectorizeLoop #1607

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!