tile-ai · LeiWang1999 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/src/transform/common/constr_visitor.h b/src/transform/common/constr_visitor.h
@@ -125,6 +125,18 @@ struct ConstrSet {
     }
   }
 
+  /*! \brief Fold every constraint into a single conjunction (logical AND);
+   * an empty set yields Bool(true). */
+  PrimExpr ToConjunction() const {
+    if (constrs_.empty())
+      return Bool(true);
+    PrimExpr result = constrs_[0].ToGenericConstr();
+    for (size_t i = 1; i < constrs_.size(); ++i) {
+      result = tir::And(result, constrs_[i].ToGenericConstr());
+    }
+    return result;
+  }
+
   void format(std::ostream &os) const {
     os << "ConstrSet(size=" << constrs_.size() << ") {\n";
     for (size_t i = 0; i < constrs_.size(); ++i) {

diff --git a/src/transform/thread_storage_sync.cc b/src/transform/thread_storage_sync.cc
@@ -1270,23 +1270,46 @@ struct TileLangThreadSyncPlanner : public ConstrVisitor {
       }
     }
     if (has_same_index) {
-      bool range_is_equal = true;
-      arith::Analyzer prev_analyzer, curr_analyzer;
-      prev.cset.Populate(prev_analyzer);
-      curr.cset.Populate(curr_analyzer);
-      for (unsigned idx = 0; idx != 3; ++idx) {
-        Var prev_var = prev.threads[prev.threads.size() + idx - 3]->var;
-        Var curr_var = curr.threads[curr.threads.size() + idx - 3]->var;
-        auto prev_bound = prev_analyzer.const_int_bound(prev_var);
-        auto curr_bound = curr_analyzer.const_int_bound(curr_var);
-        if (prev_bound->min_value != curr_bound->min_value ||
-            prev_bound->max_value != curr_bound->max_value) {
-          range_is_equal = false;
-          break;
+      // Use Z3 to check if prev and curr constraints are equivalent.
+      // If equivalent, the same set of threads execute both accesses, so no
+      // sync is needed.
+      //
+      // Formally, let P(t) denote the predicate for prev's constraint set and
+      // C(t) denote the predicate for curr's constraint set, where t represents
+      // the thread indices (threadIdx.x, threadIdx.y, threadIdx.z).
+      //
+      // We check bidirectional implication:
+      //   1. P(t) => C(t): Every thread executing prev also executes curr
+      //   2. C(t) => P(t): Every thread executing curr also executes prev
+      //
+      // If both hold, then P(t) <=> C(t), meaning the exact same set of threads
+      // execute both accesses. Combined with has_same_index (same buffer index
+      // expression), this guarantees each thread only accesses locations it
+      // wrote itself, eliminating cross-thread conflicts.
+      PrimExpr prev_constr = prev.cset.ToConjunction();
+      PrimExpr curr_constr = curr.cset.ToConjunction();
+
+      arith::Analyzer analyzer;
+      for (const auto &iv : prev.threads) {
+        if (iv->dom.defined()) {
+          analyzer.Bind(iv->var, iv->dom);
         }
       }
-      if (range_is_equal)
+
+      // Check P => C: ¬P ∨ C
+      bool prev_implies_curr = analyzer.z3_prover.CanProve(
+          tir::Or(tir::Not(prev_constr), curr_constr));
+      // Check C => P: ¬C ∨ P
+      bool curr_implies_prev = analyzer.z3_prover.CanProve(
+          tir::Or(tir::Not(curr_constr), prev_constr));
+
+      if (prev_implies_curr && curr_implies_prev) {
+        // If constraints are equivalent, they are not in conflict
         return false;
+      } else {
+        // If constraints are not equivalent, they are in conflict
+        return true;
+      }
     }
 
     for (size_t i = 0; i < prev.buffer_indices.size(); i++) {

diff --git a/testing/python/issue/test_tilelang_issue_1106.py b/testing/python/issue/test_tilelang_issue_1106.py
@@ -31,8 +31,15 @@ def test_kernel(a: T.Tensor[(m,), dtype], b: T.Tensor[(m,), dtype]):
 def test_issue_1106():
     m = 200
     kernel = get_kernel(m)
-    assert "__syncthreads" not in kernel.get_kernel_source()
+    source = kernel.get_kernel_source()
+    # Ensure __syncthreads is not inside the for loop
+    for_start = source.find("for (int i = 0;")
+    for_end = source.find("__syncthreads")
+    assert for_end > for_start, "__syncthreads should be after the for loop, not inside it"
+    # Check that __syncthreads appears after the closing brace of the outer for loop
+    assert source[for_end - 4 : for_end - 2] == "}\n", "__syncthreads should not be inside any for loop"
-    for_start = source.find("for (int i = 0;")
-    for_end = source.find("__syncthreads")
-    assert for_end > for_start, "__syncthreads should be after the for loop, not inside it"
-    # Check that __syncthreads appears after the closing brace of the outer for loop
-    assert source[for_end - 4 : for_end - 2] == "}\n", "__syncthreads should not be inside any for loop"
+    for_start = source.find("for (int i = 0;")
+    syncthreads_pos = source.find("__syncthreads")
+    assert for_start != -1, "Expected for loop not found in generated source"
+    assert syncthreads_pos != -1, "__syncthreads not found in generated source"
+    assert syncthreads_pos > for_start, "__syncthreads should be after the for loop, not inside it"
+    # Check that __syncthreads appears after the closing brace of the outer for loop
+    preceding_content = source[:syncthreads_pos].rstrip()
+    assert preceding_content.endswith("}"), "__syncthreads should not be inside any for loop"
-    for_start = source.find("for (int i = 0;")
-    for_end = source.find("__syncthreads")
-    assert for_end > for_start, "__syncthreads should be after the for loop, not inside it"
-    # Check that __syncthreads appears after the closing brace of the outer for loop
-    assert source[for_end - 4 : for_end - 2] == "}\n", "__syncthreads should not be inside any for loop"
+    for_start = source.find("for (int i = 0;")
+    syncthreads_pos = source.find("__syncthreads")
+    assert for_start != -1, "Expected for loop not found in generated source"
+    assert syncthreads_pos != -1, "__syncthreads not found in generated source"
+    assert syncthreads_pos > for_start, "__syncthreads should be after the for loop, not inside it"
+    # Check that __syncthreads appears after the closing brace of the outer for loop
+    preceding_content = source[:syncthreads_pos].rstrip()
+    assert preceding_content.endswith("}"), "__syncthreads should not be inside any for loop"
 
 
 if __name__ == "__main__":
-    tilelang.testing.main()
+    # tilelang.testing.main()
+    test_issue_1106()
-if __name__ == "__main__":
-    tilelang.testing.main()
-    # tilelang.testing.main()
-    test_issue_1106()
+if __name__ == "__main__":
+    tilelang.testing.main()
-if __name__ == "__main__":
-    tilelang.testing.main()
-    # tilelang.testing.main()
-    test_issue_1106()
+if __name__ == "__main__":
+    tilelang.testing.main()
diff --git a/testing/python/transform/test_tilelang_transform_thread_sync.py b/testing/python/transform/test_tilelang_transform_thread_sync.py
@@ -3,7 +3,6 @@
 from tilelang import tvm as tvm
 import tilelang.testing
 from tvm.script import tir as T
-from tvm import te
 
 
 def run_passes(func: tvm.tir.PrimFunc):
@@ -42,6 +41,28 @@ def func(p0_arg: T.Buffer((1, 2, 1, 1), "float32"), p1: T.Buffer(2, "float32"))
     assert "T.tvm_storage_sync" in str(mod)
 
 
+@tilelang.testing.requires_cuda
+def test_sync_if_with_same_index_with_modulo_if():
+    @T.prim_func(check_well_formed=False)
+    def func() -> None:
+        threadIdx_x = T.env_thread("threadIdx.x")
+        blockIdx_x = T.env_thread("blockIdx.x")
+        p0 = T.alloc_buffer([1], dtype="float32", scope="local")
+        result_local = T.alloc_buffer([1], dtype="float32", scope="local")
+        temp_shared = T.alloc_buffer([32], dtype="float32", scope="shared")
+        T.launch_thread(blockIdx_x, 1)
+        T.launch_thread(threadIdx_x, 32)
+        ty = T.launch_thread("threadIdx.y", 1)
+        tz = T.launch_thread("threadIdx.z", 1)
+        result_local[0] = T.float32(0)
+        if threadIdx_x % 4 == 0:  # only threads with threadIdx.x % 4 == 0 store
+            temp_shared[threadIdx_x] = p0[0]
+        result_local[0] = temp_shared[threadIdx_x]  # unguarded load, same index
+
+    mod = run_passes(func)
+    assert "T.tvm_storage_sync" in str(mod)  # the pass output must contain a sync
+
+
 @tilelang.testing.requires_cuda
 def test_sync_read_thread_id_independent_location():
     @T.prim_func