apache
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
Lines changed: 63 additions & 26 deletions
diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py
Lines changed: 53 additions & 18 deletions
@@ -372,12 +372,12 @@ class IterMapRewriter : public ExprMutator {
   //                                                              IterSplit(k, scale=1)),
   //                                                      extent=9)
   //                                             scale=1))
-  // Example(2): expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2)
+  // Example(2): expr = i*8 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2)
   //          predicate: 1 <= j*2 + k < 9
-  // Then,    flattened form = IterSum(IterSplit(i, scale=9),
+  // Then,    flattened form = IterSum(IterSplit(i, scale=8),
   //                                   IterSplit(j, scale=2),
   //                                   IterSplit(k, scale=1))
-  //          normal form    = IterSum(IterSplit(i, scale=9),
+  //          normal form    = IterSum(IterSplit(i, scale=8),
   //                                   IterSplit(IterMark(IterSum(IterSplit(j, scale=2),
   //                                                              IterSplit(k, scale=1), base=-1),
   //                                                      extent=9-1)
@@ -495,7 +495,7 @@ class IterMapRewriter : public ExprMutator {
    */
   IterSumExpr NormalizeToIterOnBoundExpr(IterSumExpr expr, PrimExpr predicate_induced_min,
                                          PrimExpr predicate_induced_max) {
-    // remove base temporarily since `TryFuseIters` require zero base iter sum
+    // normalize to zero base
     PrimExpr base = expr->base;
     if (!is_zero(base)) {
       expr.CopyOnWrite()->base = 0;
@@ -506,39 +506,40 @@ class IterMapRewriter : public ExprMutator {
     ICHECK(!opt.defined() || opt.value()->args.size() == 1);
     // scale should be 1
     if (opt.defined() && is_one(opt.value()->args[0]->scale)) {
-      IterSplitExpr fused_split = opt.value()->args[0];
-      IterSumExpr sum = Downcast<IterSumExpr>(fused_split->source->source);
+      const IterSplitExpr split = opt.value()->args[0];
+      IterSumExpr structured_form = Downcast<IterSumExpr>(split->source->source);
       // get the flattened form
-      auto it = flattened_map_.find(sum);
+      auto it = flattened_map_.find(structured_form);
       ICHECK(it != flattened_map_.end());
       IterSumExpr flattened_form = it->second;
-      // get the mark
+      // get the mark and offset of the structured_form
       auto it_mark = sum_fuse_map_.find(flattened_form);
       ICHECK(it_mark != sum_fuse_map_.end());
       IterMark mark = it_mark->second.mark;
       PrimExpr mark_offset = it_mark->second.offset;
-      // update iter mark iter range to [0, mark->extent) ^ [pred_min, pred_max)
-      PrimExpr mark_min = 0;
-      PrimExpr mark_max = mark->extent;
+      PrimExpr iter_min = mark_offset;
+      PrimExpr iter_max = iter_min + mark->extent;
       if (predicate_induced_min.defined()) {
-        mark_min = max(predicate_induced_min, mark_min);
+        iter_min = max(predicate_induced_min, iter_min);
       }
       if (predicate_induced_max.defined()) {
-        mark_max = min(predicate_induced_max, mark_max);
+        iter_max = min(predicate_induced_max, iter_max);
       }
-      // mark.CopyOnWrite()->min = mark_min;
-      mark.CopyOnWrite()->source = mark->source - mark_min;
-      mark.CopyOnWrite()->extent = mark_max - mark_min;
-      mark_offset = mark_offset + mark_min;
-
-      // update the bound of the lhs based on predicate_induced_extent
-      sum_fuse_map_[flattened_form] = {mark, mark_offset};
+      if (!is_zero(iter_min)) {
+        // structured form's offset should be updated
+        flattened_map_.erase(structured_form);
+        structured_form.CopyOnWrite()->base = -iter_min;
+        mark.CopyOnWrite()->source = structured_form;
+        flattened_map_[structured_form] = flattened_form;
+      }
+      mark.CopyOnWrite()->extent = iter_max - iter_min;
+      sum_fuse_map_[flattened_form] = {mark, iter_min};
 
       // we need to note down the flattened form of constrained iterators
       // to check the validity of constraints, see also CheckConstraints()
       constrained_iters_flattened_.push_back(flattened_form);
-      expr.CopyOnWrite()->args = Array<IterSplitExpr>({fused_split});
-      expr.CopyOnWrite()->base = base + mark_min;
+      expr.CopyOnWrite()->args = Array<IterSplitExpr>({split});
+      expr.CopyOnWrite()->base = base + iter_min;
       return expr;
     }
     Fail(Diagnostic::Error(expr->span)
@@ -554,7 +555,7 @@ class IterMapRewriter : public ExprMutator {
    */
   IterSumExpr NormalizeToIterWithOffset(IterSumExpr expr) {
     // We are normalizing a regular iter
-    if (expr->args.size() <= 1) return expr;
+    if (expr->args.size() < 1) return expr;
     Optional<IterSumExpr> opt = TryFuseIters(expr);
     if (opt.defined()) {
       return opt.value();
@@ -593,6 +594,7 @@ class IterMapRewriter : public ExprMutator {
   Optional<IterSumExpr> TryFuseIters(IterSumExpr expr) {
     // select the iterators in order
     std::vector<bool> visited(expr->args.size(), false);
+    size_t num_visited = 0;
     std::vector<IterSplitExpr> flattened_iters, grouped_iters;
     // canonicalize the expression into two different forms: flattened form and structured form
     // step0. check if find the base scale first
@@ -606,7 +608,11 @@ class IterMapRewriter : public ExprMutator {
         }
       }
     }
-    if (!base_scale) return NullOpt;
+    if (!base_scale) {
+      diag_ctx_.Emit(Diagnostic::Error(expr->span)
+                     << "Fuse iters failed, can not find a valid base scale");
+      return NullOpt;
+    }
     // check if it can be remapped into a fused pattern.
     PrimExpr expected_extra_base = 0;
     PrimExpr expected_scale = base_scale.value();
@@ -616,7 +622,11 @@ class IterMapRewriter : public ExprMutator {
       for (; j < expr->args.size(); ++j) {
         if (!visited[j] && analyzer_->CanProveEqual(expr->args[j]->scale, expected_scale)) break;
       }
-      if (j == expr->args.size()) return NullOpt;
+      if (j == expr->args.size()) {
+        diag_ctx_.Emit(Diagnostic::Error(expr->span)
+                       << "Fuse iters failed, can not find expected scale " << expected_scale);
+        return NullOpt;
+      }
       // look for the longest constrained iter started from expr->args[j]
       // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2)
       //          predicate: j*2 + k < 9
@@ -637,6 +647,8 @@ class IterMapRewriter : public ExprMutator {
         // Example: expr = i*9 + j*2 + k, i in [0, 4) j in [0, 5) k in [0, 2)
         //          predicate = j*2 + k < 9
         //          then j*2 + k matches the lower two splits of expr
+        size_t flattened_iters_pos = flattened_iters.size();
+        bool match_constraint_suffix = false;
         for (auto it = constraint_to_match.value()->args.rbegin();
              it != constraint_to_match.value()->args.rend(); ++it) {
           size_t k = 0;
@@ -646,10 +658,32 @@ class IterMapRewriter : public ExprMutator {
                 break;
             }
           }
-          if (k == expr->args.size()) return NullOpt;
+          if (k == expr->args.size()) {
+            if (i == 0 && num_visited == visited.size()) {
+              // If the match failed because the iterators ran out (rather than a scale mismatch)
+              // and every iterator was visited in this match round, fall back to skipping the
+              // constraint. Example: exprs = [i * 2 + j, k], i in [0, 3), j in [0, 2), k in [0, 4)
+              //          predicate = i * 8 + j * 4 + k < 10
+              for (size_t pos = flattened_iters_pos; pos < flattened_iters.size(); ++pos) {
+                grouped_iters.push_back(flattened_iters[pos]);
+                expected_scale *= flattened_iters[pos]->extent;
+              }
+              match_constraint_suffix = true;
+              break;
+            }
+            diag_ctx_.Emit(Diagnostic::Error(expr->span)
+                           << "Fuse iters failed, can not find flattened iter match constraint "
+                           << constraint_to_match.value());
+            return NullOpt;
+          }
           visited[k] = true;
+          num_visited += 1;
           flattened_iters.push_back(expr->args[k]);
         }
+        if (match_constraint_suffix) {
+          // all iters are used to match the constraint, but only a suffix is matched.
+          break;
+        }
         auto iter = sum_fuse_map_.find(constraint_to_match.value());
         ICHECK(iter != sum_fuse_map_.end());
         const IterMarkWithOffset& iter_matched = iter->second;
@@ -661,6 +695,7 @@ class IterMapRewriter : public ExprMutator {
       } else {
         // constraint_to_match not found, skip this iterator
         visited[j] = true;
+        num_visited += 1;
         flattened_iters.push_back(expr->args[j]);
         grouped_iters.push_back(expr->args[j]);
         expected_scale *= expr->args[j]->extent;
@@ -681,6 +716,8 @@ class IterMapRewriter : public ExprMutator {
       // old iter
       if (!analyzer_->CanProveEqual(expected_extra_base, it->second.offset * base_scale.value())) {
         // the extra offset is not consistent with old
+        diag_ctx_.Emit(Diagnostic::Error(expr->span)
+                       << "Fuse iters failed, the extra offset is not consistent with old");
         return NullOpt;
       }
       return IterSumExpr({IterSplitExpr(it->second.mark, base_scale.value())},
 
@@ -16,6 +16,7 @@
 # under the License.
 import tvm
 from tvm import te
+from tvm import tir
 from tvm.ir.base import structural_equal
 
 
@@ -218,14 +219,9 @@ def test_region_lower_bound_for_non_perfect_tile():
     h1 = tvm.tir.Var("h1", "int32")
     h2 = tvm.tir.Var("h2", "int32")
     h3 = tvm.tir.Var("h3", "int32")
-    # h1, h2 are bounded, h3 is free
-    var_dom = {
-        h2: tvm.ir.Range(begin=0, end=2),
-        h1: tvm.ir.Range(begin=0, end=5),
-    }
     analyzer = tvm.arith.Analyzer()
 
-    def do_test_point_access(point, predicates, expect):
+    def do_test_point_access(point, predicates, var_dom, expect):
         regions = tvm.arith.estimate_region_lower_bound(
             region=[
                 tvm.ir.Range.from_min_extent(min_value=point, extent=1),
@@ -237,29 +233,68 @@ def do_test_point_access(point, predicates, expect):
             assert regions is None
         else:
             assert len(regions) == 1
-            assert structural_equal(
-                analyzer.simplify(expect[0], 3), analyzer.simplify(regions[0].min_value, 3)
-            )
-            assert structural_equal(
-                analyzer.simplify(expect[1], 3), analyzer.simplify(regions[0].max_value, 3)
-            )
-
-    # normal case of a non-uniform tiling
+            for binding, expect_min, expect_max in expect:
+                min_diff = expect_min - regions[0].min_value
+                assert analyzer.simplify(tir.stmt_functor.substitute(min_diff, binding), 3) == 0
+                max_diff = expect_max - regions[0].max_value
+                assert analyzer.simplify(tir.stmt_functor.substitute(max_diff, binding), 3) == 0
+
+    # non-uniform tiling, single inner variable
     # h3 == 0: region is [1, 9]
     # 0 < h3 <= 26: region is [h3 * 8, h3 * 8 + 9]
     # h3 > 26: region is [h3 * 8, 223]
+    do_test_point_access(
+        point=h3 * 8 + h2,
+        predicates=[1 <= h3 * 8 + h2, h3 * 8 + h2 < 224],
+        var_dom={
+            h2: tvm.ir.Range(begin=0, end=10),
+        },
+        expect=[
+            (
+                {},
+                tvm.tir.max(h3 * 8, 1),
+                tvm.tir.max(h3 * 8, 1)
+                - tvm.tir.max(h3 * 8, 214)
+                - tvm.tir.max(1 - h3 * 8, 0)
+                + 223,
+            ),
+            ({h3: 0}, 1, 9),
+            ({h3: 10}, h3 * 8, h3 * 8 + 9),
+            ({h3: 27}, h3 * 8, 223),
+        ],
+    )
+
+    # non-uniform tiling, two inner variables
     do_test_point_access(
         point=h3 * 8 + h2 * 5 + h1,
         predicates=[1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h2 * 5 + h1 < 224],
-        expect=(
-            tvm.tir.max(h3 * 8, 1),
-            tvm.tir.max(h3 * 8, 1) - tvm.tir.max(h3 * 8, 214) - tvm.tir.max(1 - h3 * 8, 0) + 223,
-        ),
+        var_dom={
+            h2: tvm.ir.Range(begin=0, end=2),
+            h1: tvm.ir.Range(begin=0, end=5),
+        },
+        expect=[
+            (
+                {},
+                tvm.tir.max(h3 * 8, 1),
+                tvm.tir.max(h3 * 8, 1)
+                - tvm.tir.max(h3 * 8, 214)
+                - tvm.tir.max(1 - h3 * 8, 0)
+                + 223,
+            ),
+            ({h3: 0}, 1, 9),
+            ({h3: 10}, h3 * 8, h3 * 8 + 9),
+            ({h3: 27}, h3 * 8, 223),
+        ],
     )
+
     # should fail on incompatible predicates
     do_test_point_access(
         point=h3 * 8 + h2 * 5 + h1,
         predicates=[1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224],
+        var_dom={
+            h2: tvm.ir.Range(begin=0, end=2),
+            h1: tvm.ir.Range(begin=0, end=5),
+        },
         expect=None,
     )