[TensorIR][M2a] Compute-At #8943

Merged · 6 commits · Sep 9, 2021
Changes from 4 commits
11 changes: 9 additions & 2 deletions include/tvm/arith/int_set.h
@@ -121,17 +121,24 @@ class IntSet : public ObjectRef {
* \return The result set containing the indices in the vector.
*/
static IntSet Vector(PrimExpr vec);
/*!
* \brief Construct a set representing a range [min, min + extent).
* \param min The minimum of the range.
* \param extent The extent of the range.
* \return The constructed set.
*/
static IntSet FromMinExtent(PrimExpr min, PrimExpr extent);
/*!
* \brief Construct a set representing a range.
* \param r The range
* \return constructed set.
* \return The constructed set.
*/
static IntSet FromRange(tvm::Range r);
/*!
* \brief Construct a set representing an interval.
* \param min The minimum value of the interval.
* \param max The maximum value of the interval.
* \return constructed set.
* \return The constructed set.
*/
static IntSet Interval(PrimExpr min, PrimExpr max);

36 changes: 36 additions & 0 deletions include/tvm/tir/schedule/schedule.h
@@ -305,6 +305,42 @@ class ScheduleNode : public runtime::Object {
virtual BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index,
const String& storage_scope) = 0;
/******** Schedule: Compute location ********/
/*!
* \brief Move a producer block under the specific loop, and regenerate the loops induced by the
* block so that the buffer region produced by the producer block can cover those regions
* consumed by its consumer blocks under the given loop. It requires:
* 1) `block` and `loop` are under the same scope, and `loop` is not an ancestor of `block`
* 2) The scope block has stage-pipeline property
* 3) The subtree of the scope block that contains the given block satisfies the compact dataflow
* condition, i.e. all the blocks in that subtree must be either a complete block or a reduction
* block
* 4) The block is not an output block, i.e. the buffer regions written by the block are allocated
* under the current scope
* 5) All the consumers of the block are under the given loop
*
* \param block_rv The block to be moved
* \param loop_rv The loop under which the block is to be moved
* \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
*/
virtual void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
bool preserve_unit_loops) = 0;
/*!
* \brief Move a consumer block under the specific loop, and regenerate the loops induced by the
* block so that the buffer region consumed by the consumer block can cover those regions
* produced by its producer blocks under the given loop. It requires:
* 1) `block` and `loop` are under the same scope, and `loop` is not an ancestor of `block`
* 2) The scope block has stage-pipeline property
* 3) The subtree of the scope block that contains the given block satisfies the compact dataflow
* condition, i.e. all the blocks in that subtree must be either a complete block or a reduction
* block
* 4) All the producers of the block are under the given loop
*
* \param block_rv The block to be moved
* \param loop_rv The loop under which the block is to be moved
* \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1
*/
virtual void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv,
bool preserve_unit_loops) = 0;
/*!
* \brief Inline a block into its consumer(s). It requires:
* 1) The block is a complete non-root block, which only produces one buffer
5 changes: 0 additions & 5 deletions include/tvm/tir/schedule/state.h
@@ -128,11 +128,6 @@ class ScheduleStateNode : public Object {
*/
TVM_DLL void Replace(const tir::StmtSRef& src_sref, const Stmt& tgt_stmt,
const Map<Block, Block>& block_sref_reuse);
/*!
* \brief Recalculate the `affine_binding` flag of the scope block info.
* \param scope_sref The sref to the interested scope block.
*/
TVM_DLL void UpdateAffineFlag(const StmtSRef& scope_sref);
/*!
* \brief Trigger the verification according to the `debug_mask` bitmask.
* 1) If the bitmask `kVerifySRefTree` is on, verify the correctness of the sref tree.
186 changes: 184 additions & 2 deletions python/tvm/tir/schedule/schedule.py
@@ -927,6 +927,183 @@ def after_cache_write(a: ty.handle, b: ty.handle) -> None:

########## Schedule: Compute location ##########

def compute_at(
self,
block: BlockRV,
loop: LoopRV,
preserve_unit_loops: bool = False,
) -> None:
"""Compute-At. Move a producer block under the specific loop, and regenerate the loops
induced by the block so that the buffer region generated by the producer block could cover
those regions written by the producers. It requires:

1) `block` and `loop` are under the same scope, and `loop` is not an ancestor of `block`

2) The scope block has stage-pipeline property

3) The subtree of the scope block that contains the given block satisfies the compact
dataflow condition, i.e. all the blocks in that subtree must be either a complete block
or a reduction block

4) The block is not an output block, i.e. the buffer regions written by the block are
allocated under the current scope

5) All the consumers of the block are under the given loop

Parameters
----------
block : BlockRV
The block to be moved

loop: LoopRV
The loop under which the block is to be moved

preserve_unit_loops: bool
Whether to keep the trivial loops whose extents are 1

Examples
--------

Before compute-at, in TensorIR, the IR is:

.. code-block:: python

@tvm.script.tir
def before_compute_at(a: ty.handle, c: ty.handle) -> None:
A = tir.match_buffer(a, (128, 128), "float32")
B = tir.alloc_buffer((128, 128), "float32")
C = tir.match_buffer(c, (128, 128), "float32")
with tir.block([128, 128], "B") as [vi, vj]:
B[vi, vj] = A[vi, vj] * 2.0
with tir.block([128, 128], "C") as [vi, vj]:
C[vi, vj] = B[vi, vj] + 1.0

Create the schedule and do compute-at:

.. code-block:: python

sch = tir.Schedule(before_compute_at)
block = sch.get_block("B")
loop, _ = sch.get_loops(sch.get_block("C"))
sch.compute_at(block, loop, preserve_unit_loops=False)
print(tvm.script.asscript(sch.mod["main"]))

After applying compute-at, the IR becomes:

.. code-block:: python

@tvm.script.tir
def after_compute_at(a: ty.handle, c: ty.handle) -> None:
A = tir.match_buffer(a, (128, 128), "float32")
B = tir.alloc_buffer((128, 128), "float32")
C = tir.match_buffer(c, (128, 128), "float32")
for i in tir.serial(0, 128):
for j in tir.serial(0, 128):
with tir.block([128, 128], "B") as [vi, vj]:
tir.bind(vi, i)
tir.bind(vj, j)
B[vi, vj] = A[vi, vj] * 2.0
for j in tir.serial(0, 128):
with tir.block([128, 128], "C") as [vi, vj]:
tir.bind(vi, i)
tir.bind(vj, j)
C[vi, vj] = B[vi, vj] + 1.0

"""
_ffi_api.ScheduleComputeAt( # type: ignore # pylint: disable=no-member
self,
block,
loop,
preserve_unit_loops,
)

def reverse_compute_at(
self,
block: BlockRV,
loop: LoopRV,
preserve_unit_loops: bool = False,
) -> None:
"""Reverse-Compute-At. Move a consumer block under the specific loop, and regenerate
the loops induced by the block so that the buffer region generated by the consumer
block could cover those regions read by the consumers. It requires:

1) `block` and `loop` are under the same scope, and `loop` is not an ancestor of `block`

2) The scope block has stage-pipeline property

3) The subtree of the scope block that contains the given block satisfies the compact
dataflow condition, i.e. all the blocks in that subtree must be either a complete block
or a reduction block

4) All the producers of the block are under the given loop

Parameters
----------
block : BlockRV
The block to be moved

loop: LoopRV
The loop under which the block is to be moved

preserve_unit_loops: bool
Whether to keep the trivial loops whose extents are 1

Examples
--------

Before reverse-compute-at, in TensorIR, the IR is:

.. code-block:: python

@tvm.script.tir
def before_reverse_compute_at(a: ty.handle, c: ty.handle) -> None:
A = tir.match_buffer(a, (128, 128), "float32")
B = tir.alloc_buffer((128, 128), "float32")
C = tir.match_buffer(c, (128, 128), "float32")
with tir.block([128, 128], "B") as [vi, vj]:
B[vi, vj] = A[vi, vj] * 2.0
with tir.block([128, 128], "C") as [vi, vj]:
C[vi, vj] = B[vi, vj] + 1.0

Create the schedule and do reverse-compute-at:

.. code-block:: python

sch = tir.Schedule(before_reverse_compute_at)
block = sch.get_block("C")
loop, _ = sch.get_loops(sch.get_block("B"))
sch.reverse_compute_at(block, loop, preserve_unit_loops=False)
print(tvm.script.asscript(sch.mod["main"]))

After applying reverse-compute-at, the IR becomes:

.. code-block:: python

@tvm.script.tir
def after_reverse_compute_at(a: ty.handle, c: ty.handle) -> None:
A = tir.match_buffer(a, (128, 128), "float32")
B = tir.alloc_buffer((128, 128), "float32")
C = tir.match_buffer(c, (128, 128), "float32")
for i in tir.serial(0, 128):
for j in tir.serial(0, 128):
with tir.block([128, 128], "B") as [vi, vj]:
tir.bind(vi, i)
tir.bind(vj, j)
B[vi, vj] = A[vi, vj] * 2.0
for j in tir.serial(0, 128):
with tir.block([128, 128], "C") as [vi, vj]:
tir.bind(vi, i)
tir.bind(vj, j)
C[vi, vj] = B[vi, vj] + 1.0

"""
_ffi_api.ScheduleReverseComputeAt( # type: ignore # pylint: disable=no-member
self,
block,
loop,
preserve_unit_loops,
)

def compute_inline(self, block: BlockRV) -> None:
"""Inline a block into its consumer(s). It requires:

@@ -1189,10 +1366,15 @@ def after_rfactor(a: ty.handle, b: ty.handle) -> None:
"""
return _ffi_api.ScheduleRFactor(self, loop, factor_axis) # type: ignore # pylint: disable=no-member

######## Schedule: Block annotatoin ########
######## Schedule: Block annotation ########

def storage_align( # pylint: disable=too-many-arguments
self, block: BlockRV, buffer_index: int, axis: int, factor: int, offset: int
self,
block: BlockRV,
buffer_index: int,
axis: int,
factor: int,
offset: int,
) -> None:
"""Set alignment requirement for specific dimension such that
stride[axis] == k * factor + offset for some k. This is useful to set memory layout for more
30 changes: 18 additions & 12 deletions src/arith/int_set.cc
@@ -607,6 +607,13 @@ inline bool ProveEqual(Analyzer* analyzer, PrimExpr lhs, PrimExpr rhs) {
return is_zero(analyzer->Simplify(lhs - rhs));
}

IntSet IntSet::FromMinExtent(PrimExpr min, PrimExpr extent) {
if (is_one(extent)) {
return IntSet::SinglePoint(min);
}
return IntervalSet(min, extent + min - 1);
}
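For intuition, the semantics of the new constructor can be modeled in a few lines of plain Python (an illustrative sketch; the function name and `(lo, hi)` tuple representation are hypothetical, not TVM bindings): `FromMinExtent(min, extent)` denotes the half-open range `[min, min + extent)`, which collapses to a single point when the extent is one, exactly as the `is_one(extent)` branch above does.

```python
# Illustrative pure-Python model of IntSet::FromMinExtent; not TVM API.
def from_min_extent(min_value: int, extent: int) -> tuple:
    """Model [min, min + extent) as a closed (lo, hi) interval pair."""
    if extent == 1:
        return (min_value, min_value)  # single-point set, as in the C++ branch
    return (min_value, min_value + extent - 1)

assert from_min_extent(4, 1) == (4, 4)   # extent 1: the single point {4}
assert from_min_extent(4, 8) == (4, 11)  # [4, 12) as the closed interval [4, 11]
```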

IntSet IntSet::FromRange(Range r) {
// must make sure it can be matched back by MatchRange.
if (is_one(r->extent)) {
@@ -815,46 +822,45 @@ IntSet EvalSet(Range r, const Map<IterVar, IntSet>& dom_map) {
return EvalSet(r, ConvertDomMap(dom_map));
}

Optional<Array<arith::IntSet>> EstimateRegionLowerBound(const Array<Range>& region,
const Map<Var, Range>& var_dom,
const PrimExpr& predicate,
arith::Analyzer* analyzer) {
Optional<Array<IntSet>> EstimateRegionLowerBound(const Array<Range>& region,
const Map<Var, Range>& var_dom,
const PrimExpr& predicate, Analyzer* analyzer) {
int ndim = region.size();
Array<arith::IterSumExpr> iter_sum_exprs{nullptr};
Array<IterSumExpr> iter_sum_exprs{nullptr};
{
Array<PrimExpr> affine_indices;
affine_indices.reserve(ndim);
for (const Range& range : region) {
affine_indices.push_back(range->min);
}
iter_sum_exprs = arith::DetectIterMap(
iter_sum_exprs = DetectIterMap(
/*indices=*/affine_indices, /*input_iters=*/var_dom,
/*predicate=*/predicate, /*require_bijective=*/false, analyzer);
}
if (iter_sum_exprs.empty()) {
return NullOpt;
}
ICHECK_EQ(iter_sum_exprs.size(), ndim);
Array<arith::IntSet> result;
Array<IntSet> result;
result.reserve(ndim);
for (int i = 0; i < ndim; ++i) {
const arith::IterSumExpr& sum_expr = iter_sum_exprs[i];
const IterSumExpr& sum_expr = iter_sum_exprs[i];
const Range& range = region[i];
if (sum_expr->args.empty()) {
result.push_back(arith::IntSet::Interval(sum_expr->base, sum_expr->base + range->extent));
result.push_back(IntSet::FromMinExtent(sum_expr->base, range->extent));
continue;
}
ICHECK_EQ(sum_expr->args.size(), 1);
const arith::IterSplitExpr& split = sum_expr->args[0];
const IterSplitExpr& split = sum_expr->args[0];
if (!analyzer->CanProve(range->extent >= split->scale)) {
return NullOpt;
}
const PrimExpr& base = sum_expr->base;
// IterSplitExpr: (source // lower_factor) % extent * scale
// where `(source // lower_factor) % extent` is within [0, extent - 1]
// Therefore, the range of `region[i]->min` is `base + [0, (extent - 1) * scale]`
result.push_back(arith::IntSet::Interval(
base, split->extent * split->scale + base + (range->extent - split->scale) - 1));
result.push_back(
IntSet::FromMinExtent(base, split->extent * split->scale + (range->extent - split->scale)));
}
return result;
}
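As a sanity check of the interval arithmetic in the comment above, here is a standalone brute-force sketch in plain Python (the concrete constants are illustrative assumptions, not taken from the PR): it enumerates the `IterSplitExpr` pattern `(source // lower_factor) % extent * scale`, collects every index touched by a region of extent `range_extent` starting at each resulting minimum, and confirms the whole footprint stays inside `FromMinExtent(base, extent * scale + range_extent - scale)`.

```python
# Brute-force check of the bound used above; constants are illustrative.
lower_factor, extent, scale = 4, 8, 2
base, range_extent = 10, 3   # region[i] = [min, min + range_extent)
assert range_extent >= scale  # mirrors the CanProve(range->extent >= split->scale) guard

touched = set()
for source in range(lower_factor * extent * 2):
    lo = base + (source // lower_factor) % extent * scale  # base + IterSplitExpr
    touched.update(range(lo, lo + range_extent))

# Claimed cover: [base, base + extent * scale + range_extent - scale - 1]
hi = base + extent * scale + range_extent - scale - 1
assert min(touched) == base and max(touched) <= hi
print("bound verified:", (min(touched), max(touched)), "within", (base, hi))
```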