Commit 90e7d89

Squashed commit of the following (all commits by Masahiro Masuda <[email protected]> unless noted otherwise):

f499e60  Fri Apr 15 04:11:02 2022 +0900  Squashed commit of the following:
dcb628d  Thu Apr 14 17:10:27 2022 +0900  Squashed commit of the following:
dd956ec  Thu Apr 14 16:53:34 2022 +0900  add conv2d relay test
7291e47  Thu Apr 14 16:46:05 2022 +0900  add dense and bmm test
a957dde  Thu Apr 14 16:32:43 2022 +0900  conv2d topi test working
6d53c50  Thu Apr 14 11:33:38 2022 +0900  add mattr kind
3761bd7  Thu Apr 14 11:12:14 2022 +0900  update dot prod intrin
e781ee1  Thu Apr 14 11:02:43 2022 +0900  black
b2208a7  Thu Apr 14 10:58:10 2022 +0900  cleanup
f8bc306  Thu Apr 14 10:35:02 2022 +0900  [ROCM] Support dp4a on AMDGPU by sdot4 intrinsic
0225f2b  Thu Apr 14 08:56:10 2022 +0900  share op strategy between cuda and rocm
762c7e8  Thu Apr 14 08:28:34 2022 +0900  fixed rocm batch_matmul strategy for mixed i8i8i32
ce53e8d  Thu Apr 14 06:17:30 2022 +0900  add rocm sdot4 TIR intrin
f4562b9  Thu Apr 14 06:03:44 2022 +0900  rocm sdot4 works
6cc6280  Thu Apr 14 05:32:07 2022 +0900  more wip
0602f4a  Thu Apr 14 03:47:37 2022 +0900  Squashed commit of the following:
65b8bcf  Wed Apr 13 20:36:49 2022 +0900  [WIP] adding DP4A support to rocm
4f8f308  Wed Apr 13 14:03:25 2022 +0900  Squashed commit of the following:
1711be3  Wed Apr 13 13:11:40 2022 +0900  fixed condition for real
8a48fb5  Wed Apr 13 09:57:42 2022 +0900  Revert "Skip applying sch_rule when both ann and sch_rule are defined" (reverts commit 4915c6a)
daea033  Mon Apr 11 09:31:05 2022 +0900  [Metaschedule] Support rocm and spirv
eb0cae2  Wed Apr 13 07:25:04 2022 +0900  dp4a works
4915c6a  Wed Apr 13 06:13:45 2022 +0900  Skip applying sch_rule when both ann and sch_rule are defined
7b3d71c  Wed Apr 13 04:40:31 2022 +0900  fixed intrin description
7666cd7  Tue Apr 12 19:59:47 2022 +0900  add DP4A intrin
7086bdb  Tue Apr 12 19:03:44 2022 +0900  works
db34397  Tue Apr 12 12:49:52 2022 +0900  more hack to tensorize loop mapping to make resnet50 e2e work
2409674  Mon Apr 11 13:40:59 2022 +0900  wip support pad + qnn.conv2d folding
613cb7e  Sun Apr 10 12:04:08 2022 +0900  hack to tensorize loop mapping to make conv2d work
9e4f9df  Sun Apr 10 11:34:13 2022 +0900  wrap tensorize with try/catch
d4b496d  Sun Apr 10 11:33:39 2022 +0900  revert change in task_scheduler.cc
476129b  Sat Apr 9 05:54:10 2022 +0900  try / catch in ThreadedApply
d8226ff  Fri Apr 8 17:17:59 2022 +0900  filter out invalid candidate
2632899  Fri Apr 8 10:09:48 2022 +0900  try graceful exit in parallel_for_dynamic
9d6741c  Fri Apr 8 09:35:51 2022 +0900  [QNN] Fix broadcast for invalid axis
6ccde09  Thu Apr 7 20:51:15 2022 +0900  refactor rewrite_tensorize
2ce2066  Thu Apr 7 20:48:17 2022 +0900  allow missing schedule_rule in post order apply
3a69353  Thu Apr 7 19:42:48 2022 +0900  refactor rewrite_tensorize
43e0b2f  Thu Apr 7 18:25:14 2022 +0900  rewrite_vnni -> rewrite_tensorize
823797e  Thu Apr 7 18:12:12 2022 +0900  VNNI -> WithIntrin
4284a47  Thu Apr 7 17:45:41 2022 +0900  introduce TileForIntrin
b87ef32  Thu Apr 7 17:34:04 2022 +0900  move TilingwithTensorIntrin to auto_tensorize.cc
2fc118b  Thu Apr 7 17:28:45 2022 +0900  clean up headers
d8b2aa3  Thu Apr 7 17:09:32 2022 +0900  clean up using namespace
eb05d25  Thu Apr 7 17:03:05 2022 +0900  refactored init
5e6b0a0  Thu Apr 7 16:57:14 2022 +0900  compiled
2b8c430  Thu Apr 7 12:51:55 2022 +0900  wip MultiLevelTiling refactor
7c21a9f  Thu Apr 7 11:58:33 2022 +0900  function doc string not supported by tvmscript
40f9742  Thu Apr 7 11:56:45 2022 +0900  update vnni intrin name
4814f82  Thu Apr 7 11:44:47 2022 +0900  Merge branch 'tir-tensor-intrin' into auto-tensorize-vnni (merge: e0c5eb8 07bbb38)
07bbb38  Thu Apr 7 11:24:56 2022 +0900  more lint fix
15e60b4  Thu Apr 7 11:16:08 2022 +0900  black
7a757fe  Thu Apr 7 11:12:54 2022 +0900  pylint
9a3e508  Thu Apr 7 10:58:52 2022 +0900  simplify import
d8e43ec  Thu Apr 7 10:52:50 2022 +0900  use vectorlow/high in arm intrin
625cd27  Thu Apr 7 10:34:57 2022 +0900  fixed offset factor
69e72b6  Thu Apr 7 10:12:02 2022 +0900  Add ARM intrin
1351fde  Thu Apr 7 08:27:27 2022 +0900  use buffer syntax sugar
0ced85f  Thu Apr 7 08:17:43 2022 +0900  rename vnni.py to x86.py
38a5aca  Thu Apr 7 07:24:44 2022 +0900  add VNNI unittest
88b763e  Thu Apr 7 07:10:06 2022 +0900  refactored existing test using VNNI intrin
711a007  Thu Apr 7 07:04:58 2022 +0900  [TIR] Add VNNI dot product intrinsic for TIR
e0c5eb8  Thu Apr 7 11:42:26 2022 +0900  merge fix
b171748  Thu Apr 7 11:33:59 2022 +0900  Merge branch 'tir-tensor-intrin' into auto-tensorize-vnni (merge: 71fe3bd 82e152a)
71fe3bd  Thu Apr 7 06:57:38 2022 +0900  move tensor intrin under tir
0c51bad  Thu Apr 7 06:12:39 2022 +0900  remove log
fed910e  Thu Apr 7 06:11:22 2022 +0900  more revert
7150aff  Thu Apr 7 06:10:44 2022 +0900  revert stmt_functor change
155107b  Thu Apr 7 06:10:09 2022 +0900  refactored RewriteVNNI a bit
ca15255  Thu Apr 7 05:41:13 2022 +0900  add RewriteVNNI
dc9f71d  Thu Apr 7 05:38:56 2022 +0900  vectorized init loop
fcc31ee  Thu Apr 7 04:55:36 2022 +0900  tensorize worked
2b53437  Wed Apr 6 19:11:05 2022 +0900  TilingwithTensorIntrin works
86baa31  Wed Apr 6 08:58:27 2022 +0900  Ported auto-tensorization code
82e152a  Thu Apr 7 11:24:56 2022 +0900  more lint fix
88d9bdd  Thu Apr 7 11:16:08 2022 +0900  black
31fe7eb  Thu Apr 7 11:12:54 2022 +0900  pylint
7876754  Thu Apr 7 10:58:52 2022 +0900  simplify import
56f2e9a  Thu Apr 7 10:52:50 2022 +0900  use vectorlow/high in arm intrin
995cc8d  Thu Apr 7 10:34:57 2022 +0900  fixed offset factor
86bbd49  Thu Apr 7 10:12:02 2022 +0900  Add ARM intrin
120fd96  Thu Apr 7 08:27:27 2022 +0900  use buffer syntax sugar
0f0682d  Thu Apr 7 08:17:43 2022 +0900  rename vnni.py to x86.py
f88c31e  Thu Apr 7 07:24:44 2022 +0900  add VNNI unittest
6cc8009  Thu Apr 7 07:10:06 2022 +0900  refactored existing test using VNNI intrin
11a29c7  Thu Apr 7 07:04:58 2022 +0900  [TIR] Add VNNI dot product intrinsic for TIR
e370ed4  Wed Apr 13 15:19:41 2022 -0700  [Hexagon] Less aggressive adb state clean up (apache#10909) (Author: Chris Sullivan <[email protected]>)
         * Only remove port forwarding applied in a session to avoid affecting global adb state.
         * Send SIGINT to attempt to allow remote server to clean up and unbind port on destruction.
         * Only attempt to forward ports not in use by adb or the system.
ce8f83e  Wed Apr 13 16:25:39 2022 -0400  [hexagon] 'add_hvx' test to explore HVX usage. (apache#10604) (Author: Christian Convey <[email protected]>)
         Add a unit test named 'add_hvx' to explore how various scheduling choices, tensor sizes, etc. impact efficient usage of Hexagon HVX units.
0602f4a  Thu Apr 14 03:47:37 2022 +0900  Squashed commit of the following: (message body identical to the 0602f4a entry above, 65b8bcf through 11a29c7)
1 parent b99a096 commit 90e7d89

File tree

15 files changed: +611 -23 lines changed

include/tvm/meta_schedule/schedule_rule.h

Lines changed: 10 additions & 0 deletions
@@ -150,6 +150,16 @@ class ScheduleRule : public runtime::ObjectRef {
       Optional<Array<Integer>> vector_load_lens,    //
       Optional<Map<String, ObjectRef>> reuse_read,  //
       Optional<Map<String, ObjectRef>> reuse_write);
+
+  TVM_DLL static ScheduleRule MultiLevelTilingWithIntrin(
+      String intrin_name,                           //
+      String structure,                             //
+      Optional<Array<String>> tile_binds,           //
+      Optional<Integer> max_innermost_factor,       //
+      Optional<Array<Integer>> vector_load_lens,    //
+      Optional<Map<String, ObjectRef>> reuse_read,  //
+      Optional<Map<String, ObjectRef>> reuse_write);
+
   /*!
    * \brief Create a rule: add-rfactor to some blocks if needed
    * \param max_jobs_per_core The maximum number of jobs to be launched per CPU core. It sets the
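The new constructor mirrors MultiLevelTiling's signature with an extra leading intrin_name. Like every schedule rule, it is reachable from Python through the FFI registry; the MultiLevelTilingWithIntrin wrapper further down resolves it by name. A minimal sketch of that linkage; the global-function name is inferred from the usual "meta_schedule" FFI prefix convention, so treat it as an assumption:

    # Sketch: resolve the registered C++ constructor directly; the Python
    # class shown later in this commit does the same thing via _ffi_api.
    import tvm

    create_rule = tvm.get_global_func("meta_schedule.ScheduleRuleMultiLevelTilingWithIntrin")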

include/tvm/tir/stmt.h

Lines changed: 5 additions & 0 deletions
@@ -1509,6 +1509,11 @@ constexpr const char* meta_schedule_unroll_explicit = "meta_schedule.unroll_expl
 /*! \brief Mark auto-unroll setting on the block. */
 constexpr const char* meta_schedule_unroll_implicit = "meta_schedule.unroll_implicit";
 
+/*!
+ * \brief Mark that the block should be further rewritten using tensorization.
+ */
+constexpr const char* meta_schedule_auto_tensorize = "meta_schedule.auto_tensorize";
+
 /*!
  * \brief Check if attr_key is a pragma key extension
  * \param attr_key The attr key to be compared
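This attribute key is the handshake between the two halves of the change: TileForIntrin (further down) annotates the blockized outer block with it, and the RewriteTensorize postprocessor later looks it up, removes it, and tensorizes the block. A toy sketch of the same annotate/unannotate round trip on a plain TIR schedule; the PrimFunc and the intrinsic name are made up for illustration:

    # Sketch: attach and consume the annotation the way the pass does.
    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def add_one(A: T.Buffer[(16,), "float32"], B: T.Buffer[(16,), "float32"]) -> None:
        for i in T.serial(16):
            with T.block("B"):
                vi = T.axis.spatial(16, i)
                B[vi] = A[vi] + 1.0

    sch = tvm.tir.Schedule(add_one)
    block = sch.get_block("B")
    sch.annotate(block, "meta_schedule.auto_tensorize", "placeholder_intrin")
    sch.unannotate(block, "meta_schedule.auto_tensorize")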

python/tvm/meta_schedule/postproc/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -22,3 +22,4 @@
 from .rewrite_reduction_block import RewriteReductionBlock
 from .rewrite_unbound_block import RewriteUnboundBlock
 from .verify_gpu_code import VerifyGPUCode
+from .rewrite_tensorize import RewriteTensorize

python/tvm/meta_schedule/postproc/rewrite_tensorize.py

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A postprocessor that tensorizes annotated blocks."""
+
+from tvm._ffi.registry import register_object
+from .. import _ffi_api
+from .postproc import Postproc
+import tvm.tir.tensor_intrin  # imported for the side effect of registering tensor intrinsics
+
+
+@register_object("meta_schedule.RewriteTensorize")
+class RewriteTensorize(Postproc):
+    """A postprocessor that tensorizes annotated blocks."""
+
+    def __init__(self, vectorize_init_loop=False) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.PostprocRewriteTensorize,  # type: ignore # pylint: disable=no-member
+            vectorize_init_loop,
+        )
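The vectorize_init_loop flag controls the fallback for reduction init blocks: when an "init" block carries the annotation but is not itself tensorized, the C++ side (rewrite_tensorize.cc below) vectorizes its single inner loop instead. Constructing the postprocessor is a one-liner; a minimal sketch, assuming a TVM build that contains this commit:

    # Sketch: create the postproc, to be passed to a tuning context
    # alongside the other rewriting postprocs.
    from tvm.meta_schedule.postproc import RewriteTensorize

    postproc = RewriteTensorize(vectorize_init_loop=True)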

python/tvm/meta_schedule/schedule_rule/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 from .add_rfactor import AddRFactor
 from .auto_inline import AutoInline
 from .cross_thread_reduction import CrossThreadReduction
-from .multi_level_tiling import MultiLevelTiling, ReuseType
+from .multi_level_tiling import MultiLevelTiling, MultiLevelTilingWithIntrin, ReuseType
 from .parallel_vectorize_unroll import ParallelizeVectorizeUnroll
 from .random_compute_location import RandomComputeLocation
 from .schedule_rule import PyScheduleRule, ScheduleRule

python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py

Lines changed: 47 additions & 0 deletions
@@ -82,3 +82,50 @@ def __init__(
             reuse_read.as_dict() if reuse_read is not None else None,
             reuse_write.as_dict() if reuse_write is not None else None,
         )
+
+
+@register_object("meta_schedule.MultiLevelTilingWithIntrin")
+class MultiLevelTilingWithIntrin(ScheduleRule):
+    """Multi-level tiling with reuse, for blocks that match a tensor intrinsic.
+
+    Parameters
+    ----------
+    structure : str
+        The tiling structure. Recommended:
+        - 'SSRSRS' on CPU
+        - 'SSSRRSRS' on GPU
+    tile_binds : Optional[List[str]]
+        For each level of tiles, which thread axis it is bound to. Recommended:
+        - None on CPU
+        - [blockIdx.x, vthread.x, threadIdx.x] on GPU
+    max_innermost_factor : Optional[int]
+        The maximum size of the innermost factor. None means no limit.
+    vector_load_lens : Optional[List[int]]
+        The length of vector lane in vectorized cooperative fetching.
+        None means disable vectorization.
+    reuse_read : Optional[ReuseType]
+        Data reuse configuration for reading. None means no reuse.
+    reuse_write : Optional[ReuseType]
+        Data reuse configuration for writing. None means no reuse.
+    """
+
+    def __init__(
+        self,
+        intrin_name: str,
+        structure: str,
+        tile_binds: Optional[List[str]] = None,
+        max_innermost_factor: Optional[int] = None,
+        vector_load_lens: Optional[List[int]] = None,
+        reuse_read: Optional[ReuseType] = None,
+        reuse_write: Optional[ReuseType] = None,
+    ) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.ScheduleRuleMultiLevelTilingWithIntrin,  # type: ignore # pylint: disable=no-member
+            intrin_name,
+            structure,
+            tile_binds,
+            max_innermost_factor,
+            vector_load_lens,
+            reuse_read.as_dict() if reuse_read is not None else None,
+            reuse_write.as_dict() if reuse_write is not None else None,
+        )
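A minimal construction sketch. The intrinsic must already be registered with tir.TensorIntrin; the name below is a placeholder, not one this diff registers:

    # Sketch: instantiate the rule for a CPU dot-product intrinsic.
    from tvm.meta_schedule.schedule_rule import MultiLevelTilingWithIntrin

    rule = MultiLevelTilingWithIntrin(
        intrin_name="placeholder_dot_product_intrin",  # assumed registered elsewhere
        structure="SSRSRS",  # recommended CPU tiling structure
        max_innermost_factor=64,
    )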

src/meta_schedule/postproc/rewrite_tensorize.cc

Lines changed: 104 additions & 0 deletions

@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <algorithm>
+
+#include "../utils.h"
+#include "tvm/runtime/container/base.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using tir::BlockRV;
+using tir::LoopRV;
+
+void ApplyTensorization(const tir::Schedule& sch, const String& func_name,
+                        const tir::PrimFuncNode* func, bool vectorize_init_loop) {
+  std::vector<std::pair<std::string, std::function<void(tir::BlockRV)>>> jobs;
+
+  tir::PostOrderVisit(func->body, [=, &jobs](const ObjectRef& obj) -> bool {
+    if (const auto* block = obj.as<tir::BlockNode>()) {
+      tir::StmtSRef block_sref = sch->GetSRef(block);
+      if (Optional<String> intrin_name =
+              tir::GetAnn<String>(block_sref, tir::attr::meta_schedule_auto_tensorize)) {
+        std::string block_name = block_sref->StmtAs<tir::BlockNode>()->name_hint;
+        if (block_name.find("init") == std::string::npos) {
+          jobs.emplace_back(block_name, [sch, intrin_name](tir::BlockRV block) {
+            try {
+              sch->Tensorize(block, intrin_name.value());
+            } catch (const std::exception& e) {
+              LOG(WARNING) << "Tensorize failed with error " << e.what();
+            }
+          });
+        } else if (vectorize_init_loop) {
+          jobs.emplace_back(block_name, [sch](tir::BlockRV block) {
+            Array<BlockRV> child_blocks = sch->GetChildBlocks(block);
+            ICHECK(child_blocks.size() == 1);
+            Array<LoopRV> init_loops = sch->GetLoops(child_blocks[0]);
+            ICHECK(init_loops.size() == 1);
+            sch->Vectorize(init_loops[0]);
+          });
+        }
+      }
+    }
+    return true;
+  });
+
+  for (auto kv : jobs) {
+    tir::BlockRV block = sch->GetBlock(kv.first, func_name);
+    sch->Unannotate(block, tir::attr::meta_schedule_auto_tensorize);
+    kv.second(block);
+  }
+}
+
+class RewriteTensorizeNode : public PostprocNode {
+ public:
+  void InitializeWithTuneContext(const TuneContext& context) final {}
+
+  bool Apply(const tir::Schedule& sch) final;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {}
+
+  bool vectorize_init_loop = false;
+
+  static constexpr const char* _type_key = "meta_schedule.RewriteTensorize";
+  TVM_DECLARE_FINAL_OBJECT_INFO(RewriteTensorizeNode, PostprocNode);
+};
+
+bool RewriteTensorizeNode::Apply(const tir::Schedule& sch) {
+  for (const auto& kv : sch->mod()->functions) {
+    GlobalVar g_var = kv.first;
+    BaseFunc base_func = kv.second;
+    if (const tir::PrimFuncNode* prim_func = base_func.as<tir::PrimFuncNode>()) {
+      ApplyTensorization(sch, g_var->name_hint, prim_func, vectorize_init_loop);
+    }
+  }
+  return true;
+}
+
+Postproc RewriteTensorize(bool vectorize_init_loop) {
+  ObjectPtr<RewriteTensorizeNode> n = make_object<RewriteTensorizeNode>();
+  n->vectorize_init_loop = vectorize_init_loop;
+  return Postproc(n);
+}
+
+TVM_REGISTER_NODE_TYPE(RewriteTensorizeNode);
+TVM_REGISTER_GLOBAL("meta_schedule.PostprocRewriteTensorize").set_body_typed(RewriteTensorize);
+
+}  // namespace meta_schedule
+}  // namespace tvm
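Note the two-phase structure of ApplyTensorization: jobs are only collected during PostOrderVisit and applied afterwards, since Tensorize and Vectorize mutate the schedule and would otherwise invalidate the traversal of func->body. The try/catch around Tensorize deliberately downgrades a failed tensorization to a warning, so a single bad candidate does not abort the entire postprocessing run (this matches the "wrap tensorize with try/catch" commit in the squash log above).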

src/meta_schedule/schedule_rule/auto_tensorize.cc

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "auto_tensorize.h"
+
+#include "../../tir/schedule/analysis.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using tir::LoopRV;
+
+Optional<LoopRV> TilingwithTensorIntrin(const tir::Schedule& sch, const tir::BlockRV& block_rv,
+                                        const String& intrin_name) {
+  Optional<tir::TensorizeInfo> opt_tensorize_info = GetTensorizeLoopMapping(
+      sch->state(), sch->GetSRef(block_rv), tir::TensorIntrin::Get(intrin_name)->desc);
+  if (!opt_tensorize_info) return NullOpt;
+  const tir::TensorizeInfoNode* info = opt_tensorize_info.value().get();
+  // Construct a mapping from tir loops back to LoopRVs
+  Map<tir::StmtSRef, LoopRV> loop2rv;
+  {
+    Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
+    for (const LoopRV& loop_rv : loop_rvs) {
+      loop2rv.Set(sch->GetSRef(loop_rv), loop_rv);
+    }
+  }
+  // Split the loops
+  arith::Analyzer analyzer;
+  std::unordered_set<const tir::StmtSRefNode*> inner_loops;
+  std::vector<LoopRV> reorder_suffix;
+  reorder_suffix.resize(info->loop_map.size());
+  for (const auto& kv : info->loop_map) {
+    // Extract mapping (block_loop => desc_loop)
+    const tir::StmtSRef& block_loop_sref = kv.first;
+    const tir::ForNode* block_loop = block_loop_sref->StmtAs<tir::ForNode>();
+    const tir::ForNode* desc_loop = kv.second.get();
+    ICHECK(block_loop != nullptr && desc_loop != nullptr);
+    // Extract the loop extent
+    PrimExpr block_extent = analyzer.Simplify(block_loop->extent);
+    PrimExpr desc_extent = analyzer.Simplify(desc_loop->extent);
+    const auto* int_block_extent = block_extent.as<IntImmNode>();
+    const auto* int_desc_extent = desc_extent.as<IntImmNode>();
+    ICHECK(int_block_extent != nullptr && int_desc_extent != nullptr);
+    // Check divisibility
+    int64_t total = int_block_extent->value;
+    int64_t inner = int_desc_extent->value;
+    ICHECK_EQ(total % inner, 0);
+    int64_t outer = int_block_extent->value / int_desc_extent->value;
+    // Do the split
+    Array<LoopRV> split = sch->Split(loop2rv.at(block_loop_sref), {Integer(outer), Integer(inner)});
+    ICHECK_EQ(split.size(), 2);
+    inner_loops.insert(sch->GetSRef(split[1]).operator->());
+    // The inner split will be reordered to the loop domain that is tensorized
+    int desc_loop_index = info->desc_loop_indexer.at(GetRef<tir::For>(desc_loop));
+    reorder_suffix[desc_loop_index] = split[1];
+  }
+  // Reorder the loops
+  std::vector<LoopRV> reorder_list;
+  bool meet = false;
+  Array<LoopRV> all_loops = sch->GetLoops(block_rv);
+  for (const LoopRV& loop : all_loops) {
+    if (inner_loops.count(sch->GetSRef(loop).operator->())) {
+      meet = true;
+    } else if (meet) {
+      reorder_list.push_back(loop);
+    }
+  }
+  reorder_list.insert(reorder_list.end(), reorder_suffix.begin(), reorder_suffix.end());
+  sch->Reorder(reorder_list);
+  ICHECK(!reorder_suffix.empty());
+  return reorder_suffix[0];
+}
+
+tir::BlockRV TileForIntrin(tir::Schedule sch, tir::BlockRV block, const std::string& intrin_name) {
+  Optional<tir::LoopRV> tiled_loop_rv = TilingwithTensorIntrin(sch, block, intrin_name);
+  ICHECK(tiled_loop_rv.defined());
+  tir::BlockRV outer_block = sch->Blockize(tiled_loop_rv.value());
+  sch->Annotate(outer_block, tir::attr::meta_schedule_auto_tensorize, String(intrin_name));
+  return outer_block;
+}
+
+}  // namespace meta_schedule
+}  // namespace tvm
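The split step is simple extent arithmetic: a block loop of extent total, matched against a descriptor loop of extent inner, is split into outer = total / inner and inner, and the inner halves are then reordered innermost in descriptor order so the resulting nest can be blockized and handed to the tensor intrinsic. A toy sketch of the same split/reorder on the Python side, using a made-up PrimFunc with extents 64 and 16:

    # Sketch: 64 = 4 * 16; the inner extent 16 mimics a descriptor loop.
    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def copy(A: T.Buffer[(64,), "float32"], B: T.Buffer[(64,), "float32"]) -> None:
        for i in T.serial(64):
            with T.block("B"):
                vi = T.axis.spatial(64, i)
                B[vi] = A[vi]

    sch = tvm.tir.Schedule(copy)
    (i,) = sch.get_loops(sch.get_block("B"))
    io, ii = sch.split(i, factors=[4, 16])  # outer = 64 / 16, inner = 16
    sch.reorder(io, ii)  # inner loop kept innermost, as the pass arranges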

src/meta_schedule/schedule_rule/auto_tensorize.h

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_TENSORIZE_H_
+#define TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_TENSORIZE_H_
+
+#include <tvm/tir/schedule/schedule.h>
+
+namespace tvm {
+namespace meta_schedule {
+
+Optional<tir::LoopRV> TilingwithTensorIntrin(const tir::Schedule& sch, const tir::BlockRV& block_rv,
+                                             const String& intrin_name);
+
+tir::BlockRV TileForIntrin(tir::Schedule sch, tir::BlockRV block, const std::string& intrin_name);
+
+}  // namespace meta_schedule
+}  // namespace tvm
+
+#endif  // TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_TENSORIZE_H_
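Putting the pieces together: the schedule rule proposes tilings whose inner nest matches the intrinsic and leaves the meta_schedule.auto_tensorize annotation behind, and the postprocessor finishes the job after scheduling. A hedged sketch of how the two might be passed to a tuning setup; exact tuning entry points vary, so take the wiring as an assumption:

    # Sketch: the new rule and postproc travel together.
    from tvm.meta_schedule.schedule_rule import MultiLevelTilingWithIntrin
    from tvm.meta_schedule.postproc import RewriteReductionBlock, RewriteTensorize

    sch_rules = [MultiLevelTilingWithIntrin("placeholder_intrin", structure="SSRSRS")]
    postprocs = [RewriteReductionBlock(), RewriteTensorize(vectorize_init_loop=True)]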
