
Commit 4f8f308

Squashed commit of the following:
All commits below are authored by Masahiro Masuda <[email protected]>.

commit 1711be3  Wed Apr 13 13:11:40 2022 +0900  fixed condition for real
commit 8a48fb5  Wed Apr 13 09:57:42 2022 +0900  Revert "Skip applying sch_rule when both ann and sch_rule are defined" (reverts commit 4915c6a)
commit daea033  Mon Apr 11 09:31:05 2022 +0900  [Metaschedule] Support rocm and spirv
commit eb0cae2  Wed Apr 13 07:25:04 2022 +0900  dp4a works
commit 4915c6a  Wed Apr 13 06:13:45 2022 +0900  Skip applying sch_rule when both ann and sch_rule are defined
commit 7b3d71c  Wed Apr 13 04:40:31 2022 +0900  fixed intrin description
commit 7666cd7  Tue Apr 12 19:59:47 2022 +0900  add DP4A intrin
commit 7086bdb  Tue Apr 12 19:03:44 2022 +0900  works
commit db34397  Tue Apr 12 12:49:52 2022 +0900  more hack to tensorize loop mapping to make resnet50 e2e work
commit 2409674  Mon Apr 11 13:40:59 2022 +0900  wip support pad + qnn.conv2d folding
commit 613cb7e  Sun Apr 10 12:04:08 2022 +0900  hack to tensorize loop mapping to make conv2d work
commit 9e4f9df  Sun Apr 10 11:34:13 2022 +0900  wrap tensorize with try/catch
commit d4b496d  Sun Apr 10 11:33:39 2022 +0900  revert change in task_scheduler.cc
commit 476129b  Sat Apr 9 05:54:10 2022 +0900  try / catch in ThreadedApply
commit d8226ff  Fri Apr 8 17:17:59 2022 +0900  filter out invalid candidate
commit 2632899  Fri Apr 8 10:09:48 2022 +0900  try graceful exit in parallel_for_dynamic
commit 9d6741c  Fri Apr 8 09:35:51 2022 +0900  [QNN] Fix broadcast for invalid axis
commit 6ccde09  Thu Apr 7 20:51:15 2022 +0900  refactor rewrite_tensorize
commit 2ce2066  Thu Apr 7 20:48:17 2022 +0900  allow missing schedule_rule in post order apply
commit 3a69353  Thu Apr 7 19:42:48 2022 +0900  refactor rewrite_tensorize
commit 43e0b2f  Thu Apr 7 18:25:14 2022 +0900  rewrite_vnni -> rewrite_tensorize
commit 823797e  Thu Apr 7 18:12:12 2022 +0900  VNNI -> WithIntrin
commit 4284a47  Thu Apr 7 17:45:41 2022 +0900  introduce TileForIntrin
commit b87ef32  Thu Apr 7 17:34:04 2022 +0900  move TilingwithTensorIntrin to auto_tensorize.cc
commit 2fc118b  Thu Apr 7 17:28:45 2022 +0900  clean up headers
commit d8b2aa3  Thu Apr 7 17:09:32 2022 +0900  clean up using namespace
commit eb05d25  Thu Apr 7 17:03:05 2022 +0900  refactored init
commit 5e6b0a0  Thu Apr 7 16:57:14 2022 +0900  compiled
commit 2b8c430  Thu Apr 7 12:51:55 2022 +0900  wip MultiLevelTiling refactor
commit 7c21a9f  Thu Apr 7 11:58:33 2022 +0900  function doc string not supported by tvmscript
commit 40f9742  Thu Apr 7 11:56:45 2022 +0900  update vnni intrin name
commit 4814f82  Thu Apr 7 11:44:47 2022 +0900  Merge branch 'tir-tensor-intrin' into auto-tensorize-vnni (Merge: e0c5eb8 07bbb38)
commit 07bbb38  Thu Apr 7 11:24:56 2022 +0900  more lint fix
commit 15e60b4  Thu Apr 7 11:16:08 2022 +0900  black
commit 7a757fe  Thu Apr 7 11:12:54 2022 +0900  pylint
commit 9a3e508  Thu Apr 7 10:58:52 2022 +0900  simplify import
commit d8e43ec  Thu Apr 7 10:52:50 2022 +0900  use vectorlow/high in arm intrin
commit 625cd27  Thu Apr 7 10:34:57 2022 +0900  fixed offset factor
commit 69e72b6  Thu Apr 7 10:12:02 2022 +0900  Add ARM intrin
commit 1351fde  Thu Apr 7 08:27:27 2022 +0900  use buffer syntax sugar
commit 0ced85f  Thu Apr 7 08:17:43 2022 +0900  rename vnni.py to x86.py
commit 38a5aca  Thu Apr 7 07:24:44 2022 +0900  add VNNI unittest
commit 88b763e  Thu Apr 7 07:10:06 2022 +0900  refactored existing test using VNNI intrin
commit 711a007  Thu Apr 7 07:04:58 2022 +0900  [TIR] Add VNNI dot product intrinsic for TIR
commit e0c5eb8  Thu Apr 7 11:42:26 2022 +0900  merge fix
commit b171748  Thu Apr 7 11:33:59 2022 +0900  Merge branch 'tir-tensor-intrin' into auto-tensorize-vnni (Merge: 71fe3bd 82e152a)
commit 71fe3bd  Thu Apr 7 06:57:38 2022 +0900  move tensor intrin under tir
commit 0c51bad  Thu Apr 7 06:12:39 2022 +0900  remove log
commit fed910e  Thu Apr 7 06:11:22 2022 +0900  more revert
commit 7150aff  Thu Apr 7 06:10:44 2022 +0900  revert stmt_functor change
commit 155107b  Thu Apr 7 06:10:09 2022 +0900  refactored RewriteVNNI a bit
commit ca15255  Thu Apr 7 05:41:13 2022 +0900  add RewriteVNNI
commit dc9f71d  Thu Apr 7 05:38:56 2022 +0900  vectorized init loop
commit fcc31ee  Thu Apr 7 04:55:36 2022 +0900  tensorize worked
commit 2b53437  Wed Apr 6 19:11:05 2022 +0900  TilingwithTensorIntrin works
commit 86baa31  Wed Apr 6 08:58:27 2022 +0900  Ported auto-tensorization code
commit 82e152a  Thu Apr 7 11:24:56 2022 +0900  more lint fix
commit 88d9bdd  Thu Apr 7 11:16:08 2022 +0900  black
commit 31fe7eb  Thu Apr 7 11:12:54 2022 +0900  pylint
commit 7876754  Thu Apr 7 10:58:52 2022 +0900  simplify import
commit 56f2e9a  Thu Apr 7 10:52:50 2022 +0900  use vectorlow/high in arm intrin
commit 995cc8d  Thu Apr 7 10:34:57 2022 +0900  fixed offset factor
commit 86bbd49  Thu Apr 7 10:12:02 2022 +0900  Add ARM intrin
commit 120fd96  Thu Apr 7 08:27:27 2022 +0900  use buffer syntax sugar
commit 0f0682d  Thu Apr 7 08:17:43 2022 +0900  rename vnni.py to x86.py
commit f88c31e  Thu Apr 7 07:24:44 2022 +0900  add VNNI unittest
commit 6cc8009  Thu Apr 7 07:10:06 2022 +0900  refactored existing test using VNNI intrin
commit 11a29c7  Thu Apr 7 07:04:58 2022 +0900  [TIR] Add VNNI dot product intrinsic for TIR
1 parent: 597000c

File tree

20 files changed: +902 -196 lines


include/tvm/meta_schedule/schedule_rule.h (10 additions, 0 deletions)

@@ -150,6 +150,16 @@ class ScheduleRule : public runtime::ObjectRef {
       Optional<Array<Integer>> vector_load_lens,    //
       Optional<Map<String, ObjectRef>> reuse_read,  //
       Optional<Map<String, ObjectRef>> reuse_write);
+
+  TVM_DLL static ScheduleRule MultiLevelTilingWithIntrin(
+      String intrin_name,                           //
+      String structure,                             //
+      Optional<Array<String>> tile_binds,           //
+      Optional<Integer> max_innermost_factor,       //
+      Optional<Array<Integer>> vector_load_lens,    //
+      Optional<Map<String, ObjectRef>> reuse_read,  //
+      Optional<Map<String, ObjectRef>> reuse_write);
+
   /*!
    * \brief Create a rule: add-rfactor to some blocks if needed
    * \param max_jobs_per_core The maximum number of jobs to be launched per CPU core. It sets the
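Note (not part of the diff): the new static constructor is reachable from Python through the FFI global that the wrapper below registers. A minimal sketch of that round trip, with illustrative argument values:

    # Hedged sketch: construct the rule via the FFI global that backs the
    # Python wrapper added below in multi_level_tiling.py.
    from tvm.meta_schedule import _ffi_api

    rule = _ffi_api.ScheduleRuleMultiLevelTilingWithIntrin(  # type: ignore
        "dp4a",      # intrin_name: the intrinsic this commit registers
        "SSSRRSRS",  # structure: GPU-style tiling
        None,        # tile_binds
        None,        # max_innermost_factor
        None,        # vector_load_lens
        None,        # reuse_read
        None,        # reuse_write
    )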

include/tvm/tir/stmt.h (5 additions, 0 deletions)

@@ -1509,6 +1509,11 @@ constexpr const char* meta_schedule_unroll_explicit = "meta_schedule.unroll_expl
 /*! \brief Mark auto-unroll setting on the block. */
 constexpr const char* meta_schedule_unroll_implicit = "meta_schedule.unroll_implicit";
 
+/*!
+ * \brief Mark that the block should be further rewritten using tensorization.
+ */
+constexpr const char* meta_schedule_auto_tensorize = "meta_schedule.auto_tensorize";
+
 /*!
  * \brief Check if attr_key is a pragma key extension
  * \param attr_key The attr key to be compared
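Illustration only: the constant is read and written through the schedule annotation API. A minimal sketch, assuming a toy kernel and using "dp4a" as a stand-in intrinsic name:

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def add_one(A: T.Buffer((16,), "float32"), B: T.Buffer((16,), "float32")) -> None:
        for i in range(16):
            with T.block("add_one"):
                vi = T.axis.remap("S", [i])
                B[vi] = A[vi] + T.float32(1)

    sch = tvm.tir.Schedule(add_one)
    block = sch.get_block("add_one")
    # Mark the block for deferred tensorization; RewriteTensorize (added in
    # this commit) later reads the annotation, removes it, and tensorizes.
    sch.annotate(block, ann_key="meta_schedule.auto_tensorize", ann_val="dp4a")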

python/tvm/meta_schedule/postproc/__init__.py (1 addition, 0 deletions)

@@ -22,3 +22,4 @@
 from .rewrite_reduction_block import RewriteReductionBlock
 from .rewrite_unbound_block import RewriteUnboundBlock
 from .verify_gpu_code import VerifyGPUCode
+from .rewrite_tensorize import RewriteTensorize
python/tvm/meta_schedule/postproc/rewrite_tensorize.py (new file, 33 additions; filename recovered from the import above)

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements; see the NOTICE file. Apache License 2.0.
# (Standard ASF license header elided.)
"""A postprocessor that tensorizes blocks annotated for auto-tensorization."""

from tvm._ffi.registry import register_object

from .. import _ffi_api
from .postproc import Postproc

# Imported for its side effect of registering the tensor intrinsics.
import tvm.tir.tensor_intrin  # pylint: disable=unused-import


@register_object("meta_schedule.RewriteTensorize")
class RewriteTensorize(Postproc):
    """A postprocessor that tensorizes blocks annotated for auto-tensorization."""

    def __init__(self, vectorize_init_loop=False) -> None:
        self.__init_handle_by_constructor__(
            _ffi_api.PostprocRewriteTensorize,  # type: ignore # pylint: disable=no-member
            vectorize_init_loop,
        )
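Usage sketch (an assumption, not from the diff), continuing the annotated schedule `sch` built in the include/tvm/tir/stmt.h note above:

    from tvm.meta_schedule.postproc import RewriteTensorize

    # Tensorize every block annotated with "meta_schedule.auto_tensorize";
    # annotated init blocks are vectorized instead when requested.
    postproc = RewriteTensorize(vectorize_init_loop=True)
    postproc.apply(sch)  # mutates sch in place and returns True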

python/tvm/meta_schedule/schedule_rule/__init__.py (1 addition, 1 deletion)

@@ -22,7 +22,7 @@
 from .add_rfactor import AddRFactor
 from .auto_inline import AutoInline
 from .cross_thread_reduction import CrossThreadReduction
-from .multi_level_tiling import MultiLevelTiling, ReuseType
+from .multi_level_tiling import MultiLevelTiling, MultiLevelTilingWithIntrin, ReuseType
 from .parallel_vectorize_unroll import ParallelizeVectorizeUnroll
 from .random_compute_location import RandomComputeLocation
 from .schedule_rule import PyScheduleRule, ScheduleRule

python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py (47 additions, 0 deletions)

@@ -82,3 +82,50 @@ def __init__(
             reuse_read.as_dict() if reuse_read is not None else None,
             reuse_write.as_dict() if reuse_write is not None else None,
         )
+
+
+@register_object("meta_schedule.MultiLevelTilingWithIntrin")
+class MultiLevelTilingWithIntrin(ScheduleRule):
+    """Multi-level tiling with reuse, tensorizing the inner tile with an intrinsic.
+
+    Parameters
+    ----------
+    intrin_name : str
+        The name of a tensor intrinsic registered via TensorIntrin.register.
+    structure : str
+        The tiling structure. Recommended:
+        - 'SSRSRS' on CPU
+        - 'SSSRRSRS' on GPU
+    tile_binds : Optional[List[str]]
+        For each level of tiles, which thread axis it is bound to. Recommended:
+        - None on CPU
+        - [blockIdx.x, vthread.x, threadIdx.x] on GPU
+    max_innermost_factor : Optional[int]
+        The maximum size of the innermost factor. None means no limit.
+    vector_load_lens : Optional[List[int]]
+        The lengths of vector lanes in vectorized cooperative fetching.
+        None means vectorization is disabled.
+    reuse_read : Optional[ReuseType]
+        Data reuse configuration for reading. None means no reuse.
+    reuse_write : Optional[ReuseType]
+        Data reuse configuration for writing. None means no reuse.
+    """
+
+    def __init__(
+        self,
+        intrin_name: str,
+        structure: str,
+        tile_binds: Optional[List[str]] = None,
+        max_innermost_factor: Optional[int] = None,
+        vector_load_lens: Optional[List[int]] = None,
+        reuse_read: Optional[ReuseType] = None,
+        reuse_write: Optional[ReuseType] = None,
+    ) -> None:
+        self.__init_handle_by_constructor__(
+            _ffi_api.ScheduleRuleMultiLevelTilingWithIntrin,  # type: ignore # pylint: disable=no-member
+            intrin_name,
+            structure,
+            tile_binds,
+            max_innermost_factor,
+            vector_load_lens,
+            reuse_read.as_dict() if reuse_read is not None else None,
+            reuse_write.as_dict() if reuse_write is not None else None,
+        )
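A hedged construction sketch for the new wrapper; all values are illustrative, and "dp4a" refers to the intrinsic registered elsewhere in this commit:

    from tvm.meta_schedule.schedule_rule import MultiLevelTilingWithIntrin, ReuseType

    rule = MultiLevelTilingWithIntrin(
        intrin_name="dp4a",
        structure="SSSRRSRS",  # GPU-style structure
        tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"],
        max_innermost_factor=64,
        vector_load_lens=[1, 2, 3, 4],
        reuse_read=ReuseType(req="must", levels=[4], scope="shared"),
        reuse_write=ReuseType(req="must", levels=[3], scope="local"),
    )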

python/tvm/meta_schedule/tune.py (3 additions, 3 deletions)

@@ -411,7 +411,7 @@ def _sch_rules(sch_rules: Optional[FnScheduleRule], target: Target) -> List[Sche
     # pylint: disable=protected-access
     if target.kind.name == "llvm":
         return DefaultLLVM._sch_rules()
-    if target.kind.name == "cuda":
+    if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return DefaultCUDA._sch_rules()
     # pylint: enable=protected-access
     raise ValueError(f"Unsupported target: {target}")

@@ -425,7 +425,7 @@ def _postproc(postproc: Optional[FnPostproc], target: Target) -> List[Postproc]:
     # pylint: disable=protected-access
     if target.kind.name == "llvm":
         return DefaultLLVM._postproc()
-    if target.kind.name == "cuda":
+    if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return DefaultCUDA._postproc()
     # pylint: enable=protected-access
     raise ValueError(f"Unsupported target: {target}")

@@ -444,7 +444,7 @@ def _mutator_probs(
     # pylint: disable=protected-access
     if target.kind.name == "llvm":
         return DefaultLLVM._mutator_probs()
-    if target.kind.name == "cuda":
+    if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return DefaultCUDA._mutator_probs()
     # pylint: enable=protected-access
     raise ValueError(f"Unsupported target: {target}")
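The effect of these three one-line changes, as a sketch (`uses_cuda_defaults` is a hypothetical helper mirroring the dispatch, not code from the commit):

    from tvm.target import Target

    def uses_cuda_defaults(target: Target) -> bool:
        # rocm and vulkan (SPIR-V) targets now fall through to the CUDA
        # default rules, postprocessors, and mutator probabilities
        # instead of raising ValueError("Unsupported target: ...").
        return target.kind.name in ["cuda", "rocm", "vulkan"]

    assert uses_cuda_defaults(Target("cuda"))
    assert uses_cuda_defaults(Target("rocm"))
    assert uses_cuda_defaults(Target("vulkan"))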

python/tvm/tir/tensor_intrin/__init__.py (1 addition, 0 deletions)

@@ -18,3 +18,4 @@
 """Intrinsics for tensorization."""
 from .x86 import *
 from .arm_cpu import *
+from .dot_product_common import *
python/tvm/tir/tensor_intrin/dot_product_common.py (new file, 55 additions; filename recovered from the import above)

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements; see the NOTICE file. Apache License 2.0.
# (Standard ASF license header elided.)
# pylint: disable=invalid-name,missing-function-docstring
"""Dot product related intrinsics."""
from tvm.script import tir as T
from .. import TensorIntrin


@T.prim_func
def dp4a_desc(
    A: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"),
    B: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"),
    C: T.Buffer((1,), "int32", offset_factor=1, align=4, scope="local"),
) -> None:
    with T.block("root"):
        T.reads(C[0], A[0:4], B[0:4])
        T.writes(C[0])
        for i in range(0, 4):
            with T.block("update"):
                vi = T.axis.remap("R", [i])
                C[0] = C[0] + T.cast(A[vi], "int32") * T.cast(B[vi], "int32")


@T.prim_func
def dp4a_impl(
    A: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"),
    B: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"),
    C: T.Buffer((1,), "int32", offset_factor=1, align=4, scope="local"),
) -> None:
    with T.block("root"):
        T.reads(C[0], A[0:4], B[0:4])
        T.writes(C[0])

        C[0] += T.call_pure_extern(
            "__dp4a", A.vload([0], "int8x4"), B.vload([0], "int8x4"), T.int32(0), dtype="int32"
        )


DP4A_INTRIN = "dp4a"

TensorIntrin.register(DP4A_INTRIN, dp4a_desc, dp4a_impl)
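A minimal structural sketch (an assumption, not part of the diff) of tensorizing with the new intrinsic. The update loop below matches dp4a_desc; buffer scopes mirror the intrinsic's declarations, and alignment or offset constraints may need adjusting in practice:

    import tvm
    from tvm.script import tir as T
    from tvm.tir.tensor_intrin.dot_product_common import DP4A_INTRIN

    @T.prim_func
    def batch_dot(a: T.handle, b: T.handle, c: T.handle) -> None:
        A = T.match_buffer(a, (128, 4), "int8", scope="shared")
        B = T.match_buffer(b, (128, 4), "int8", scope="shared")
        C = T.match_buffer(c, (128,), "int32", scope="local")
        for i, k in T.grid(128, 4):
            with T.block("update"):
                vi, vk = T.axis.remap("SR", [i, k])
                C[vi] = C[vi] + T.cast(A[vi, vk], "int32") * T.cast(B[vi, vk], "int32")

    sch = tvm.tir.Schedule(batch_dot)
    i, k = sch.get_loops(sch.get_block("update"))
    sch.tensorize(k, DP4A_INTRIN)  # the k-loop body becomes a __dp4a call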
src/meta_schedule/postproc/rewrite_tensorize.cc (new file, 104 additions; path inferred from TVM's postproc layout)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements; see the NOTICE file. Apache License 2.0.
 * (Standard ASF license header elided.)
 */
#include <algorithm>

#include "../utils.h"
#include "tvm/runtime/container/base.h"

namespace tvm {
namespace meta_schedule {

using tir::BlockRV;
using tir::LoopRV;

void ApplyTensorization(const tir::Schedule& sch, const String& func_name,
                        const tir::PrimFuncNode* func, bool vectorize_init_loop) {
  // Collect the rewrites first; mutating the schedule mid-traversal would
  // invalidate the visit.
  std::vector<std::pair<std::string, std::function<void(tir::BlockRV)>>> jobs;

  tir::PostOrderVisit(func->body, [=, &jobs](const ObjectRef& obj) -> bool {
    if (const auto* block = obj.as<tir::BlockNode>()) {
      tir::StmtSRef block_sref = sch->GetSRef(block);
      if (Optional<String> intrin_name =
              tir::GetAnn<String>(block_sref, tir::attr::meta_schedule_auto_tensorize)) {
        std::string block_name = block_sref->StmtAs<tir::BlockNode>()->name_hint;
        if (block_name.find("init") == std::string::npos) {
          jobs.emplace_back(block_name, [sch, intrin_name](tir::BlockRV block) {
            try {
              sch->Tensorize(block, intrin_name.value());
            } catch (const std::exception& e) {
              LOG(WARNING) << "Tensorize failed with error " << e.what();
            }
          });
        } else if (vectorize_init_loop) {
          jobs.emplace_back(block_name, [sch](tir::BlockRV block) {
            Array<BlockRV> child_blocks = sch->GetChildBlocks(block);
            ICHECK(child_blocks.size() == 1);
            Array<LoopRV> init_loops = sch->GetLoops(child_blocks[0]);
            ICHECK(init_loops.size() == 1);
            sch->Vectorize(init_loops[0]);
          });
        }
      }
    }
    return true;
  });

  for (auto kv : jobs) {
    tir::BlockRV block = sch->GetBlock(kv.first, func_name);
    sch->Unannotate(block, tir::attr::meta_schedule_auto_tensorize);
    kv.second(block);
  }
}

class RewriteTensorizeNode : public PostprocNode {
 public:
  void InitializeWithTuneContext(const TuneContext& context) final {}

  bool Apply(const tir::Schedule& sch) final;

  void VisitAttrs(tvm::AttrVisitor* v) {}

  bool vectorize_init_loop = false;

  static constexpr const char* _type_key = "meta_schedule.RewriteTensorize";
  TVM_DECLARE_FINAL_OBJECT_INFO(RewriteTensorizeNode, PostprocNode);
};

bool RewriteTensorizeNode::Apply(const tir::Schedule& sch) {
  for (const auto& kv : sch->mod()->functions) {
    GlobalVar g_var = kv.first;
    BaseFunc base_func = kv.second;
    if (const tir::PrimFuncNode* prim_func = base_func.as<tir::PrimFuncNode>()) {
      ApplyTensorization(sch, g_var->name_hint, prim_func, vectorize_init_loop);
    }
  }
  return true;
}

Postproc RewriteTensorize(bool vectorize_init_loop) {
  ObjectPtr<RewriteTensorizeNode> n = make_object<RewriteTensorizeNode>();
  n->vectorize_init_loop = vectorize_init_loop;
  return Postproc(n);
}

TVM_REGISTER_NODE_TYPE(RewriteTensorizeNode);
TVM_REGISTER_GLOBAL("meta_schedule.PostprocRewriteTensorize").set_body_typed(RewriteTensorize);

}  // namespace meta_schedule
}  // namespace tvm
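To summarize the control flow, a rough Python rendering of the collect-then-rewrite structure above (an illustration only; the `annotated` list stands in for what the PostOrderVisit gathers):

    # annotated: (block_name, intrin_name) pairs discovered by the visitor.
    def rewrite_tensorize(sch, func_name, annotated, vectorize_init_loop=False):
        for block_name, intrin_name in annotated:
            block = sch.get_block(block_name, func_name)
            # The annotation is consumed whether or not tensorization succeeds.
            sch.unannotate(block, "meta_schedule.auto_tensorize")
            if "init" not in block_name:
                try:
                    sch.tensorize(block, intrin_name)
                except Exception as err:  # best-effort, mirroring the C++ try/catch
                    print(f"Tensorize failed with error {err}")
            elif vectorize_init_loop:
                (child,) = sch.get_child_blocks(block)
                (loop,) = sch.get_loops(child)
                sch.vectorize(loop)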
