From 400ab33e72c062df0b64afcfdb9b14904dbb860d Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 14 Apr 2022 10:35:02 +0900
Subject: [PATCH 01/10] [ROCM] Support dp4a on AMDGPU by sdot4 intrinsic

commit 0225f2bfe3f413cd4764c2dba6c922af2520146b
Author: Masahiro Masuda
Date: Thu Apr 14 08:56:10 2022 +0900
    share op strategy between cuda and rocm

commit 762c7e8611c9ec3cca3321428e2362c81fe89b9b
Author: Masahiro Masuda
Date: Thu Apr 14 08:28:34 2022 +0900
    fixed rocm batch_matmul strategy for mixed i8i8i32

commit ce53e8d141f7f901303ec6a91674337cbf2b2384
Author: Masahiro Masuda
Date: Thu Apr 14 06:17:30 2022 +0900
    add rocm sdot4 TIR intrin

commit f4562b991f9180b61be7339b2890de1584656c10
Author: Masahiro Masuda
Date: Thu Apr 14 06:03:44 2022 +0900
    rocm sdot4 works

commit 6cc62805f82dd884a18a1c4c0e9bae5866e00da0
Author: Masahiro Masuda
Date: Thu Apr 14 05:32:07 2022 +0900
    more wip

commit 0602f4a3157d4cb5a3f280a3a3c514bb6535aac8
Author: Masahiro Masuda
Date: Thu Apr 14 03:47:37 2022 +0900
    Squashed commit of the following:

commit 65b8bcf955f44540d6a52c8416e60f3047c8366c
Author: Masahiro Masuda
Date: Wed Apr 13 20:36:49 2022 +0900
    [WIP] adding DP4A support to rocm

commit 4f8f308ab6bb85ef3bdcc2b8e846c2eea15f2167
Author: Masahiro Masuda
Date: Wed Apr 13 14:03:25 2022 +0900
    Squashed commit of the following:

commit 1711be38a17e3b6171350009f1da05824cd0b340
Author: Masahiro Masuda
Date: Wed Apr 13 13:11:40 2022 +0900
    fixed condition for real

commit 8a48fb5262e80e318cd81d5ff51bf95fd5eb576e
Author: Masahiro Masuda
Date: Wed Apr 13 09:57:42 2022 +0900
    Revert "Skip applying sch_rule when both ann and sch_rule are defined"

    This reverts commit 4915c6a5a91ff87038e71f8aff9f31db684b4a95.

commit daea033d2cb06388ef27ddadb80fc5bce72181d2
Author: Masahiro Masuda
Date: Mon Apr 11 09:31:05 2022 +0900
    [Metaschedule] Support rocm and spirv

commit eb0cae2c779808cced074d189e8f487bf46ea89f
Author: Masahiro Masuda
Date: Wed Apr 13 07:25:04 2022 +0900
    dp4a works

commit 4915c6a5a91ff87038e71f8aff9f31db684b4a95
Author: Masahiro Masuda
Date: Wed Apr 13 06:13:45 2022 +0900
    Skip applying sch_rule when both ann and sch_rule are defined

commit 7b3d71c6b21a9c5de9ef2b89d0a7db2800a5f3a2
Author: Masahiro Masuda
Date: Wed Apr 13 04:40:31 2022 +0900
    fixed intrin description

commit 7666cd7a5b0ce182791662673fbe45944c84d0ae
Author: Masahiro Masuda
Date: Tue Apr 12 19:59:47 2022 +0900
    add DP4A intrin

commit 7086bdb75546a2680d12dc8f80c040cea23f729a
Author: Masahiro Masuda
Date: Tue Apr 12 19:03:44 2022 +0900
    works

commit db343974bfae86e51078e40e6170022a782d8e0a
Author: Masahiro Masuda
Date: Tue Apr 12 12:49:52 2022 +0900
    more hack to tensorize loop mapping to make resnet50 e2e work

commit 2409674a7884a60beb50d7aa3345c4b907b8cd13
Author: Masahiro Masuda
Date: Mon Apr 11 13:40:59 2022 +0900
    wip support pad + qnn.conv2d folding

commit 613cb7ec33b6df41f1ebe0f0a0ac8eca7c73cff1
Author: Masahiro Masuda
Date: Sun Apr 10 12:04:08 2022 +0900
    hack to tensorize loop mapping to make conv2d work

commit 9e4f9df6a409396a8a4a20d967c4f51accf5d210
Author: Masahiro Masuda
Date: Sun Apr 10 11:34:13 2022 +0900
    wrap tensorize with try/catch

commit d4b496d858da0ae43063d47cb03a28b803d0269f
Author: Masahiro Masuda
Date: Sun Apr 10 11:33:39 2022 +0900
    revert change in task_scheduler.cc

commit 476129be7b286f5d109402280aea585e89f6dc1d
Author: Masahiro Masuda
Date: Sat Apr 9 05:54:10 2022 +0900
    try / catch in ThreadedApply

commit d8226ff26f25eba17d4000f25131822874bdc2cc
Author: Masahiro Masuda
Date: Fri Apr 8 17:17:59 2022 +0900
    filter out invalid candidate

commit 2632899a2759885d338e25f2a25ba0b2c555f0c3
Author: Masahiro Masuda
Date: Fri Apr 8 10:09:48 2022 +0900
    try graceful exit in parallel_for_dynamic

commit 9d6741c3dd29c4dde861aa1d3b2ca85f560f5ac6
Author: Masahiro Masuda
Date: Fri Apr 8 09:35:51 2022 +0900
    [QNN] Fix broadcast for invalid axis

commit 6ccde0959343ce4246ef99505b4f54de469a1a5c
Author: Masahiro Masuda
Date: Thu Apr 7 20:51:15 2022 +0900
    refactor rewrite_tensorize

commit 2ce206699f10b03b9611c4683018f7e0c70c7eb5
Author: Masahiro Masuda
Date: Thu Apr 7 20:48:17 2022 +0900
    allow missing schedule_rule in post order apply

commit 3a69353a29abfc454e28d4e530d22a3e2043712e
Author: Masahiro Masuda
Date: Thu Apr 7 19:42:48 2022 +0900
    refactor rewrite_tensorize

commit 43e0b2f7f98299679807aaf1ffb13cce2b5f5ce3
Author: Masahiro Masuda
Date: Thu Apr 7 18:25:14 2022 +0900
    rewrite_vnni -> rewrite_tensorize

commit 823797e2627a9bfa812b72019468569ee79eb4c6
Author: Masahiro Masuda
Date: Thu Apr 7 18:12:12 2022 +0900
    VNNI -> WithIntrin

commit 4284a47e5933aa89c1c3362b15ad53b14782fc81
Author: Masahiro Masuda
Date: Thu Apr 7 17:45:41 2022 +0900
    introduce TileForIntrin

commit b87ef32e30e1e71b3f39789f7289976a8cba4ab4
Author: Masahiro Masuda
Date: Thu Apr 7 17:34:04 2022 +0900
    move TilingwithTensorIntrin to auto_tensorize.cc

commit 2fc118b3726586ba13f7de950beaa299b83a0af3
Author: Masahiro Masuda
Date: Thu Apr 7 17:28:45 2022 +0900
    clean up headers

commit d8b2aa325c91b524bec22dc1ec2fc52c9f060fce
Author: Masahiro Masuda
Date: Thu Apr 7 17:09:32 2022 +0900
    clean up using namespace

commit eb05d25e2b71f4a1232a8796d1413011ec7629d3
Author: Masahiro Masuda
Date: Thu Apr 7 17:03:05 2022 +0900
    refactored init

commit 5e6b0a08d447c0470c2c8a993e4bd62673e34fe3
Author: Masahiro Masuda
Date: Thu Apr 7 16:57:14 2022 +0900
    compiled

commit 2b8c430e2fec7ceb285eed7bc7aa73bb9a74a997
Author: Masahiro Masuda
Date: Thu Apr 7 12:51:55 2022 +0900
    wip MultiLevelTiling refactor

commit 7c21a9fea0511c88bd82f49f799b5198252df40a
Author: Masahiro Masuda
Date: Thu Apr 7 11:58:33 2022 +0900
    function doc string not supported by tvmscript

commit 40f9742bc9c3aa11e8c2c0551d1827ad47fc0f39
Author: Masahiro Masuda
Date: Thu Apr 7 11:56:45 2022 +0900
    update vnni intrin name

commit 4814f825a5315efd2a3da8c36d2ce6b5df5447cd
Merge: e0c5eb84b 07bbb38f7
Author: Masahiro Masuda
Date: Thu Apr 7 11:44:47 2022 +0900
    Merge branch 'tir-tensor-intrin' into auto-tensorize-vnni

commit 07bbb38f7fb52db4a2ecde3d5c87cf4d5cd000a1
Author: Masahiro Masuda
Date: Thu Apr 7 11:24:56 2022 +0900
    more lint fix

commit 15e60b42362cc64b1428b219c8eada414d1b8372
Author: Masahiro Masuda
Date: Thu Apr 7 11:16:08 2022 +0900
    black

commit 7a757fe53758e06418ea1367b348b47c8cd2dcf9
Author: Masahiro Masuda
Date: Thu Apr 7 11:12:54 2022 +0900
    pylint

commit 9a3e508b6f4529158e703b4617f2ddaa351a89eb
Author: Masahiro Masuda
Date: Thu Apr 7 10:58:52 2022 +0900
    simplify import

commit d8e43ecf1c0a79a2c195ff31e1e699a447a11335
Author: Masahiro Masuda
Date: Thu Apr 7 10:52:50 2022 +0900
    use vectorlow/high in arm intrin

commit 625cd2774ec455307646b0c26bb3971d89613d1e
Author: Masahiro Masuda
Date: Thu Apr 7 10:34:57 2022 +0900
    fixed offset factor

commit 69e72b6b612588e670937e003435afa647030ceb
Author: Masahiro Masuda
Date: Thu Apr 7 10:12:02 2022 +0900
    Add ARM intrin

commit 1351fdea6b22f231a290a6c28e06732c9cf993cf
Author: Masahiro Masuda
Date: Thu Apr 7 08:27:27 2022 +0900
    use buffer syntax sugar

commit 0ced85fd097ed48aad8714912718d8735791e1fb
Author: Masahiro Masuda
Date: Thu Apr 7 08:17:43 2022 +0900
    rename vnni.py to x86.py

commit 38a5aca87ec438446593a3af17760339211f5ad9
Author: Masahiro Masuda
Date: Thu Apr 7 07:24:44 2022 +0900
    add VNNI unittest

commit 88b763ec48c20cf68db8bc3bae3fa3ae78996ee8
Author: Masahiro Masuda
Date: Thu Apr 7 07:10:06 2022 +0900
    refactored existing test using VNNI intrin

commit 711a0076d9be2b9aa80ada67e1edda5ba1fdf1fd
Author: Masahiro Masuda
Date: Thu Apr 7 07:04:58 2022 +0900
    [TIR] Add VNNI dot product intrinsic for TIR

commit e0c5eb84bf6a0ad2ba0cddc4bdf22a799dc4b8a0
Author: Masahiro Masuda
Date: Thu Apr 7 11:42:26 2022 +0900
    merge fix

commit b171748139e53f0cf75ff4b6fde436f9d8a5fe91
Merge: 71fe3bdf0 82e152a3c
Author: Masahiro Masuda
Date: Thu Apr 7 11:33:59 2022 +0900
    Merge branch 'tir-tensor-intrin' into auto-tensorize-vnni

commit 71fe3bdf02ae10ddbe090a4fd1020f545a05bb41
Author: Masahiro Masuda
Date: Thu Apr 7 06:57:38 2022 +0900
    move tensor intrin under tir

commit 0c51badef45af2a1025ab42fe38d1b3f07ab493e
Author: Masahiro Masuda
Date: Thu Apr 7 06:12:39 2022 +0900
    remove log

commit fed910e03eb94c169d4a160b8f3cad406d04c6aa
Author: Masahiro Masuda
Date: Thu Apr 7 06:11:22 2022 +0900
    more revert

commit 7150aff9fba167d88dbfb40d48727de8a144b9c0
Author: Masahiro Masuda
Date: Thu Apr 7 06:10:44 2022 +0900
    revert stmt_functor change

commit 155107b98b09c5e5cc7f19afbd327b0557a02843
Author: Masahiro Masuda
Date: Thu Apr 7 06:10:09 2022 +0900
    refactored RewriteVNNI a bit

commit ca15255e3a882b89b05bb83079640c929fb63096
Author: Masahiro Masuda
Date: Thu Apr 7 05:41:13 2022 +0900
    add RewriteVNNI

commit dc9f71d5e3122b50fa8ae6a4462f959f13870b05
Author: Masahiro Masuda
Date: Thu Apr 7 05:38:56 2022 +0900
    vectorized init loop

commit fcc31ee20ddfafd47f566bf98ff40a9f684d12eb
Author: Masahiro Masuda
Date: Thu Apr 7 04:55:36 2022 +0900
    tensorize worked

commit 2b534377a45b9ab84bf35c3d7c03ecae7616d17f
Author: Masahiro Masuda
Date: Wed Apr 6 19:11:05 2022 +0900
    TilingwithTensorIntrin works

commit 86baa31e773fc864f77dc113bc9a93b79f3fc652
Author: Masahiro Masuda
Date: Wed Apr 6 08:58:27 2022 +0900
    Ported auto-tensorization code

commit 82e152a3c91144041ade783116a50565ebb48b89
Author: Masahiro Masuda
Date: Thu Apr 7 11:24:56 2022 +0900
    more lint fix

commit 88d9bdd3b21302bc2dd068a990df15c375a1a8ef
Author: Masahiro Masuda
Date: Thu Apr 7 11:16:08 2022 +0900
    black

commit 31fe7eb8075445161d804d170772eac8e90d3425
Author: Masahiro Masuda
Date: Thu Apr 7 11:12:54 2022 +0900
    pylint

commit 7876754effc40ad089349534dacd75df19d38fc4
Author: Masahiro Masuda
Date: Thu Apr 7 10:58:52 2022 +0900
    simplify import

commit 56f2e9a85069426021e2872eb1da95bf134ac7e0
Author: Masahiro Masuda
Date: Thu Apr 7 10:52:50 2022 +0900
    use vectorlow/high in arm intrin

commit 995cc8d6fcec70a3fadcfb1c6fee7b9f0b5a0951
Author: Masahiro Masuda
Date: Thu Apr 7 10:34:57 2022 +0900
    fixed offset factor

commit 86bbd4955b34257d68d957cb4a2536aea3ef9bac
Author: Masahiro Masuda
Date: Thu Apr 7 10:12:02 2022 +0900
    Add ARM intrin

commit 120fd96e80307b4301ee3fc93e6793e0b40485f0
Author: Masahiro Masuda
Date: Thu Apr 7 08:27:27 2022 +0900
    use buffer syntax sugar

commit 0f0682d00c3961afd1f492ae55f180c5b5502767
Author: Masahiro Masuda
Date: Thu Apr 7 08:17:43 2022 +0900
    rename vnni.py to x86.py

commit f88c31ead1fa6db4bfd2c88eeaf5f665e4c6dddb
Author: Masahiro Masuda
Date: Thu Apr 7 07:24:44 2022 +0900
    add VNNI unittest

commit 6cc80094adac398762924b0b31a4c741417ba9dc
Author: Masahiro Masuda
Date: Thu Apr 7 07:10:06 2022 +0900
    refactored existing test using VNNI intrin

commit 11a29c704cdaad96aeeca39c9c753ef006d27a50
Author: Masahiro Masuda
Date: Thu Apr 7 07:04:58 2022 +0900
[TIR] Add VNNI dot product intrinsic for TIR --- python/tvm/relay/op/strategy/cuda.py | 8 +- python/tvm/relay/op/strategy/rocm.py | 171 ++---------------- python/tvm/relay/qnn/op/legalizations.py | 10 +- python/tvm/tir/tensor_intrin/__init__.py | 2 + .../tir/tensor_intrin/dot_product_common.py | 55 ++++++ python/tvm/tir/tensor_intrin/rocm.py | 47 +++++ python/tvm/topi/cuda/batch_matmul.py | 2 +- python/tvm/topi/cuda/conv2d_alter_op.py | 6 +- python/tvm/topi/cuda/conv2d_int8.py | 4 +- python/tvm/topi/cuda/dense.py | 5 +- python/tvm/topi/cuda/tensor_intrin.py | 6 +- python/tvm/topi/rocm/dense.py | 79 +------- .../topi/python/test_topi_conv2d_int8.py | 59 +++--- tests/python/topi/python/test_topi_dense.py | 1 - 14 files changed, 181 insertions(+), 274 deletions(-) create mode 100644 python/tvm/tir/tensor_intrin/dot_product_common.py create mode 100644 python/tvm/tir/tensor_intrin/rocm.py diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 08da62e640e1..4253d93f6500 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -145,7 +145,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if layout == "NCHW": assert kernel_layout == "OIHW" if ( - (target.kind.name in ["cuda", "vulkan"]) + (target.kind.name in ["cuda", "vulkan", "rocm"]) and data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8") ): @@ -297,7 +297,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): Need to satisfy tensor core schedule." ) elif ( - (target.kind.name in ["cuda", "vulkan"]) + (target.kind.name in ["cuda", "vulkan", "rocm"]) and layout == "NCHW4c" and data.dtype in ["int8", "uint8"] ): @@ -376,7 +376,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ic_chunk = in_channels // 4 if ( - (target.kind.name in ["cuda", "vulkan"]) + (target.kind.name in ["cuda", "vulkan", "rocm"]) and data.dtype in ["int8", "uint8"] and kernel.dtype in ["int8", "uint8"] and channels % groups == 0 @@ -836,7 +836,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): b, i = get_const_tuple(data.shape) o, _ = get_const_tuple(weights.shape) if ( - target.kind.name in ["cuda", "vulkan"] + target.kind.name in ["cuda", "vulkan", "rocm"] and data.dtype == "int8" and weights.dtype == "int8" and out_type.dtype == "int32" diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index 1453128eeb67..a6cc94d2b116 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -24,155 +24,33 @@ from .generic import * from .. import op as _op -from .cuda import judge_winograd, naive_schedule +from .cuda import batch_matmul_strategy_cuda, conv2d_strategy_cuda, dense_strategy_cuda @conv2d_strategy.register("rocm") def conv2d_strategy_rocm(attrs, inputs, out_type, target): """conv2d rocm strategy""" - strategy = _op.OpStrategy() - data, kernel = inputs - dilation_h, dilation_w = attrs.get_int_tuple("dilation") groups = attrs.groups layout = attrs.data_layout - stride_h, stride_w = attrs.get_int_tuple("strides") - kernel_layout = attrs.kernel_layout padding = attrs.get_int_tuple("padding") - if dilation_h < 1 or dilation_w < 1: - raise ValueError("dilation should be positive value") - - if groups == 1: - if layout == "NCHW": - # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. 
- assert kernel_layout == "OIHW" - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), - name="conv2d_nchw.cuda", - ) - _, _, kh, kw = get_const_tuple(kernel.shape) - if ( - 2 < kh < 8 - and 2 < kw < 8 - and kh == kw - and stride_h == 1 - and stride_w == 1 - and dilation_h == 1 - and dilation_w == 1 - ): - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.cuda", - plevel=5, - ) - elif layout == "NHWC": - assert kernel_layout == "HWIO" - strategy.add_implementation( - wrap_compute_conv2d(topi.gpu.conv2d_nhwc), - wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc), - name="conv2d_nhwc.gpu", - ) - N, H, W, _ = get_const_tuple(data.shape) - KH, KW, CI, CO = get_const_tuple(kernel.shape) - (_, judge_winograd_autotvm, judge_winograd_auto_scheduler,) = judge_winograd( - N, - H, - W, - KH, - KW, - CI, - CO, - padding, - stride_h, - stride_w, - dilation_h, - dilation_w, - data.dtype, - kernel.dtype, - pre_flag=False, - ) + strategy = conv2d_strategy_cuda(attrs, inputs, out_type, target) - if judge_winograd_autotvm: - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct), - name="conv2d_nhwc_winograd_direct.cuda", - plevel=5, - ) + # add miopen implementation + if ( + "miopen" in target.libs + and groups == 1 + and layout == "NCHW" + and padding[0] == padding[2] + and padding[1] == padding[3] + ): + strategy.add_implementation( + wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), + wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), + name="conv2d_nchw_miopen.rocm", + plevel=50, + ) - if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler: - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), - naive_schedule, # this implementation should never be picked by autotvm - name="conv2d_nhwc.winograd", - plevel=15, - ) - elif layout == "HWCN": - assert kernel_layout == "HWIO" - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_hwcn), - wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), - name="conv2d_hwcn.cuda", - ) - elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: - assert kernel_layout == "OIHW4o4i" - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), - wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), - name="conv2d_NCHWc_int8.cuda", - ) - else: - raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) - # add miopen implementation - if ( - "miopen" in target.libs - and layout == "NCHW" - and padding[0] == padding[2] - and padding[1] == padding[3] - ): - strategy.add_implementation( - wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), - wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), - name="conv2d_nchw_miopen.rocm", - plevel=15, - ) - elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): - if layout == "NCHW": - assert kernel_layout == "OIHW" - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), - name="depthwise_conv2d_nchw.cuda", - ) - elif layout == "NHWC": - assert kernel_layout == "HWOI" - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - 
wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), - name="depthwise_conv2d_nhwc.cuda", - ) - else: - raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) - else: # group_conv2d - if layout == "NCHW": - # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. - assert kernel_layout == "OIHW" - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), - name="group_conv2d_nchw.cuda", - ) - elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: - assert kernel_layout == "OIHW4o4i" - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), - name="group_conv2d_NCHWc_int8.cuda", - ) - else: - raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -180,12 +58,8 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target): def dense_strategy_rocm(attrs, inputs, out_type, target): """Dense strategy for ROCM""" assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense" - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_dense(topi.rocm.dense), - wrap_topi_schedule(topi.rocm.schedule_dense), - name="dense.rocm", - ) + strategy = dense_strategy_cuda(attrs, inputs, out_type, target) + if target.kind.name == "rocm" and "rocblas" in target.libs: assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported." strategy.add_implementation( @@ -200,13 +74,8 @@ def dense_strategy_rocm(attrs, inputs, out_type, target): @batch_matmul_strategy.register("rocm") def batch_matmul_strategy_rocm(attrs, inputs, out_type, target): """Batch matmul strategy for ROCM""" - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_batch_matmul(topi.cuda.batch_matmul), - wrap_topi_schedule(topi.cuda.schedule_batch_matmul), - name="batch_matmul.cuda", - plevel=10, - ) + strategy = batch_matmul_strategy_cuda(attrs, inputs, out_type, target) + if target.kind.name == "rocm" and "rocblas" in target.libs: assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported." strategy.add_implementation( diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 93b1ad7a44a8..0d198c470bb6 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -387,6 +387,12 @@ def is_aarch64_arm(): return "aarch64" in target.attrs.get("mtriple", "") +def is_rocm(): + """Checks whether we are compiling for a rocm/spirv target.""" + target = tvm.target.Target.current(allow_none=False) + return "rocm" in target.keys + + def is_vulkan(): """Checks whether we are compiling for a vulkan/spirv target.""" target = tvm.target.Target.current(allow_none=False) @@ -456,7 +462,7 @@ def _qnn_dense_legalize_intel_cpu(attrs, inputs, types): @qnn_conv2d_legalize.register(["cuda", "gpu"]) def _qnn_conv2d_legalize_cuda(attrs, inputs, types): - if is_vulkan(): + if is_vulkan() or is_rocm(): # prefers the dtypes to be same. Mixed type is not yet supported. 
return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) if is_cuda(): @@ -467,7 +473,7 @@ def _qnn_conv2d_legalize_cuda(attrs, inputs, types): @qnn_dense_legalize.register(["cuda", "gpu"]) def _qnn_dense_legalize_cuda(attrs, inputs, types): - if is_vulkan(): + if is_vulkan() or is_rocm(): # prefers the dtypes to be same. Mixed type is not yet supported. return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense) if is_cuda(): diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py index 62159851b3d4..4115c3b90070 100644 --- a/python/tvm/tir/tensor_intrin/__init__.py +++ b/python/tvm/tir/tensor_intrin/__init__.py @@ -18,3 +18,5 @@ """Intrinsics for tensorization.""" from .x86 import * from .arm_cpu import * +from .dot_product_common import * +from .rocm import * diff --git a/python/tvm/tir/tensor_intrin/dot_product_common.py b/python/tvm/tir/tensor_intrin/dot_product_common.py new file mode 100644 index 000000000000..c531b80380e3 --- /dev/null +++ b/python/tvm/tir/tensor_intrin/dot_product_common.py @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,missing-function-docstring +"""Dot product related intrinsics.""" +from tvm.script import tir as T +from .. import TensorIntrin + + +@T.prim_func +def dp4a_desc( + A: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"), + B: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"), + C: T.Buffer((1,), "int32", offset_factor=1, align=4, scope="local"), +) -> None: + with T.block("root"): + T.reads(C[0], A[0:4], B[0:4]) + T.writes(C[0]) + for i in range(0, 4): + with T.block("update"): + vi = T.axis.remap("R", [i]) + C[0] = C[0] + T.cast(A[vi], "int32") * T.cast(B[vi], "int32") + + +@T.prim_func +def dp4a_impl( + A: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"), + B: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"), + C: T.Buffer((1,), "int32", offset_factor=1, align=4, scope="local"), +) -> None: + with T.block("root"): + T.reads(C[0], A[0:4], B[0:4]) + T.writes(C[0]) + + C[0] += T.call_pure_extern( + "__dp4a", A.vload([0], "int8x4"), B.vload([0], "int8x4"), T.int32(0), dtype="int32" + ) + + +DP4A_INTRIN = "dp4a" + +TensorIntrin.register(DP4A_INTRIN, dp4a_desc, dp4a_impl) diff --git a/python/tvm/tir/tensor_intrin/rocm.py b/python/tvm/tir/tensor_intrin/rocm.py new file mode 100644 index 000000000000..2095eb163521 --- /dev/null +++ b/python/tvm/tir/tensor_intrin/rocm.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,missing-function-docstring +"""Intrinsics for AMDGPU tensorization.""" +from tvm.script import tir as T +from .. import TensorIntrin +from .dot_product_common import dp4a_desc + + +@T.prim_func +def sdot4( + A: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"), + B: T.Buffer((4,), "int8", offset_factor=1, align=4, scope="shared"), + C: T.Buffer((1,), "int32", offset_factor=1, align=4, scope="local"), +) -> None: + with T.block("root"): + T.reads(C[0], A[0:4], B[0:4]) + T.writes(C[0]) + + C[0] += T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.amdgcn.sdot4"), + T.uint32(4), + T.reinterpret(A.vload([0], "int8x4"), dtype="int32"), + T.reinterpret(B.vload([0], "int8x4"), dtype="int32"), + T.int32(0), + T.bool(1), + dtype="int32" + ) + + +AMDGPU_SDOT4_INTRIN = "sdot4" + +TensorIntrin.register(AMDGPU_SDOT4_INTRIN, dp4a_desc, sdot4) diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 5fce9d7a3f5d..859db6f00ebb 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -372,7 +372,7 @@ def _schedule_batch_matmul_int8(cfg, s, output): target = tvm.target.Target.current(allow_none=False) do_tensorize = True - if "vulkan" in target.keys: + if "vulkan" in target.keys or "rocm" in target.keys: do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index eaafe15e9600..7f52685e5d6d 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -34,7 +34,7 @@ @nn.conv2d_alter_layout.register(["cuda", "gpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) - doit = "vulkan" in target.keys or "cuda" in target.keys + doit = "vulkan" in target.keys or "cuda" in target.keys or "rocm" in target.keys if not doit: return None dispatch_ctx = autotvm.task.DispatchContext.current @@ -87,7 +87,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if cfg.is_fallback: # if is fallback, clear query cache and return None autotvm.task.clear_fallback_cache(target, workload) do_new_layout = False - if "vulkan" in target.keys: + if "vulkan" in target.keys or "rocm" in target.keys: do_new_layout = "+dotprod" in target.mattr or target.supports_integer_dot_product if not do_new_layout: return None @@ -351,7 +351,7 @@ def _conv2d_legalize(attrs, inputs, arg_types): """ target = tvm.target.Target.current(allow_none=False) - doit = "vulkan" in target.keys or "cuda" in target.keys + doit = "vulkan" in target.keys or "cuda" in target.keys or "rocm" in target.keys if not doit: return None # Dilation not supported yet. 
Return None if dilation is not (1, 1) diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index 15120f6a2532..3c530445e92f 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -312,8 +312,8 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): _, rc_block = s[conv].split(rc_block, factor=4) target = tvm.target.Target.current(allow_none=False) do_tensorize = True - if "vulkan" in target.keys: - do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + # if "vulkan" in target.keys or "rocm" in target.keys: + # do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: dtypes = (pad_data.dtype, packed_kernel.dtype) s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes)) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 862e7b5bc59d..e7e651eefd8a 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -173,8 +173,9 @@ def _schedule_dense_int8(cfg, s, output): ko, kt = cfg["tile_k"].apply(s, CC, ko) target = tvm.target.Target.current(allow_none=False) do_tensorize = True - if "vulkan" in target.keys: - do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + # if "vulkan" in target.keys or "rocm" in target.keys: + # do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + if do_tensorize: dtypes = (data.dtype, weight.dtype) s[CC].tensorize(ki, dp4a("shared", "shared", "local", dtypes)) diff --git a/python/tvm/topi/cuda/tensor_intrin.py b/python/tvm/topi/cuda/tensor_intrin.py index c0596fc43262..6bb143140a41 100644 --- a/python/tvm/topi/cuda/tensor_intrin.py +++ b/python/tvm/topi/cuda/tensor_intrin.py @@ -71,7 +71,11 @@ def _instr(index): vec_y = yy.vload(0, dtype=vec_y_dtype) prev_z = 0 if index == 0 else zz.vload(0) - new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) + # new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) + new_z = tvm.tir.call_llvm_pure_intrin(zz_dtype, "llvm.amdgcn.sdot4", tvm.tir.const(4, "uint32"), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y), + prev_z, True) ib.emit(zz.vstore(0, new_z)) return ib.get() diff --git a/python/tvm/topi/rocm/dense.py b/python/tvm/topi/rocm/dense.py index 2f3ce77cc7ba..983f235f0ec8 100644 --- a/python/tvm/topi/rocm/dense.py +++ b/python/tvm/topi/rocm/dense.py @@ -19,85 +19,8 @@ from tvm import te from tvm import autotvm from tvm.contrib import rocblas -from .. import generic, nn +from .. import generic from .. import tag -from ..utils import traverse_inline - - -@autotvm.register_topi_compute("dense.rocm") -def dense(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator for rocm backend. - - Parameters - ---------- - data : tvm.te.Tensor - 2-D with shape [batch, in_dim] - - weight : tvm.te.Tensor - 2-D with shape [out_dim, in_dim] - - bias : tvm.te.Tensor, optional - 1-D with shape [out_dim] - - out_dtype : str - The output type. This is used for mixed precision. 
- - Returns - ------- - output : tvm.te.Tensor - 2-D with shape [batch, out_dim] - """ - assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim dense" - if bias is not None: - assert len(bias.shape) == 1 - if out_dtype is None: - out_dtype = data.dtype - return nn.dense(data, weight, bias, out_dtype) - - -@autotvm.register_topi_schedule("dense.rocm") -def schedule_dense(cfg, outs): - """Schedule for dense operator. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of dense - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for dense. - """ - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - s = te.create_schedule([x.op for x in outs]) - - def _callback(op): - if op.tag == "dense": - Dense = op.output(0) - num_thread = 64 - k = Dense.op.reduce_axis[0] - ko, kf = s[Dense].split(k, factor=num_thread) - DenseF = s.rfactor(Dense, kf) - - if Dense.op in s.outputs: - Out = Dense - else: - Out = outs[0].op.output(0) - s[Dense].compute_at(s[Out], s[Out].op.axis[1]) - s[Out].bind(s[Out].op.axis[0], te.thread_axis("blockIdx.y")) - s[Out].bind(s[Out].op.axis[1], te.thread_axis("blockIdx.x")) - - tx = s[Dense].op.reduce_axis[0] - thread_x = te.thread_axis("threadIdx.x") - s[Dense].bind(tx, thread_x) - s[DenseF].compute_at(s[Dense], tx) - s[Dense].set_store_predicate(thread_x.var.equal(0)) - s[Out].set_store_predicate(thread_x.var.equal(0)) - - traverse_inline(s, outs[0].op, _callback) - return s @autotvm.register_topi_compute("dense_rocblas.rocm") diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 860118531e51..b93236b8cee6 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -346,45 +346,45 @@ def get_ref_data(): tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) targets = [ - ( - "cuda", - lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - topi.cuda.schedule_conv2d_NCHWc_int8, - 4, - False, - ), - # Disable on CI since it does not support spirv int8 dot product # ( - # "vulkan -from_device=0", + # "cuda", # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), # topi.cuda.schedule_conv2d_NCHWc_int8, # 4, # False, # ), + # Disable on CI since it does not support spirv int8 dot product + ( + "rocm", + lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + topi.cuda.schedule_conv2d_NCHWc_int8, + 4, + False, + ), ] build_only_aarch64 = platform.machine() != "aarch64" - targets.append( - ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ) - ) - - if in_dtype == "int8": - targets.append( - ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ) - ) + # targets.append( + # ( + # "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", + # topi.arm_cpu.conv2d_NCHWc_int8, + # topi.arm_cpu.schedule_conv2d_NCHWc_int8, + # 8, + # build_only_aarch64, + # ) + # ) + + # if in_dtype == "int8": + # targets.append( + # ( + # "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", + # topi.arm_cpu.conv2d_NCHWc_int8, + # topi.arm_cpu.schedule_conv2d_NCHWc_int8, + # 8, + # 
build_only_aarch64, + # ) + # ) for target, compute, schedule, oc_block_factor, build_only in targets: check_target(target, compute, schedule, oc_block_factor, build_only) @@ -517,6 +517,7 @@ def test_conv2d_nchw(in_dtype): with Int8Fallback(): # ResNet18 workloads where channels in / out are multiple of oc_block_factor verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) + return verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 1, 1, 0) verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 3, 2, 1) verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 1, 2, 0) diff --git a/tests/python/topi/python/test_topi_dense.py b/tests/python/topi/python/test_topi_dense.py index 8f58415da329..2826d70ba0ed 100644 --- a/tests/python/topi/python/test_topi_dense.py +++ b/tests/python/topi/python/test_topi_dense.py @@ -52,7 +52,6 @@ ], "mali": [(topi.mali.dense, topi.mali.schedule_dense)], "bifrost": [(topi.bifrost.dense, topi.bifrost.schedule_dense)], - "rocm": [(topi.rocm.dense, topi.rocm.schedule_dense)], "hls": [(topi.nn.dense, topi.hls.schedule_dense)], } From 5d494fdf16c26bfe8d1ecfb81e5f332781425db0 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 14 Apr 2022 10:58:10 +0900 Subject: [PATCH 02/10] cleanup --- python/tvm/relay/qnn/op/legalizations.py | 28 ++------ python/tvm/topi/cuda/batch_matmul.py | 4 +- python/tvm/topi/cuda/conv2d_alter_op.py | 12 ++-- python/tvm/topi/cuda/conv2d_int8.py | 6 +- python/tvm/topi/cuda/dense.py | 6 +- python/tvm/topi/utils.py | 6 ++ .../topi/python/test_topi_conv2d_int8.py | 65 ++++++++++--------- 7 files changed, 59 insertions(+), 68 deletions(-) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 0d198c470bb6..e669e14032f9 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -24,6 +24,7 @@ from tvm.relay.qnn.op.canonicalizations import create_integer_lookup_op from ....topi.x86.utils import target_has_sse42 +from ....topi.utils import is_target from .. import op as reg ################################################# @@ -387,24 +388,6 @@ def is_aarch64_arm(): return "aarch64" in target.attrs.get("mtriple", "") -def is_rocm(): - """Checks whether we are compiling for a rocm/spirv target.""" - target = tvm.target.Target.current(allow_none=False) - return "rocm" in target.keys - - -def is_vulkan(): - """Checks whether we are compiling for a vulkan/spirv target.""" - target = tvm.target.Target.current(allow_none=False) - return "vulkan" in target.keys - - -def is_cuda(): - """Checks whether we are compiling for a cuda target.""" - target = tvm.target.Target.current(allow_none=False) - return "cuda" in target.keys - - ######################## # ARM CPU legalizations. ######################## @@ -462,10 +445,10 @@ def _qnn_dense_legalize_intel_cpu(attrs, inputs, types): @qnn_conv2d_legalize.register(["cuda", "gpu"]) def _qnn_conv2d_legalize_cuda(attrs, inputs, types): - if is_vulkan() or is_rocm(): + if is_target("vulkan"): # prefers the dtypes to be same. Mixed type is not yet supported. return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) - if is_cuda(): + if is_target(["cuda", "rocm"]): # CUDA prefers both datatypes to be int8. 
return helper_change_dtypes_to_int8(attrs, inputs, types, relay.qnn.op.conv2d) return None @@ -473,11 +456,10 @@ def _qnn_conv2d_legalize_cuda(attrs, inputs, types): @qnn_dense_legalize.register(["cuda", "gpu"]) def _qnn_dense_legalize_cuda(attrs, inputs, types): - if is_vulkan() or is_rocm(): + if is_target("vulkan"): # prefers the dtypes to be same. Mixed type is not yet supported. return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense) - if is_cuda(): + if is_target(["cuda", "rocm"]): # CUDA prefers both datatypes to be the int8. return helper_change_dtypes_to_int8(attrs, inputs, types, relay.qnn.op.dense) - return None diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 859db6f00ebb..8c156dc5db8e 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -22,7 +22,7 @@ from tvm.contrib import cublas from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. import nn, generic -from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor +from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor, is_target from .tensor_intrin import dp4a @@ -372,7 +372,7 @@ def _schedule_batch_matmul_int8(cfg, s, output): target = tvm.target.Target.current(allow_none=False) do_tensorize = True - if "vulkan" in target.keys or "rocm" in target.keys: + if is_target(["vulkan", "rocm"]): do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index 7f52685e5d6d..35d50eb3673c 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -22,7 +22,7 @@ from tvm import te, relay, autotvm from .. import nn -from ..utils import get_const_tuple +from ..utils import get_const_tuple, is_target from .conv2d_winograd import _infer_tile_size from .tensorcore_alter_op import pad_to_tensorcore from ..nn import conv2d_legalize @@ -34,8 +34,7 @@ @nn.conv2d_alter_layout.register(["cuda", "gpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) - doit = "vulkan" in target.keys or "cuda" in target.keys or "rocm" in target.keys - if not doit: + if not is_target(["vulkan", "rocm", "cuda"]): return None dispatch_ctx = autotvm.task.DispatchContext.current @@ -87,7 +86,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if cfg.is_fallback: # if is fallback, clear query cache and return None autotvm.task.clear_fallback_cache(target, workload) do_new_layout = False - if "vulkan" in target.keys or "rocm" in target.keys: + if is_target(["vulkan", "rocm"]): do_new_layout = "+dotprod" in target.mattr or target.supports_integer_dot_product if not do_new_layout: return None @@ -349,10 +348,7 @@ def _conv2d_legalize(attrs, inputs, arg_types): result : tvm.relay.Expr The legalized expr """ - - target = tvm.target.Target.current(allow_none=False) - doit = "vulkan" in target.keys or "cuda" in target.keys or "rocm" in target.keys - if not doit: + if not is_target(["vulkan", "rocm", "cuda"]): return None # Dilation not supported yet. 
Return None if dilation is not (1, 1) dilation = attrs.get_int_tuple("dilation") diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index 3c530445e92f..a8b21a1deca0 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -26,7 +26,7 @@ from ..nn.pad import pad from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.utils import get_pad_tuple -from ..utils import get_const_tuple, traverse_inline +from ..utils import get_const_tuple, traverse_inline, is_target def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"): @@ -312,8 +312,8 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): _, rc_block = s[conv].split(rc_block, factor=4) target = tvm.target.Target.current(allow_none=False) do_tensorize = True - # if "vulkan" in target.keys or "rocm" in target.keys: - # do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + if is_target(["vulkan", "rocm"]): + do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: dtypes = (pad_data.dtype, packed_kernel.dtype) s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes)) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index e7e651eefd8a..859f6c1097c6 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -24,7 +24,7 @@ from .tensor_intrin import dp4a from .. import tag from .. import generic -from ..utils import traverse_inline, get_const_tuple +from ..utils import traverse_inline, get_const_tuple, is_target logger = logging.getLogger("topi") @@ -173,8 +173,8 @@ def _schedule_dense_int8(cfg, s, output): ko, kt = cfg["tile_k"].apply(s, CC, ko) target = tvm.target.Target.current(allow_none=False) do_tensorize = True - # if "vulkan" in target.keys or "rocm" in target.keys: - # do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + if is_target(["vulkan", "rocm"]): + do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: dtypes = (data.dtype, weight.dtype) diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py index af68ee905e56..3ea82530890d 100644 --- a/python/tvm/topi/utils.py +++ b/python/tvm/topi/utils.py @@ -524,3 +524,9 @@ def ceil_div(a, b): def swap(arr, axis): """swap arr[axis] and arr[-1]""" return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]] + + +def is_target(names): + names = [names] if isinstance(names, str) else names + target = tvm.target.Target.current(allow_none=False) + return any(name in target.keys for name in names) diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index b93236b8cee6..688151d4a5b0 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -346,45 +346,52 @@ def get_ref_data(): tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) targets = [ - # ( - # "cuda", - # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - # topi.cuda.schedule_conv2d_NCHWc_int8, - # 4, - # False, - # ), - # Disable on CI since it does not support spirv int8 dot product ( - "rocm", + "cuda", lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), topi.cuda.schedule_conv2d_NCHWc_int8, 4, False, ), + # Disable on CI since it does not support spirv int8 dot product or rocm + # ( + # "vulkan -from_device=0", + # lambda a, w, s, p, d, l, ol, 
o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + # topi.cuda.schedule_conv2d_NCHWc_int8, + # 4, + # False, + # ), + # ( + # "rocm", + # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + # topi.cuda.schedule_conv2d_NCHWc_int8, + # 4, + # False, + # ), ] build_only_aarch64 = platform.machine() != "aarch64" - # targets.append( - # ( - # "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", - # topi.arm_cpu.conv2d_NCHWc_int8, - # topi.arm_cpu.schedule_conv2d_NCHWc_int8, - # 8, - # build_only_aarch64, - # ) - # ) - - # if in_dtype == "int8": - # targets.append( - # ( - # "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", - # topi.arm_cpu.conv2d_NCHWc_int8, - # topi.arm_cpu.schedule_conv2d_NCHWc_int8, - # 8, - # build_only_aarch64, - # ) - # ) + targets.append( + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, + ) + ) + + if in_dtype == "int8": + targets.append( + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, + ) + ) for target, compute, schedule, oc_block_factor, build_only in targets: check_target(target, compute, schedule, oc_block_factor, build_only) From 3ed4273c0de9d955548a8f7f4c15b0ba58fde595 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 14 Apr 2022 11:02:43 +0900 Subject: [PATCH 03/10] black --- python/tvm/tir/tensor_intrin/rocm.py | 2 +- python/tvm/topi/cuda/tensor_intrin.py | 13 +++++++++---- python/tvm/topi/utils.py | 1 + tests/python/topi/python/test_topi_conv2d_int8.py | 1 - 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/rocm.py b/python/tvm/tir/tensor_intrin/rocm.py index 2095eb163521..7a989d0bccaa 100644 --- a/python/tvm/tir/tensor_intrin/rocm.py +++ b/python/tvm/tir/tensor_intrin/rocm.py @@ -38,7 +38,7 @@ def sdot4( T.reinterpret(B.vload([0], "int8x4"), dtype="int32"), T.int32(0), T.bool(1), - dtype="int32" + dtype="int32", ) diff --git a/python/tvm/topi/cuda/tensor_intrin.py b/python/tvm/topi/cuda/tensor_intrin.py index 6bb143140a41..34b00e45729b 100644 --- a/python/tvm/topi/cuda/tensor_intrin.py +++ b/python/tvm/topi/cuda/tensor_intrin.py @@ -72,10 +72,15 @@ def _instr(index): prev_z = 0 if index == 0 else zz.vload(0) # new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) - new_z = tvm.tir.call_llvm_pure_intrin(zz_dtype, "llvm.amdgcn.sdot4", tvm.tir.const(4, "uint32"), - tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x), - tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y), - prev_z, True) + new_z = tvm.tir.call_llvm_pure_intrin( + zz_dtype, + "llvm.amdgcn.sdot4", + tvm.tir.const(4, "uint32"), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y), + prev_z, + True, + ) ib.emit(zz.vstore(0, new_z)) return ib.get() diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py index 3ea82530890d..f1c6fb5aa4f4 100644 --- a/python/tvm/topi/utils.py +++ b/python/tvm/topi/utils.py @@ -527,6 +527,7 @@ def swap(arr, axis): def is_target(names): + """Return True if the name of the current target is one of provided names""" names = [names] if isinstance(names, str) else names target = tvm.target.Target.current(allow_none=False) return any(name in target.keys for name in names) diff --git 
a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 688151d4a5b0..5edc3a393fd6 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -524,7 +524,6 @@ def test_conv2d_nchw(in_dtype): with Int8Fallback(): # ResNet18 workloads where channels in / out are multiple of oc_block_factor verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - return verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 1, 1, 0) verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 3, 2, 1) verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 1, 2, 0) From cc37698244ac272d64bf947091d60d49d307a493 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 14 Apr 2022 11:12:14 +0900 Subject: [PATCH 04/10] update dot prod intrin --- python/tvm/topi/cuda/tensor_intrin.py | 32 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/python/tvm/topi/cuda/tensor_intrin.py b/python/tvm/topi/cuda/tensor_intrin.py index 34b00e45729b..dfd10c6256ea 100644 --- a/python/tvm/topi/cuda/tensor_intrin.py +++ b/python/tvm/topi/cuda/tensor_intrin.py @@ -18,6 +18,7 @@ """Tensor intrinsics on CUDA.""" import tvm from tvm import te +from ..utils import traverse_inline, get_const_tuple, is_target def dp4a(x_scope="local", y_scope="local", z_scope="local", dtypes=("int8", "int8")): @@ -71,16 +72,27 @@ def _instr(index): vec_y = yy.vload(0, dtype=vec_y_dtype) prev_z = 0 if index == 0 else zz.vload(0) - # new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) - new_z = tvm.tir.call_llvm_pure_intrin( - zz_dtype, - "llvm.amdgcn.sdot4", - tvm.tir.const(4, "uint32"), - tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x), - tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y), - prev_z, - True, - ) + if is_target("rocm"): + # TODO(masahi): Here we are assuming that we are compiling for gfx10 or later + # We can refine the specification for dot product on rocm if needed later. + + # We can just use "llvm.amdgcn.udot4" for u8u8u32, but it is not tested. 
+ assert ( + dtypes[0] == "int8" and dtypes[0] == "int8" + ), "u8u8u32 dot product for rocm not supported yet" + + new_z = tvm.tir.call_llvm_pure_intrin( + zz_dtype, + "llvm.amdgcn.sdot4", + tvm.tir.const(4, "uint32"), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_x), + tvm.tir.call_intrin("int32", "tir.reinterpret", vec_y), + prev_z, + True, + ) + else: + new_z = tvm.tir.call_pure_extern(zz_dtype, "__dp4a", vec_x, vec_y, prev_z) + ib.emit(zz.vstore(0, new_z)) return ib.get() From 9e975ab3880d3ee967394457149d59c4d81a6120 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 14 Apr 2022 11:33:38 +0900 Subject: [PATCH 05/10] add mattr kind --- python/tvm/topi/cuda/batch_matmul.py | 3 --- src/target/target_kind.cc | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 8c156dc5db8e..ff625d6d714c 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -333,9 +333,6 @@ def _callback(op): return s -_dp4a = dp4a("shared", "shared", "local") - - def _schedule_batch_matmul_int8(cfg, s, output): input_x, input_y = s[output].op.input_tensors if len(input_y.op.input_tensors) == 1 and input_y.op.input_tensors[0] == input_x: diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 6fef8b48c396..2bc792f398cd 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -306,6 +306,7 @@ TVM_REGISTER_TARGET_KIND("nvptx", kDLCUDA) TVM_REGISTER_TARGET_KIND("rocm", kDLROCM) .add_attr_option("mcpu") .add_attr_option("mtriple") + .add_attr_option("mattr") .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(64)) From f9d6a8d4caf58b2d6ca27d4e8695c4e648ae5cc7 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 14 Apr 2022 16:32:43 +0900 Subject: [PATCH 06/10] conv2d topi test working --- src/target/target_kind.cc | 2 +- .../topi/python/test_topi_conv2d_int8.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 2bc792f398cd..96c193d34aa1 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -306,7 +306,7 @@ TVM_REGISTER_TARGET_KIND("nvptx", kDLCUDA) TVM_REGISTER_TARGET_KIND("rocm", kDLROCM) .add_attr_option("mcpu") .add_attr_option("mtriple") - .add_attr_option("mattr") + .add_attr_option>("mattr") .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(64)) diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 5edc3a393fd6..cb106af7a571 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -361,13 +361,6 @@ def get_ref_data(): # 4, # False, # ), - # ( - # "rocm", - # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - # topi.cuda.schedule_conv2d_NCHWc_int8, - # 4, - # False, - # ), ] build_only_aarch64 = platform.machine() != "aarch64" @@ -383,15 +376,22 @@ def get_ref_data(): ) if in_dtype == "int8": - targets.append( + targets += [ ( "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", topi.arm_cpu.conv2d_NCHWc_int8, topi.arm_cpu.schedule_conv2d_NCHWc_int8, 8, build_only_aarch64, - ) - ) + ), + ( + "rocm -mattr=+dotprod", + lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), 
+ topi.cuda.schedule_conv2d_NCHWc_int8, + 4, + False, + ), + ] for target, compute, schedule, oc_block_factor, build_only in targets: check_target(target, compute, schedule, oc_block_factor, build_only) From 90c39f1275057450f244baca2647715dea570e9e Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 14 Apr 2022 16:46:05 +0900 Subject: [PATCH 07/10] add dense and bmm test --- tests/python/relay/test_op_level1.py | 38 +++++++++++++++++++ tests/python/relay/test_op_level10.py | 35 +++++++++++++++++ .../topi/python/test_topi_conv2d_int8.py | 2 +- 3 files changed, 74 insertions(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index c7aceb685bcf..d4238f81e01b 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -676,5 +676,43 @@ def test_dense_vnni(): np.testing.assert_equal(out, ref) +@pytest.mark.skip("Requires GFX10 AMDGPU") +def test_dense_rocm_sdot4(): + data_shape = (32, 96) + weight_shape = (128, 96) + + data_dtype = "int8" + data = relay.var("data", shape=data_shape, dtype=data_dtype) + weight = relay.var("weight", shape=weight_shape, dtype="int8") + bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32") + dense = relay.nn.dense(data, weight, out_dtype="int32") + out = relay.nn.bias_add(dense, bias) + mod = tvm.IRModule.from_expr(out) + + target = "rocm -mattr=+dotprod" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target) + + asm = lib.lib.imported_modules[0].get_source("asm") + assert "v_dot4_i32_i8" in asm + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype) + b = np.random.uniform(1, 10, size=weight_shape).astype("int8") + c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32") + + runtime.set_input("data", a) + runtime.set_input("weight", b) + runtime.set_input("bias", c) + runtime.run() + + out = runtime.get_output(0).numpy() + ref = np.dot(a.astype("int32"), b.transpose().astype("int32")) + c + + np.testing.assert_equal(out, ref) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 85a3dd5636f1..8ee5adbb318d 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -447,6 +447,41 @@ def test_batch_matmul_vnni(): np.testing.assert_equal(out, ref) +@pytest.mark.skip("Requires GFX10 AMDGPU") +def test_batch_matmul_rocm_sdot4(): + x_shape = (16, 32, 96) + y_shape = (16, 128, 96) + + lhs_dtype = "int8" + x = relay.var("x", shape=x_shape, dtype=lhs_dtype) + y = relay.var("y", shape=y_shape, dtype="int8") + bmm = relay.nn.batch_matmul(x, y, out_dtype="int32") + + mod = tvm.IRModule.from_expr(bmm) + + target = "rocm -mattr=+dotprod" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target) + + asm = lib.lib.imported_modules[0].get_source("asm") + assert "v_dot4_i32_i8" in asm + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype) + y_np = np.random.uniform(1, 10, size=y_shape).astype("int8") + + runtime.set_input("x", x_np) + runtime.set_input("y", y_np) + runtime.run() + + out = runtime.get_output(0).numpy() + ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32") + + np.testing.assert_equal(out, ref) + + 
 @tvm.testing.uses_gpu
 def test_shape_of():
     shape = (10, 5, 12)
diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py
index cb106af7a571..17c5573b2c70 100644
--- a/tests/python/topi/python/test_topi_conv2d_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_int8.py
@@ -353,7 +353,7 @@ def get_ref_data():
             4,
             False,
         ),
-        # Disable on CI since it does not support spirv int8 dot product or rocm
+        # Disable on CI since it does not support spirv int8 dot product
         # (
         #     "vulkan -from_device=0",
         #     lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o),

From 98f735963de898b346aa0958e2a60c70c9a3a8cc Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 14 Apr 2022 16:53:34 +0900
Subject: [PATCH 08/10] add conv2d relay test

---
 tests/python/relay/test_op_level2.py | 76 ++++++++++++++++++++++++----
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index bd9536742a8b..273d230d3af5 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -1582,18 +1582,24 @@ def _test_upsampling3d(layout, method, coordinate_transformation_mode="half_pixe

     def get_shape():
         if layout == "NCDHW":
-            return (c, d, h, w), (
-                c,
-                int(round(d * scale_d)),
-                int(round(h * scale_h)),
-                int(round(w * scale_w)),
+            return (
+                (c, d, h, w),
+                (
+                    c,
+                    int(round(d * scale_d)),
+                    int(round(h * scale_h)),
+                    int(round(w * scale_w)),
+                ),
             )
         else:
-            return (d, h, w, c), (
-                int(round(d * scale_d)),
-                int(round(h * scale_h)),
-                int(round(w * scale_w)),
-                c,
+            return (
+                (d, h, w, c),
+                (
+                    int(round(d * scale_d)),
+                    int(round(h * scale_h)),
+                    int(round(w * scale_w)),
+                    c,
+                ),
             )

     ishape, oshape = get_shape()
@@ -1944,5 +1950,55 @@ def _test_correlation(
     )


+@pytest.mark.skip("Requires GFX10 AMDGPU")
+def test_conv2d_rocm_sdot4():
+    d_shape = (1, 64, 56, 56)
+    w_shape = (64, 64, 3, 3)
+    padding = (1, 1)
+    strides = (1, 1)
+    data_dtype = "int8"
+    weight_dtype = "int8"
+    out_dtype = "int32"
+
+    data = relay.var("data", shape=d_shape, dtype=data_dtype)
+    weight = relay.var("weight", shape=w_shape, dtype=weight_dtype)
+    out_channel = w_shape[0]
+    conv2d = relay.nn.conv2d(
+        data=data,
+        weight=weight,
+        kernel_size=w_shape[2:],
+        channels=out_channel,
+        padding=padding,
+        strides=strides,
+        out_dtype=out_dtype,
+    )
+
+    mod = tvm.IRModule.from_expr(conv2d)
+
+    data_np = np.random.uniform(1, 10, d_shape).astype("int8")
+    weight_np = np.random.uniform(1, 10, size=w_shape).astype("int8")
+
+    target = "rocm -mattr=+dotprod"
+    with tvm.transform.PassContext(opt_level=3):
+        lib = relay.build(mod, target=target, params={"weight": weight_np})
+
+    asm = lib.lib.imported_modules[0].get_source("asm")
+    assert "v_dot4_i32_i8" in asm
+
+    dev = tvm.device(target, 0)
+    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+    runtime.set_input("data", data_np)
+    runtime.run()
+
+    out = runtime.get_output(0).numpy()
+
+    ref = tvm.topi.testing.conv2d_nchw_python(
+        data_np.astype("int32"), weight_np.astype("int32"), strides, padding
+    )
+
+    np.testing.assert_equal(out, ref)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(sys.argv))

From 9a383a699c670adfb04bca45553c198f8825103c Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 14 Apr 2022 17:28:03 +0900
Subject: [PATCH 09/10] add tir intrin test

---
 tests/python/relay/test_op_level2.py          | 26 ++++------
 .../unittest/test_tir_schedule_tensorize.py   | 50 +++++++++++++++++++
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 273d230d3af5..7b261b0eb7cd 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -1582,24 +1582,18 @@ def _test_upsampling3d(layout, method, coordinate_transformation_mode="half_pixe

     def get_shape():
         if layout == "NCDHW":
-            return (
-                (c, d, h, w),
-                (
-                    c,
-                    int(round(d * scale_d)),
-                    int(round(h * scale_h)),
-                    int(round(w * scale_w)),
-                ),
+            return (c, d, h, w), (
+                c,
+                int(round(d * scale_d)),
+                int(round(h * scale_h)),
+                int(round(w * scale_w)),
             )
         else:
-            return (
-                (d, h, w, c),
-                (
-                    int(round(d * scale_d)),
-                    int(round(h * scale_h)),
-                    int(round(w * scale_w)),
-                    c,
-                ),
+            return (d, h, w, c), (
+                int(round(d * scale_d)),
+                int(round(h * scale_h)),
+                int(round(w * scale_w)),
+                c,
             )

     ishape, oshape = get_shape()
diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py
index 482d6f3db574..65dfa06eb6c1 100644
--- a/tests/python/unittest/test_tir_schedule_tensorize.py
+++ b/tests/python/unittest/test_tir_schedule_tensorize.py
@@ -26,6 +26,8 @@
     VNNI_DOT_16x4_INTRIN,
     ARM_DOT_4x4_i8_NEON_INTRIN,
     ARM_DOT_4x4_i8_SDOT_INTRIN,
+    AMDGPU_SDOT4_INTRIN,
+    DP4A_INTRIN,
 )

 # fmt: off
@@ -595,5 +597,53 @@ def test_tensorize_arm_dot():
     verify_trace_roundtrip(sch=sch, mod=func)


+def test_tensorize_dpa4():
+    m, n, k = 128, 128, 128
+
+    X = te.placeholder((m, k), name="X", dtype="int8")
+    W = te.placeholder((n, k), name="W", dtype="int8")
+    ak = te.reduce_axis((0, k), name="k")
+
+    matmul = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * W[j, ak].astype("int32"),
+            axis=ak,
+        ),
+        name="compute",
+    )
+
+    func = te.create_prim_func([X, W, matmul])
+
+    for intrin in [AMDGPU_SDOT4_INTRIN, DP4A_INTRIN]:
+        sch = tir.Schedule(func, debug_mask="all")
+        block = sch.get_block("compute")
+        i, j, k = sch.get_loops(block)
+
+        by, ty, yi = sch.split(i, factors=sch.sample_perfect_tile(i, n=3))
+        bx, tx, xi = sch.split(j, factors=sch.sample_perfect_tile(j, n=3))
+        ko, ki = sch.split(k, [None, 4])
+        ko, kt = sch.split(ko, factors=sch.sample_perfect_tile(ko, n=2))
+
+        sch.reorder(by, bx, ty, tx, yi, xi)
+
+        CC = sch.cache_write(block, 0, "local")
+        sch.reverse_compute_at(CC, tx)
+
+        def fetch_to_shared(block, idx):
+            block_read = sch.cache_read(block, idx, "shared")
+            sch.compute_at(block_read, ko, True)
+            return block_read
+
+        fetch_to_shared(block, 0)
+        fetch_to_shared(block, 1)
+
+        sch.decompose_reduction(block, ko)
+        sch.tensorize(ki, intrin)
+
+        verify_trace_roundtrip(sch=sch, mod=func)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))

From ee7a24c516066aae876cc9c4def690cd958f52d5 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 14 Apr 2022 17:40:52 +0900
Subject: [PATCH 10/10] pylint

---
 python/tvm/relay/op/strategy/rocm.py  | 1 -
 python/tvm/topi/cuda/tensor_intrin.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py
index a6cc94d2b116..6e91101826c9 100644
--- a/python/tvm/relay/op/strategy/rocm.py
+++ b/python/tvm/relay/op/strategy/rocm.py
@@ -17,7 +17,6 @@
 """Definition of ROCm operator strategy."""
 # pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import
 from tvm import topi
-from tvm.auto_scheduler import is_auto_scheduler_enabled
 from tvm.te import SpecializedCondition
 from tvm.contrib.thrust import can_use_rocthrust
 from tvm.contrib import miopen
diff --git a/python/tvm/topi/cuda/tensor_intrin.py b/python/tvm/topi/cuda/tensor_intrin.py
index dfd10c6256ea..0a504906c053 100644
--- a/python/tvm/topi/cuda/tensor_intrin.py
+++ b/python/tvm/topi/cuda/tensor_intrin.py
@@ -18,7 +18,7 @@
 """Tensor intrinsics on CUDA."""
 import tvm
 from tvm import te
-from ..utils import traverse_inline, get_const_tuple, is_target
+from ..utils import is_target


 def dp4a(x_scope="local", y_scope="local", z_scope="local", dtypes=("int8", "int8")):
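For reference, a minimal usage sketch of what this series enables. This is not part of the patches themselves and assumes a TVM build that already contains them; the target string and the dtypes argument come from the diffs above, while the variable name sdot4_i8 is only illustrative.

import tvm
from tvm.topi.cuda.tensor_intrin import dp4a

# The rocm target kind now registers "mattr" (Array<String>), so the feature
# flag used throughout the tests above parses as an ordinary target attribute.
target = tvm.target.Target("rocm -mattr=+dotprod")
print(target.attrs["mattr"])  # expected: ["+dotprod"]

# The generic dp4a declaration now takes its operand dtypes explicitly; this
# is the i8i8i32 variant that the CUDA/ROCm TOPI schedules tensorize with.
sdot4_i8 = dp4a("shared", "shared", "local", dtypes=("int8", "int8"))

Keeping dp4a generic over dtypes lets one declaration serve both the CUDA __dp4a path and the ROCm llvm.amdgcn.sdot4 path, with the backend chosen via is_target when the intrinsic body is built.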