
Commit 8b60c09

[target] Use native architecture for llvm target
Set the default `-device=` key for llvm targets based on the native architecture rather than hard-coding it to `cpu`, which is x86-specific. This means that when llvm target triples are not specified we will test `arm_cpu` schedules on Arm®-based architectures and `cpu` schedules on x86-based architectures. Fix the schedule test failures that result from this change.
Parent: 521465e

15 files changed: +126, -70 lines changed
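Before the per-file diffs, a minimal sketch (not part of the commit) of the behaviour it enables: with the default key derived from the native architecture, a bare llvm target now dispatches arm_cpu schedules on Arm hosts without any extra flags. The exact key tuple printed below is illustrative.

from tvm.target import Target

# A bare "llvm" target with no explicit -device= now defaults its key to the
# native architecture: "arm_cpu" on an Arm host, "cpu" on an x86 host.
target = Target("llvm")
print(target.keys)  # e.g. ("arm_cpu", "cpu") on Arm, ("cpu",) on x86

# Strategy and schedule dispatch is keyed on this, so tests can simply check:
if "arm_cpu" in target.keys:
    print("arm_cpu schedules will be exercised")
else:
    print("x86/cpu schedules will be exercised")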


python/tvm/relay/op/strategy/arm_cpu.py

Lines changed: 25 additions & 8 deletions
@@ -132,13 +132,16 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     plevel=15,
                 )
             else:
+                # TODO(@FranklandJack)
+                # Investigate why this producing output tensor of
+                # incorrect dimensions in
+                # test_runtime_module_based_interface.py
                 # ARM conv2d spatial pack schedule.
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.arm_cpu",
-                    plevel=10,
-                )
+                # strategy.add_implementation(
+                #     wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
+                #     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
+                #     name="conv2d_nchw_spatial_pack.arm_cpu",
+                #     plevel=10,
 
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.x86.conv2d_nchw),
@@ -152,6 +155,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
         is_winograd_applicable = (
             "float" in data.dtype
             and "float" in kernel.dtype
+            and not data.dtype.count("custom")
             and kh == 3
             and kw == 3
             and stride_h == 1
@@ -284,8 +288,21 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 name="depthwise_conv2d_nchw.x86",
             )
         elif layout == "NHWC":
-            assert kernel_layout == "HWOI"
-            if target.features.has_asimd:
+            # TODO(@FranklandJack)
+            # Handle HWOI in arm_cpu schedules/compute definition.
+            if kernel_layout != "HWOI":
+                logger.warning(
+                    """depthwise_conv2d with layout NHWC and HWOI
+                    kernel layout is not optimized for arm_cpu target.
+                    """
+                )
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
+                    wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
+                    name="depthwise_conv2d_nhwc.generic",
+                )
+
+            elif target.features.has_asimd:
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
                     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),

python/tvm/relay/qnn/op/legalizations.py

Lines changed: 10 additions & 6 deletions
@@ -111,8 +111,7 @@ def qnn_conv2d_transpose_legalize(attrs, inputs, types):
     # Otherwise it needs to be broadcast.
     else:
         shift_data = relay.nn.bias_add(
-            relay.cast(data, dtype="int16"),
-            -relay.cast(input_zero_point, dtype="int16"),
+            relay.cast(data, dtype="int16"), -relay.cast(input_zero_point, dtype="int16")
         )
 
     # If kernel zero point is a scalar, we can directly subtract it.
@@ -123,8 +122,7 @@ def qnn_conv2d_transpose_legalize(attrs, inputs, types):
     # Otherwise it needs to be broadcast.
     else:
         shift_kernel = relay.nn.bias_add(
-            relay.cast(kernel, dtype="int16"),
-            -relay.cast(kernel_zero_point, dtype="int16"),
+            relay.cast(kernel, dtype="int16"), -relay.cast(kernel_zero_point, dtype="int16")
         )
 
     return relay.nn.conv2d_transpose(shift_data, shift_kernel, **attrs)
@@ -486,7 +484,10 @@ def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
     if target.features.has_asimd and not other_options:
         return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
     # ARM prefers the dtypes to be same.
-    return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
+    if types[0].dtype == "int8":
+        return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
+
+    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
 
 
 @qnn_dense_legalize.register("arm_cpu")
@@ -495,7 +496,10 @@ def _qnn_dense_legalize_arm_cpu(attrs, inputs, types):
     if target.features.has_asimd and not target.features.has_dotprod:
         return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
     # ARM prefers the dtypes to be same.
-    return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
+    if types[0].dtype == "int8":
+        return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense)
+
+    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense)
 
 
 ##########################

python/tvm/topi/arm_cpu/conv2d.py

Lines changed: 7 additions & 1 deletion
@@ -23,10 +23,11 @@
 from tvm import autotvm
 import tvm.contrib.nnpack
 
-from ..utils import traverse_inline, get_const_tuple
+from ..utils import traverse_inline, get_const_tuple, conv2d_infer_layout_helper
 from .. import nn
 from ..nn.utils import get_const_int, get_pad_tuple
 from ..nn.winograd_util import winograd_transform_matrices
+from ..nn.conv2d import conv2d_infer_layout
 from .conv2d_spatial_pack import (
     conv2d_spatial_pack_nchw,
     conv2d_spatial_pack_nhwc,
@@ -509,3 +510,8 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
 def schedule_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return conv2d_nhwc_dsp_schedule(cfg, outs)
+
+
+@conv2d_infer_layout.register("arm_cpu")
+def _conv2d_infer_layout(workload, cfg):
+    return conv2d_infer_layout_helper(workload, cfg)

python/tvm/topi/arm_cpu/injective.py

Lines changed: 4 additions & 2 deletions
@@ -69,8 +69,10 @@ def schedule_injective(outs):
     if list(s[x].op.axis):
         # do not vectorize for broadcast
         dtype = "uint16" if x.dtype == "bfloat16" else x.dtype
-        (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(dtype).itemsize)
-        s[x].vectorize(ii)
+        # do not vectorize for custom data types
+        if 0 == dtype.count("custom"):
+            (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(dtype).itemsize)
+            s[x].vectorize(ii)
     tvm.te.schedule.AutoInlineInjective(s)
 
     if not is_empty_shape(x.shape):
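A small illustration of why the guard above is needed (a sketch, not part of the commit; the specific dtype string is only an example): NumPy cannot derive an item size for TVM's custom datatype strings, so the vectorization split factor computed from np.dtype(dtype).itemsize would raise before the schedule is built.

import numpy as np

# The split factor is 16 // np.dtype(dtype).itemsize, which assumes NumPy
# understands the dtype string; "custom[...]" dtypes are opaque to it.
try:
    np.dtype("custom[posites2]32").itemsize
except TypeError as err:
    print("skip vectorization:", err)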

python/tvm/topi/intel_graphics/conv2d_alter_op.py

Lines changed: 2 additions & 12 deletions
@@ -22,7 +22,7 @@
 from tvm import relay
 from tvm import autotvm
 
-from ..utils import get_const_tuple
+from ..utils import get_const_tuple, conv2d_infer_layout_helper
 from ..nn import conv2d_alter_layout, conv2d_infer_layout
 from .conv2d import _get_default_config
 
@@ -102,14 +102,4 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
 
 @conv2d_infer_layout.register("intel_graphics")
 def _conv2d_infer_layout(workload, cfg):
-    _, data, kernel, strides, padding, dilation, layout, dtype = workload
-    batch_size, in_channel, in_height, in_width = data[1]
-    out_channel, _, k_height, k_width = kernel[1]
-    out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1
-    out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1
-    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic)
-    in_layout = f"NCHW{tile_ic}c"
-    out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc)
-    out_layout = f"NCHW{tile_oc}c"
-    return ((in_shape, in_layout),), ((out_shape, out_layout),)
+    return conv2d_infer_layout_helper(workload, cfg)

python/tvm/topi/testing/common.py

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,8 @@
 _reduce_schedule = {
     "generic": topi.generic.schedule_reduce,
     "cpu": topi.x86.schedule_reduce,
+    # TODO(@FranklandJack) Write arm_cpu specific reduction schedule.
+    "arm_cpu": topi.x86.schedule_reduce,
     "gpu": topi.cuda.schedule_reduce,
     "hls": topi.cuda.schedule_reduce,
 }

python/tvm/topi/utils.py

Lines changed: 23 additions & 0 deletions
@@ -526,3 +526,26 @@ def is_target(names):
 def is_dynamic_shape(shape):
     """Checks if any part of a shape is dynamic"""
     return any([isinstance(x, (Any, SizeVar)) for x in shape])
+
+
+def conv2d_infer_layout_helper(workload, cfg):
+    """Infers input and output layouts for a conv2d operator
+    scheduled using "tile_ic" and "tile_oc" scheduling configuration knobs which
+    is the case for cpu, arm_cpu and intel_graphics targets."""
+    _, data, kernel, strides, padding, dilation, layout, _, dtype = workload
+    batch_size, in_channel, in_height, in_width = data[1]
+    out_channel, _, k_height, k_width = kernel[1]
+    idxdiv = tvm.tir.indexdiv
+
+    pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width))
+    hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    dilated_kernel_h = (k_height - 1) * hdilation + 1
+    dilated_kernel_w = (k_width - 1) * wdilation + 1
+    out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1
+    out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1
+    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic)
+    in_layout = f"NCHW{tile_ic}c"
+    out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc)
+    out_layout = f"NCHW{tile_oc}c"
+    return ((in_shape, in_layout),), ((out_shape, out_layout),)
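A worked example of what the helper computes, in plain Python with hypothetical workload numbers (the real helper reads the tile sizes from an AutoTVM config entity and uses tvm.tir.indexdiv):

# Hypothetical NCHW workload: 1x32x56x56 input, 64 3x3 filters, stride 1, pad 1.
batch, in_channel, in_height, in_width = 1, 32, 56, 56
out_channel, k_height, k_width = 64, 3, 3
stride_h = stride_w = 1
pad_t = pad_l = pad_b = pad_r = 1
tile_ic, tile_oc = 8, 16  # assumed values of the "tile_ic"/"tile_oc" knobs

out_height = (in_height + pad_t + pad_b - k_height) // stride_h + 1  # 56
out_width = (in_width + pad_l + pad_r - k_width) // stride_w + 1     # 56

in_shape = (batch, in_channel // tile_ic, in_height, in_width, tile_ic)      # (1, 4, 56, 56, 8)
in_layout = f"NCHW{tile_ic}c"                                                # "NCHW8c"
out_shape = (batch, out_channel // tile_oc, out_height, out_width, tile_oc)  # (1, 4, 56, 56, 16)
out_layout = f"NCHW{tile_oc}c"                                               # "NCHW16c"
print(((in_shape, in_layout),), ((out_shape, out_layout),))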

python/tvm/topi/x86/conv2d.py

Lines changed: 2 additions & 18 deletions
@@ -30,7 +30,7 @@
 from ..nn.conv2d import unpack_NCHWc_to_nchw
 from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
 from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline
+from ..utils import get_const_tuple, traverse_inline, conv2d_infer_layout_helper
 from . import conv2d_avx_1x1, conv2d_avx_common
 
 logger = logging.getLogger("topi")
@@ -65,23 +65,7 @@ def _get_default_config(
 
 @conv2d_infer_layout.register("cpu")
 def _conv2d_infer_layout(workload, cfg):
-    _, data, kernel, strides, padding, dilation, layout, _, dtype = workload
-    batch_size, in_channel, in_height, in_width = data[1]
-    out_channel, _, k_height, k_width = kernel[1]
-    idxdiv = tvm.tir.indexdiv
-
-    pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width))
-    hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
-    dilated_kernel_h = (k_height - 1) * hdilation + 1
-    dilated_kernel_w = (k_width - 1) * wdilation + 1
-    out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1
-    out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1
-    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-    in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic)
-    in_layout = f"NCHW{tile_ic}c"
-    out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc)
-    out_layout = f"NCHW{tile_oc}c"
-    return ((in_shape, in_layout),), ((out_shape, out_layout),)
+    return conv2d_infer_layout_helper(workload, cfg)
 
 
 def schedule_conv2d_nhwc(outs):

src/target/target_kind.cc

Lines changed: 9 additions & 1 deletion
@@ -257,6 +257,12 @@ TargetJSON TestTargetParser(TargetJSON target) {
 
 /********** Register Target kinds and attributes **********/
 
+#if defined(__arm__) || defined(__aarch64__)
+#define NATIVE_CPU "arm_cpu"
+#else
+#define NATIVE_CPU "cpu"
+#endif
+
 TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
     .add_attr_option<Array<String>>("mattr")
     .add_attr_option<String>("mcpu")
@@ -275,12 +281,14 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
     .add_attr_option<Integer>("opt-level")
     // LLVM command line flags, see below
     .add_attr_option<Array<String>>("cl-opt")
-    .set_default_keys({"cpu"})
+    .set_default_keys({NATIVE_CPU})
     // Force the external codegen kind attribute to be registered, even if no external
    // codegen targets are enabled by the TVM build.
     .set_attr<Bool>(tvm::attr::kIsExternalCodegen, Bool(false))
     .set_target_parser(tvm::target::parsers::cpu::ParseTarget);
 
+#undef NATIVE_CPU
+
 // Note regarding the "cl-opt" attribute:
 // Each string in the array has the format
 //   -optionname[[:type]=value]
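The NATIVE_CPU macro only changes the default key; an explicit -device= on the target string still takes precedence. A quick way to inspect the result from Python (a sketch, not part of the commit; exact key contents depend on the host and the target parser):

from tvm.target import Target

# Default keys now follow the build host: "arm_cpu" on Arm, "cpu" elsewhere.
print(Target("llvm").keys)

# An explicit device overrides the native default on any host.
print(Target("llvm -device=arm_cpu").keys)  # expected to include "arm_cpu"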

tests/python/topi/python/test_topi_bitserial_dense.py

Lines changed: 5 additions & 3 deletions
@@ -17,6 +17,7 @@
 """Test code for bitserial_dense operator"""
 import os
 import numpy as np
+from tvm.target.target import Target
 import tvm
 from tvm import te
 from tvm import topi
@@ -53,11 +54,12 @@ def get_ref_data(a_shape, b_shape, input_dtype):
         c_np = np.dot(a_np, b_np.T)
         return a_np, b_np, c_np
 
-    for target in ["llvm", "llvm -device=arm_cpu"]:
-        if "arm_cpu" in target and "arm" not in os.uname()[4]:
+    for target_string in ["llvm", "llvm -device=arm_cpu"]:
+        target = Target(target_string)
+        if "arm_cpu" in target.keys and "arm" not in os.uname()[4]:
             print("Skipped running code, not an arm device")
             continue
-        input_dtype = "uint8" if "arm_cpu" in target else "uint32"
+        input_dtype = "uint8" if "arm_cpu" in target.keys else "uint32"
         A = te.placeholder((batch, in_dim), dtype=input_dtype, name="A")
         B = te.placeholder((out_dim, in_dim), dtype=input_dtype, name="B")
         fcompute, fschedule = tvm.topi.testing.dispatch(target, _bitserial_dense_implement)
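Why the test switches from substring matching on the target string to inspecting Target.keys (a sketch for illustration, not part of the commit): with the native default key, a plain "llvm" target on an Arm host now selects arm_cpu schedules even though "arm_cpu" never appears in the string.

from tvm.target import Target

target_string = "llvm"
# Old check: substring match misses the native default entirely.
print("arm_cpu" in target_string)               # always False for a bare "llvm"
# New check: parse the target and inspect its keys instead.
print("arm_cpu" in Target(target_string).keys)  # True on an Arm host after this change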
