
Commit f6ddd52

[microNPU] Expose compute cycle annotations to TIR lowering (#11288)

* [microNPU] Expose compute cycle annotations to TIR lowering

  Adds an AttrStmt "compute_cycles_hint" to each NPU operation for later
  passes to consume.

  Change-Id: I09779bdab6de6ef2094db610bb20d6e052e68ee3

* compute_cycles -> compute_cycles_hint

  Change-Id: Iebd71e699522e92a28fd321ffdb41ed7924db4e0

* Add a test to check the annotations in the compilation flow

  Change-Id: Idcdcc8c8b5536c4732f297246b71aa8378a2732c

* Add compute cycles hints for copy operations

  Change-Id: I007ba19732e16081fa2ea9baca40c64a653c93cf

* Fix annotations for copies and improve test coverage

  Change-Id: Ib812c4151fab03f4c1adcc016b4e798003a22e5e

* Rebase

  Change-Id: I653101908706096ae25ad1ebf08e7b6c4f1196c7

1 parent 8135860 commit f6ddd52
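
For orientation, a minimal sketch of how a later pass might consume these annotations once lowering has prefixed the pragma key (the tests below confirm the "pragma_compute_cycles_hint" key); the helper name and traversal are illustrative, not part of this commit:

import tvm

def collect_compute_cycle_hints(primfunc):
    """Gather every compute cycles hint attached to a lowered PrimFunc."""
    hints = []

    def visit(node):
        # A te.Schedule pragma named "compute_cycles_hint" lowers to an
        # AttrStmt whose key carries the "pragma_" prefix.
        if isinstance(node, tvm.tir.AttrStmt) and node.attr_key == "pragma_compute_cycles_hint":
            hints.append(int(node.value))

    tvm.tir.stmt_functor.post_order_visit(primfunc.body, visit)
    return hints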

File tree

7 files changed: +301 / -51 lines changed

python/tvm/contrib/ethosu/cascader/plan_generator.py

Lines changed: 22 additions & 2 deletions
@@ -15,9 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """Algorithms to generate Plans for a CascaderGraph."""
-from typing import List, Dict
+from typing import List, Dict, Tuple

-from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion
+from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion, TensorConfig

 from . import _ffi_api
 from .cascader_options import CascaderOptions
@@ -55,3 +55,23 @@ def _generate_graph_plans(
         home_map,
         options,
     )
+
+
+def get_copy_cycles_hint(tensor_config: TensorConfig) -> Tuple[int, int]:
+    """
+    Returns a hint estimating the number of cycles for the copy
+    specified by tensor_config.
+
+    Parameters
+    ----------
+    tensor_config : TensorConfig
+        The tensor configuration to estimate.
+
+    Returns
+    -------
+    mem2mem_cycles : int
+        Total estimated cycles.
+    initial_mem2mem_cycles : int
+        Estimated cycles for the first block.
+    """
+    return _ffi_api.GetCopyCyclesHint(tensor_config)
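
A brief usage sketch, assuming tensor_config is a TensorConfig produced during plan generation; note that the total comes first in the returned pair:

# Total cycles across all blocks, then the cycles for just the first block.
total_cycles, initial_cycles = get_copy_cycles_hint(tensor_config)
# scheduler.py (next file) keeps only the total as the copy's hint:
# compute_cycles_hint, _ = get_copy_cycles_hint(tensor_config)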

python/tvm/contrib/ethosu/cascader/scheduler.py

Lines changed: 17 additions & 1 deletion
@@ -31,6 +31,7 @@
 from .tensor_config import MemoryRegion
 from .proposal import Proposal
 from .proposal_generator import generate_proposals
+from .plan_generator import get_copy_cycles_hint
 from .graph import create_cascader_graph
 from .device_config import EthosuDeviceConfig
 from .logging import Logging
@@ -134,7 +135,11 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None:
             if isinstance(part, EthosuPart):
                 tensor_config = plan.tensor_configs[part.output_tensor]
                 stripe_config = tensor_config.stripe_configs[0]
+                buffer_mode = tensor_config.buffer_mode
                 block_config = part.get_block_config(stripe_config)
+                compute_cycles = part.get_performance_info(
+                    stripe_config, buffer_mode
+                ).compute_cycles
                 iv = part.subgraph.output_tensor.op.axis[0]
                 block_shape = block_config.output_shape
                 if len(block_shape) == 4:
@@ -147,6 +152,10 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None:
                 sch[part.subgraph.output_tensor].pragma(iv, "block_config_width", width)
                 sch[part.subgraph.output_tensor].pragma(iv, "block_config_depth", depth)

+                # Attach AttrStmt directly to npu op so it isn't removed by ReplaceOperators
+                npu_op = part.subgraph.output_tensor.op.input_tensors[0].op.input_tensors[0]
+                sch[npu_op].pragma(npu_op.op.axis[0], "compute_cycles_hint", compute_cycles)
+
         output_tensor_config = plan.output_config
         output_tensor = output_tensor_config.tensor
         output_part = output_tensor.producers[0]
@@ -156,6 +165,7 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None:
         stripe_shape = [int(x) for x in stripe_config.shape]
         stripe_stage, stripe_axis = stripe_part(output_part, stripe_shape, sch)
         copy_te_tensors = []
+        compute_cycles_hints = []
         readers = defaultdict(list)
         for part in plan.part_group:
             if part != output_part:
@@ -167,8 +177,14 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None:
                 if tensor_config.home_region != tensor_config.copy_region:
                     copy_te_tensors.append(part.subgraph.input_tensors[i])

-        for te_tensor in copy_te_tensors:
+                    compute_cycles_hint, _ = get_copy_cycles_hint(tensor_config)
+                    compute_cycles_hints.append(compute_cycles_hint)
+
+        for te_tensor, compute_cycles_hint in zip(copy_te_tensors, compute_cycles_hints):
             copy_stage = sch.cache_read(te_tensor, "global", readers[te_tensor])
+            sch[copy_stage].pragma(
+                copy_stage.op.axis[0], "compute_cycles_hint", compute_cycles_hint
+            )
             sch[copy_stage].compute_at(stripe_stage, stripe_axis)
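The pragmas attached here travel through lowering as AttrStmt nodes. A self-contained sketch of that mechanism using generic TE operators (not the Ethos-U scheduler itself):

import tvm
from tvm import te

A = te.placeholder((16,), name="A")
B = te.compute((16,), lambda i: A[i] + 1, name="B")
s = te.create_schedule(B.op)
# Same call shape as sch[npu_op].pragma(...) above.
s[B].pragma(B.op.axis[0], "compute_cycles_hint", 320)
# The lowered body now contains:
#   attr [IterVar(...)] "pragma_compute_cycles_hint" = 320
print(tvm.lower(s, [A, B]))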

python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py

Lines changed: 7 additions & 0 deletions
@@ -263,6 +263,13 @@ def _detect_cache_read(stage):
         if stage.attach_type != 2:  # Not inlined
             if _detect_cache_read(stage):
                 fax = stage.fuse(*stage.op.axis)
+
+                # propagate pragmas placed on the outer loop
+                if len(stage.op.axis) > 0 and stage.op.axis[0] in stage.iter_var_attrs:
+                    attrs = stage.iter_var_attrs[stage.op.axis[0]]
+                    for k, v in zip(attrs.pragma_keys, attrs.pragma_values):
+                        stage.pragma(fax, k.value, v)
+
                 stage.pragma(fax, "op", "ethosu_copy")
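
Fusing replaces the original axes with a new IterVar, so a pragma recorded against the old outer axis in iter_var_attrs would otherwise be dropped. A standalone sketch of the propagation, using the same generic TE set-up as before:

import tvm
from tvm import te

A = te.placeholder((4, 4), name="A")
B = te.compute((4, 4), lambda i, j: A[i, j], name="B")
s = te.create_schedule(B.op)
s[B].pragma(B.op.axis[0], "compute_cycles_hint", 64)  # set before fusing

stage = s[B]
fax = stage.fuse(*stage.op.axis)
# Mirror of the propagation above: re-apply the outer axis's pragmas.
if len(stage.op.axis) > 0 and stage.op.axis[0] in stage.iter_var_attrs:
    attrs = stage.iter_var_attrs[stage.op.axis[0]]
    for k, v in zip(attrs.pragma_keys, attrs.pragma_values):
        stage.pragma(fax, k.value, v)
print(tvm.lower(s, [A, B]))  # the hint now sits on the fused loop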

src/contrib/ethosu/cascader/plan_generator.cc

Lines changed: 45 additions & 19 deletions
@@ -301,6 +301,42 @@ int GetInteriorMemoryUsage(const std::vector<TensorConfig>& input_configs,
   return memory_usage;
 }

+/**
+ * \brief Returns a hint estimating the number of cycles required for
+ * the copy specified by tensor_config.
+ *
+ * \param tensor_config The tensor configuration to estimate.
+ * \return mem2mem_cycles Total estimated cycles.
+ * \return initial_mem2mem_cycles Estimated cycles for the first block.
+ */
+std::pair<int, int> GetCopyCyclesHint(const TensorConfig& tensor_config) {
+  Tensor tensor = tensor_config->GetTensor();
+  MemoryRegion home_region = tensor_config->GetHomeRegion();
+  MemoryRegion copy_region = tensor_config->GetCopyRegion();
+  int initial_mem2mem_cycles = 0;
+  int mem2mem_cycles = 0;
+
+  // This Tensor needs to be copied - Count stripes for this config
+  for (const auto& stripe_config : tensor_config->GetStripeConfigs()) {
+    std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true);
+    bool first_block = true;
+    for (const auto& block : input_blocks) {
+      int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() *
+                              tensor->GetCompressionRatio() * block.second;
+      int read_cycles = bytes_transferred * home_region->read_bandwidth + home_region->read_latency;
+      int write_cycles = bytes_transferred * copy_region->write_bandwidth;
+
+      if (first_block) {
+        first_block = false;
+        initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
+      }
+      mem2mem_cycles += std::max(read_cycles, write_cycles);
+    }
+  }
+
+  return {mem2mem_cycles, initial_mem2mem_cycles};
+}
+
 std::vector<Plan> GenerateSinglePlans(
     const Part& part, const std::vector<StripeConfig>& output_stripe_configs,
     const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
@@ -372,28 +408,12 @@ std::vector<Plan> GenerateSinglePlans(
         BlockConfig block_config = perf_info->block_config;
         for (size_t i = 0; i < input_configs.size(); i++) {
           Tensor tensor = input_configs[i]->GetTensor();
-          MemoryRegion home_region = input_configs[i]->GetHomeRegion();
           MemoryRegion copy_region = input_configs[i]->GetCopyRegion();

           if (input_configs[i]->DoCopy()) {
-            // This Tensor needs to be copied - Count stripes for this config
-            for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) {
-              std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true);
-              bool first_block = true;
-              for (const auto& block : input_blocks) {
-                int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() *
-                                        tensor->GetCompressionRatio() * block.second;
-                int read_cycles = bytes_transferred * home_region->read_bandwidth +
-                                  input_configs[i]->GetHomeRegion()->read_latency;
-                int write_cycles = bytes_transferred * copy_region->write_bandwidth;
-
-                if (first_block) {
-                  first_block = false;
-                  initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
-                }
-                mem2mem_cycles += std::max(read_cycles, write_cycles);
-              }
-            }
+            std::pair<int, int> ret = GetCopyCyclesHint(input_configs[i]);
+            mem2mem_cycles += ret.first;
+            initial_mem2mem_cycles += ret.second;
           }
           float read_efficiency =
               GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region);
@@ -585,6 +605,12 @@ TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans")
       return tclosed_plans;
     });

+TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetCopyCyclesHint")
+    .set_body_typed([](TensorConfig tensor_config) {
+      std::pair<int, int> ret = GetCopyCyclesHint(tensor_config);
+      return Array<Integer>({ret.first, ret.second});
+    });
+
 }  // namespace cascader
 }  // namespace ethosu
 }  // namespace contrib
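
For reference, a back-of-envelope Python transcription of the estimate above (a hypothetical helper; in the real code CountStripes supplies the block shapes and counts): each block's read and write cycles are computed from the bytes moved, and the slower side dominates.

def copy_cycles_hint(blocks, dtype_bytes, compression_ratio,
                     read_bandwidth, read_latency, write_bandwidth):
    """Mirror of GetCopyCyclesHint's inner loop for one stripe config.

    `blocks` stands in for CountStripes' result: (block_shape, count) pairs.
    """
    mem2mem_cycles = 0
    initial_mem2mem_cycles = 0
    first_block = True
    for shape, count in blocks:
        volume = 1
        for dim in shape:
            volume *= dim  # mul_reduce(block.first)
        bytes_transferred = int(volume * dtype_bytes * compression_ratio * count)
        read_cycles = bytes_transferred * read_bandwidth + read_latency
        write_cycles = bytes_transferred * write_bandwidth
        if first_block:
            first_block = False
            initial_mem2mem_cycles += max(read_cycles, write_cycles)
        mem2mem_cycles += max(read_cycles, write_cycles)
    return mem2mem_cycles, initial_mem2mem_cycles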

src/tir/contrib/ethosu/passes.cc

Lines changed: 7 additions & 2 deletions
@@ -168,9 +168,14 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
   }

   tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(const Stmt& stmt) {
-    auto eval_node{stmt.as<EvaluateNode>()};
+    Stmt eval_stmt = stmt;
+    if (const auto* attr_stmt = eval_stmt.as<AttrStmtNode>()) {
+      eval_stmt = attr_stmt->body;
+    }
+
+    auto eval_node{eval_stmt.as<EvaluateNode>()};
     ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
-                      << stmt->GetTypeKey();
+                      << eval_stmt->GetTypeKey();
     auto call_node{eval_node->value.as<CallNode>()};
     ICHECK(call_node) << "Expected expression to be a call node, but was "
                       << eval_node->value->GetTypeKey();
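
In Python terms the change amounts to peeling one AttrStmt layer before the existing checks, since NPU call statements can now arrive wrapped in the compute-cycles annotation. A hedged analogue of the C++ above, not code from the commit:

import tvm

def get_stmt_args(stmt):
    # NPU ops may now be wrapped in a "pragma_compute_cycles_hint" AttrStmt,
    # so look through it before expecting an Evaluate of a Call.
    if isinstance(stmt, tvm.tir.AttrStmt):
        stmt = stmt.body
    assert isinstance(stmt, tvm.tir.Evaluate), f"Expected Evaluate, got {type(stmt)}"
    call = stmt.value
    assert isinstance(call, tvm.tir.Call), f"Expected Call, got {type(call)}"
    return call.args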
Lines changed: 145 additions & 0 deletions

@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wrong-import-position,invalid-name
+
+"""
+Test the cascader in the compilation flow.
+"""
+
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.backend.contrib.ethosu.codegen import _create_cascader
+from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir
+from tvm.contrib.ethosu.cascader import MemoryRegion, EthosuDeviceConfig
+
+from .. import infra as test_infra
+from . import infra as cascader_test_infra
+
+
+def _ethos_u55_cascader():
+    sram = MemoryRegion(
+        name="SRAM",
+        size=10**6,
+        read_bandwidth=16,
+        write_bandwidth=16,
+        read_latency=0,
+        write_latency=0,
+        burst_length=1,
+    )
+    flash = MemoryRegion(name="FLASH", size=10**7, read_bandwidth=4, write_bandwidth=4)
+
+    device_config = EthosuDeviceConfig("ethos-u55-256")
+    cascader_options = cascader_test_infra.make_options(
+        cascade_region=sram,
+        max_proposals=64,
+        stripe_factors=4,
+        max_plan_size=10,
+        max_open_plans=8,
+        max_closed_plans=32,
+        always_copy_size=1024,
+        disable_pareto_plans=False,
+        disable_pareto_proposals=False,
+        enable_striping=False,
+    )
+    return _create_cascader(
+        options=cascader_options,
+        io_region=sram,
+        constant_region=flash,
+        working_regions=[sram],
+        device_config=device_config,
+    )
+
+
+def _compile_model(relay_function):
+    mod = tvm.IRModule()
+    mod["main"] = relay_function
+    mod = relay.transform.InferType()(mod)
+    tir_mod = _lower_to_tir(mod["main"], _ethos_u55_cascader())[0]
+    return tir_mod["main"]
+
+
+def _create_single_conv2d():
+    ifm = relay.var("x", shape=(1, 8, 8, 4), dtype="int8")
+    conv1 = test_infra.make_ethosu_conv2d(ifm, 4, 4, (3, 3), (1, 1), (1, 1), (1, 1))
+    func = relay.Function(relay.analysis.free_vars(conv1), conv1)
+    return func
+
+
+def _create_double_conv2d():
+    ifm = relay.var("x", shape=(1, 8, 8, 4), dtype="int8")
+    conv1 = test_infra.make_ethosu_conv2d(ifm, 4, 4, (3, 3), (1, 1), (1, 1), (1, 1))
+    conv2 = test_infra.make_ethosu_conv2d(conv1, 4, 4, (1, 3), (1, 1), (1, 1), (1, 1))
+    func = relay.Function(relay.analysis.free_vars(conv2), conv2)
+    return func
+
+
+def _create_scalar_add():
+    ifm = relay.var("x", shape=(1, 5, 4, 3), dtype="int8")
+    ifm2 = relay.const(np.ones((1, 1, 1, 1)), dtype="int8")
+    add = test_infra.make_ethosu_binary_elementwise(
+        ifm, ifm2, ifm_channels=3, ifm2_channels=1, operator_type="ADD", ofm_dtype="int8"
+    )
+    func = relay.Function(relay.analysis.free_vars(add), add)
+    return func
+
+
+def test_single_conv_compute_cycles_hint():
+    """
+    Check that the "compute_cycles_hint" annotation remains in the lowering
+    flow for a single convolution.
+    """
+    primfunc = _compile_model(_create_single_conv2d())
+    ops = primfunc.body.body.body.seq
+
+    compute_cycles_hints = [2304, 640, 320]
+    for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+        assert op.attr_key == "pragma_compute_cycles_hint"
+        assert op.value == compute_cycle_hint
+
+
+def test_double_conv_compute_cycles_hint():
+    """
+    Check that the "compute_cycles_hint" annotation remains in the lowering
+    flow for a double convolution.
+    """
+    primfunc = _compile_model(_create_double_conv2d())
+    ops = primfunc.body.body.body.body.body.body.seq
+
+    compute_cycles_hints = [2304, 640, 768, 640, 320, 240]
+    for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+        assert op.attr_key == "pragma_compute_cycles_hint"
+        assert op.value == compute_cycle_hint
+
+
+def test_scalar_add_compute_cycles_hint():
+    """
+    Check that the "compute_cycles_hint" annotation remains in the lowering
+    flow for an add with scalar values.
+    """
+    primfunc = _compile_model(_create_scalar_add())
+    ops = primfunc.body.body.seq
+
+    compute_cycles_hints = [16, 24]
+    for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+        assert op.attr_key == "pragma_compute_cycles_hint"
+        assert op.value == compute_cycle_hint