
Commit 11d73d2

Update on "[executorch] Add coreml quant recipes"
Fixing tests for stack that got reverted: #13265

Adds coreml quant recipes after FP32/16 recipes added in #13121

Recipes added:
- PT2E_INT8_STATIC
- PT2E_INT8_WEIGHT_ONLY
- INT4_WEIGHT_ONLY_PER_CHANNEL
- INT4_WEIGHT_ONLY_PER_GROUP
- INT8_WEIGHT_ONLY_PER_CHANNEL
- INT8_WEIGHT_ONLY_PER_GROUP
- CODEBOOK_WEIGHT_ONLY

Differential Revision: [D80206542](https://our.internmc.facebook.com/intern/diff/D80206542/)

[ghstack-poisoned]
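For context, a sketch of how one of the recipes listed above might be selected at export time. Every import path, the ExportRecipe/CoreMLRecipeType names, and the export() signature below are assumptions for illustration; none of them are shown in this diff.

# Hypothetical usage sketch -- every import path and API name below is an
# assumption, not taken from this commit.
import torch
from executorch.export import export, ExportRecipe                      # assumed API
from executorch.backends.apple.coreml.recipes import CoreMLRecipeType   # assumed path

class SmallModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(16, 8)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.linear(x))

model = SmallModel().eval()
example_inputs = [(torch.randn(1, 16),)]

# Pick one of the recipes named above, e.g. per-channel INT4 weight-only.
recipe = ExportRecipe.get_recipe(CoreMLRecipeType.INT4_WEIGHT_ONLY_PER_CHANNEL)  # assumed
session = export(model=model, example_inputs=example_inputs, export_recipe=recipe)  # assumed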
2 parents 2fec78b + 2165def commit 11d73d2

25 files changed (+413, -108 lines)

backends/arm/runtime/EthosUBackend.cpp

Lines changed: 3 additions & 2 deletions
@@ -192,8 +192,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     // Use a temporary allocator for the intermediate tensors of the
     // computation. The allocator is released in runtime/executor/method.cpp at
     // the end of the execution of the Ethos-U custom delegate
-    char* ethosu_scratch =
-        static_cast<char*>(temp_allocator->allocate(handles.scratch_data_size));
+    // Ethos-U driver requires 16 bit alignment.
+    char* ethosu_scratch = static_cast<char*>(
+        temp_allocator->allocate(handles.scratch_data_size, 16UL));
     if (ethosu_scratch == nullptr) {
       ET_LOG(
           Error,
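The new second argument to allocate() asks the temp allocator for 16-aligned scratch memory. A minimal sketch (not the ExecuTorch MemoryAllocator) of the align-up arithmetic an allocator typically applies to honor such a request:

# Minimal sketch (not the ExecuTorch MemoryAllocator) of the align-up step an
# allocator typically performs when given an alignment argument such as 16.
def align_up(addr: int, alignment: int) -> int:
    # Round addr up to the next multiple of alignment (alignment must be a power of two).
    return (addr + alignment - 1) & ~(alignment - 1)

assert align_up(0x1003, 16) == 0x1010  # unaligned address gets bumped up
assert align_up(0x1010, 16) == 0x1010  # already-aligned address is unchanged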

backends/arm/test/test_arm_baremetal.sh

Lines changed: 19 additions & 14 deletions
@@ -17,7 +17,6 @@ _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly ins
 
 
 TEST_SUITE=$1
-TOSA_VERSION="${2:-TOSA-1.0+INT}"
 
 # Source the tools
 # This should be prepared by the setup.sh
@@ -157,17 +156,23 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh
 
     # TOSA quantized
     echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA"
-    examples/arm/run.sh --et_build_root=arm_test/test_run --target=${TOSA_VERSION} --model_name=add
-    examples/arm/run.sh --et_build_root=arm_test/test_run --target=${TOSA_VERSION} --model_name=mul
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA-1.0+INT --model_name=add
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA-1.0+INT --model_name=mul
 
     # Ethos-U55
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55"
     examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=add
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=add --bundleio
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=add --bundleio --etdump
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=add --etdump
     examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=mul
 
     # Ethos-U85
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
     examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add --bundleio
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add --bundleio --etdump
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add --etdump
     examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=mul
 
     # Cortex-M op tests
@@ -187,17 +192,17 @@ test_models_tosa() { # End to End model tests using model_test.py
 
     # TOSA quantized
     echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA"
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=mv2
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=mv3
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=lstm
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=edsr
-    # python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=emformer_transcribe # Takes long time to run
-    # python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=emformer_join # Takes long time to run
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=w2l
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=ic3
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=ic4
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=resnet18
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=${TOSA_VERSION} --model=resnet50
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=mv2
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=mv3
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=lstm
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=edsr
+    # python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=emformer_transcribe # Takes long time to run
+    # python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=emformer_join # Takes long time to run
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=w2l
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=ic3
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=ic4
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=resnet18
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA-1.0+INT --model=resnet50
 
     echo "${TEST_SUITE_NAME}: PASS"
 }

backends/cadence/aot/functions.yaml

Lines changed: 10 additions & 0 deletions
@@ -219,6 +219,16 @@
     - arg_meta: null
       kernel_name: impl::reference::quantized_relu_per_tensor_out
 
+- func: cadence::quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_relu_asym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_relu_asym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions
@@ -339,6 +339,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out
 
+- func: cadence::quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_asym8s_asym8s_per_tensor_out
+
+- func: cadence::quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_asym8u_asym8u_per_tensor_out
+
 - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/aot/ops_registrations.py

Lines changed: 36 additions & 0 deletions
@@ -232,6 +232,20 @@
     "quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
     "int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_relu_asym8s_asym8s.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
+)
+lib.define(
+    "quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
+    "int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
+)
+lib.define(
+    "quantized_relu_asym8u_asym8u.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
+)
+lib.define(
+    "quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
+    "int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
+)
 lib.define(
     "quantized_add.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, "
     "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
@@ -770,6 +784,28 @@ def quantized_relu_per_tensor_meta(
     return input.new_empty(input.size(), dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_relu_asym8s_asym8s.per_tensor")
+def quantized_relu_asym8s_asym8s_per_tensor_meta(
+    input: torch.Tensor,
+    in_zero_point: int,
+    out_zero_point: int,
+    out_multiplier: int,
+    out_shift: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
+@register_fake("cadence::quantized_relu_asym8u_asym8u.per_tensor")
+def quantized_relu_asym8u_asym8u_per_tensor_meta(
+    input: torch.Tensor,
+    in_zero_point: int,
+    out_zero_point: int,
+    out_multiplier: int,
+    out_shift: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
 @register_fake("cadence::fully_connected")
 def fully_connected_meta(
     src: torch.Tensor,
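The register_fake entries above only describe the output shape and dtype so the new ops can be traced with fake tensors at export time; the real math lives in the YAML-bound kernels. A self-contained sketch of the same pattern in a throwaway "demo" namespace (the op name and signature here are made up for illustration, not part of the cadence library):

# Self-contained sketch of the register_fake pattern, using a throwaway "demo"
# namespace; the op and its signature are illustrative only.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.library import Library, register_fake

lib = Library("demo", "DEF")
lib.define("relu_like(Tensor X, int X_zero_point) -> Tensor")

@register_fake("demo::relu_like")
def relu_like_meta(x: torch.Tensor, x_zero_point: int) -> torch.Tensor:
    # Shape/dtype propagation only; no real computation happens here.
    return x.new_empty(x.size(), dtype=x.dtype)

# Export traces with fake tensors, so the fake impl alone is enough for the op
# to flow through tracing even though no real (CPU) kernel is registered.
with FakeTensorMode():
    out = torch.ops.demo.relu_like(torch.empty(2, 3, dtype=torch.int8), 0)
    print(out.shape, out.dtype)  # torch.Size([2, 3]) torch.int8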

backends/cadence/aot/replace_ops.py

Lines changed: 7 additions & 1 deletion
@@ -2327,10 +2327,16 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             # Cast the const_arg to the dtype of the x_arg
             full_arg = self.resolve_full_arg(x_arg, const_arg)
 
+            full_output_dtype = (
+                torch.int32 if isinstance(full_arg, int) else torch.float32
+            )
+
             # Extract an argument to a separate full op.
             with graph_module.graph.inserting_before(mul_node):
                 full_node = graph_module.graph.call_function(
-                    torch.ops.aten.full.default, args=([1], full_arg)
+                    torch.ops.aten.full.default,
+                    args=([1], full_arg),
+                    kwargs={"dtype": full_output_dtype},
                 )
                 full_node.meta = mul_node.meta
                 full_node.meta["val"] = [1]
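For context on the new explicit dtype: without it, aten.full infers the output dtype from the Python fill value (an int becomes int64 on recent PyTorch), whereas the pass standardizes on int32/float32. A standalone sketch of the pinned behaviour:

# Standalone sketch of the dtype pinning added above. The int64 default for an
# int fill value reflects recent PyTorch behaviour (an assumption about the
# version in use).
import torch

print(torch.full([1], 2).dtype)    # torch.int64, inferred from the int fill value
print(torch.full([1], 2.0).dtype)  # torch.float32, the default float dtype

full_arg = 2  # stand-in for the scalar the pass extracts into a full op
full_output_dtype = torch.int32 if isinstance(full_arg, int) else torch.float32
print(torch.full([1], full_arg, dtype=full_output_dtype).dtype)  # torch.int32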

backends/cadence/aot/tests/test_type_dispatch_passes.py

Lines changed: 48 additions & 0 deletions
@@ -137,3 +137,51 @@ def test_mixed_types_error(self) -> None:
         with self.assertRaises(RuntimeError) as context:
             cast(PassResult, p(gm)).graph_module
         self.assertIn("Unsupported input types", str(context.exception))
+
+    def test_int8_dispatch_quantized_relu(self) -> None:
+        """Test int8 input should dispatch to asym8s_asym8s variant for quantized_relu"""
+        x = torch.randint(-128, 127, (2, 3), dtype=torch.int8)
+        gm = single_op_builder(
+            placeholders=(x,),
+            op=exir_ops.edge.cadence.quantized_relu.per_tensor,
+            args=(x, 0, 0, 1, 0),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor),
+            0,
+        )
+        # Should be replaced with int8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_relu_asym8s_asym8s.per_tensor,
+            ),
+            1,
+        )
+
+    def test_uint8_dispatch_quantized_relu(self) -> None:
+        """Test uint8 input should dispatch to asym8u_asym8u variant for quantized_relu"""
+        x = torch.randint(0, 255, (2, 3), dtype=torch.uint8)
+        gm = single_op_builder(
+            placeholders=(x,),
+            op=exir_ops.edge.cadence.quantized_relu.per_tensor,
+            args=(x, 0, 0, 1, 0),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor),
+            0,
+        )
+        # Should be replaced with uint8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor,
+            ),
+            1,
+        )

backends/cadence/aot/type_dispatch.py

Lines changed: 40 additions & 17 deletions
@@ -23,40 +23,63 @@ class CompileTimeTypeDispatchPass(ExportPass):
     Replaces generic ops with ops that have explicit types.
     """
 
-    _TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, torch.dtype], str] = {
+    _BINARY_TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, torch.dtype], str] = {
         (torch.int8, torch.int8): "asym8sxasym8s_asym8s",
         (torch.uint8, torch.uint8): "asym8uxasym8u_asym8u",
     }
 
-    _SUPPORTED_OPS: dict[OpOverload, str] = {
+    _UNARY_TYPE_DISPATCH_MAP: dict[torch.dtype, str] = {
+        torch.int8: "asym8s_asym8s",
+        torch.uint8: "asym8u_asym8u",
+    }
+
+    _BINARY_SUPPORTED_OPS: dict[OpOverload, str] = {
         exir_ops.edge.cadence.quantized_fully_connected.per_tensor: "quantized_fully_connected",
         exir_ops.edge.cadence.quantized_linear.per_tensor: "quantized_linear",
     }
 
+    _SUPPORTED_UNARY_OPS: dict[OpOverload, str] = {
+        exir_ops.edge.cadence.quantized_relu.per_tensor: "quantized_relu",
+    }
+
     def call_operator(
         self,
         op: OpOverload,
         args: tuple[Argument, ...],
         kwargs: dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        if op not in self._SUPPORTED_OPS:
-            return super().call_operator(op, args, kwargs, meta)
+        if op in self._BINARY_SUPPORTED_OPS:
+            # pyre-ignore[16]: None has no attribute `to_tensor`.
+            input_dtype = args[0].to_tensor().dtype
+            weight_dtype = args[1].to_tensor().dtype
+            dtype_pair = (input_dtype, weight_dtype)
+
+            if dtype_pair not in self._BINARY_TYPE_DISPATCH_MAP:
+                raise RuntimeError(
+                    f"Unsupported input types for {op}: {input_dtype} and {weight_dtype}"
+                )
+
+            base_op_name = self._BINARY_SUPPORTED_OPS[op]
+            type_suffix = self._BINARY_TYPE_DISPATCH_MAP[dtype_pair]
+
+            typed_op_name = f"{base_op_name}_{type_suffix}"
+            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
+
+            return super().call_operator(typed_op, args, kwargs, meta)
+
+        elif op in self._SUPPORTED_UNARY_OPS:
+            input_dtype = args[0].to_tensor().dtype
 
-        # pyre-ignore[16]: None has no attribute `to_tensor`.
-        input_dtype = args[0].to_tensor().dtype
-        weight_dtype = args[1].to_tensor().dtype
-        dtype_pair = (input_dtype, weight_dtype)
+            if input_dtype not in self._UNARY_TYPE_DISPATCH_MAP:
+                raise RuntimeError(f"Unsupported input type for {op}: {input_dtype}")
 
-        if dtype_pair not in self._TYPE_DISPATCH_MAP:
-            raise RuntimeError(
-                f"Unsupported input types for {op}: {input_dtype} and {weight_dtype}"
-            )
+            base_op_name = self._SUPPORTED_UNARY_OPS[op]
+            type_suffix = self._UNARY_TYPE_DISPATCH_MAP[input_dtype]
 
-        base_op_name = self._SUPPORTED_OPS[op]
-        type_suffix = self._TYPE_DISPATCH_MAP[dtype_pair]
+            typed_op_name = f"{base_op_name}_{type_suffix}"
+            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
 
-        typed_op_name = f"{base_op_name}_{type_suffix}"
-        typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
+            return super().call_operator(typed_op, args, kwargs, meta)
 
-        return super().call_operator(typed_op, args, kwargs, meta)
+        return super().call_operator(op, args, kwargs, meta)
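Taken together with the tests above, the unary path reduces to a dtype-keyed string lookup followed by a getattr on exir_ops.edge.cadence. A standalone sketch of just that name resolution, with no graph machinery:

# Standalone sketch of the unary name resolution in CompileTimeTypeDispatchPass;
# the real pass then resolves the name via
# getattr(exir_ops.edge.cadence, typed_op_name).per_tensor on the exported graph.
import torch

_UNARY_TYPE_DISPATCH_MAP: dict[torch.dtype, str] = {
    torch.int8: "asym8s_asym8s",
    torch.uint8: "asym8u_asym8u",
}

def typed_variant(base_op_name: str, input_dtype: torch.dtype) -> str:
    if input_dtype not in _UNARY_TYPE_DISPATCH_MAP:
        raise RuntimeError(f"Unsupported input type for {base_op_name}: {input_dtype}")
    return f"{base_op_name}_{_UNARY_TYPE_DISPATCH_MAP[input_dtype]}"

assert typed_variant("quantized_relu", torch.int8) == "quantized_relu_asym8s_asym8s"
assert typed_variant("quantized_relu", torch.uint8) == "quantized_relu_asym8u_asym8u"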
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <xa_nnlib_kernels_api.h>
+
+namespace cadence {
+namespace impl {
+namespace HiFi {
+namespace native {
+
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+
+void quantized_relu_asym8s_asym8s_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const int64_t in_zero_point,
+    const int64_t out_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    Tensor& output) {
+  const int8_t* __restrict__ input_data = input.const_data_ptr<int8_t>();
+  int8_t* __restrict__ output_data = output.mutable_data_ptr<int8_t>();
+
+  const int32_t out_multipler_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  const int32_t ret = xa_nn_vec_relu_asym8s_asym8s(
+      output_data,
+      input_data,
+      in_zero_point,
+      out_multipler_int32,
+      out_shift_int32,
+      out_zero_point,
+      -128,
+      127,
+      input.numel());
+  ET_DCHECK_MSG(
+      ret == 0, "HiFi quantized_relu_asym8s_asym8s_per_tensor failed");
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
+} // namespace cadence
