11 changes: 0 additions & 11 deletions tests/python/contrib/test_hexagon/conftest.py
@@ -19,19 +19,8 @@
values from testing parameters """

import tvm
from .infrastructure import get_packed_filter_layout


@tvm.testing.fixture
def shape_nhwc(batch, in_channel, in_size):
    return (batch, in_size, in_size, in_channel)


@tvm.testing.fixture
def shape_oihw(out_channel, in_channel, kernel):
    return (out_channel, in_channel, kernel, kernel)


@tvm.testing.fixture
def shape_oihw8i32o4i(out_channel, in_channel, kernel):
    return get_packed_filter_layout(out_channel, in_channel, kernel, kernel)
153 changes: 108 additions & 45 deletions tests/python/contrib/test_hexagon/infrastructure.py
@@ -18,47 +18,65 @@
""" Hexagon testing infrastructure """

import tvm
from tvm import te
import numpy


def ceildiv(o, d):
    assert o >= 0
    assert d > 0
    return tvm.tir.floordiv(o + d - 1, d)
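
# ceildiv rounds up, e.g. ceildiv(62, 8) evaluates to 8 and ceildiv(65, 8) to 9.
# Note the result is a tvm.tir expression rather than a Python int; callers
# below wrap it in int(...) where a plain value is needed.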


def get_packed_activation_layout(shape_nhwc, block_shape, packed_C=True):
    assert len(shape_nhwc) == 4
    shape = [shape_nhwc[0]]
    off_h, off_w, off_c = block_shape
    shape.append(ceildiv(shape_nhwc[1], off_h))
    shape.append(ceildiv(shape_nhwc[2], off_w))
    if packed_C:
        shape.append(ceildiv(shape_nhwc[3], off_c))
        shape.extend(block_shape)
    else:
        shape.extend([off_h, off_w, shape_nhwc[3]])
    return shape


# defines inner block shape: 8h8w32c
def get_block_shape():
    return 8, 8, 32


# defines inner filter block shape: 8i32o4i
def get_filter_block_shape():
    return 8, 32, 4


def get_packed_filter_layout(out_channel, in_channel, kernel_h, kernel_w):
    filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape()
# input: logical shape in nhwc layout
# output: physical packed shape in nhw8h8w32c layout
def get_packed_shape(logical_shape_nhwc):
    assert len(logical_shape_nhwc) == 4
    physical_shape_nhwc8h8w32c = [logical_shape_nhwc[0]]
    block_shape = get_block_shape()
    off_h, off_w, off_c = block_shape
    physical_shape_nhwc8h8w32c.append(ceildiv(logical_shape_nhwc[1], off_h))
    physical_shape_nhwc8h8w32c.append(ceildiv(logical_shape_nhwc[2], off_w))
    physical_shape_nhwc8h8w32c.append(ceildiv(logical_shape_nhwc[3], off_c))
    physical_shape_nhwc8h8w32c.extend(block_shape)
    return physical_shape_nhwc8h8w32c
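
# Example (folded values; each ceildiv above yields a tir expression):
#   get_packed_shape([1, 64, 64, 64]) -> [1, 8, 8, 2, 8, 8, 32]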


# input: physical packed shape in nhw8h8w32c layout
# output: logical shape in nhwc layout
def get_logical_shape(physical_shape_nhwc8h8w32c):
    assert len(physical_shape_nhwc8h8w32c) == 7
    logical_shape_nhwc = [physical_shape_nhwc8h8w32c[0]]
    logical_shape_nhwc.append(physical_shape_nhwc8h8w32c[1] * physical_shape_nhwc8h8w32c[4])
    logical_shape_nhwc.append(physical_shape_nhwc8h8w32c[2] * physical_shape_nhwc8h8w32c[5])
    logical_shape_nhwc.append(physical_shape_nhwc8h8w32c[3] * physical_shape_nhwc8h8w32c[6])
    return logical_shape_nhwc
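
# Example: get_logical_shape([1, 8, 8, 2, 8, 8, 32]) -> [1, 64, 64, 64]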


# input: logical shape in oihw layout
# output: physical packed shape in oihw8i32o4i layout
def get_packed_filter_shape(logical_shape_oihw):
    assert len(logical_shape_oihw) == 4
    filter_block_shape = get_filter_block_shape()
    filter_Cio, filter_Ki, filter_Cii = filter_block_shape
    filter_Ci = filter_Cio * filter_Cii
    return (
        int(ceildiv(out_channel, filter_Ki)),
        int(ceildiv(in_channel, filter_Ci)),
        kernel_h,
        kernel_w,
        filter_Cio,
        filter_Ki,
        filter_Cii,
    )
    physical_shape_oihw8i32o4i = []
    physical_shape_oihw8i32o4i.append(int(ceildiv(logical_shape_oihw[0], filter_Ki)))
    physical_shape_oihw8i32o4i.append(int(ceildiv(logical_shape_oihw[1], filter_Ci)))
    physical_shape_oihw8i32o4i.append(logical_shape_oihw[2])
    physical_shape_oihw8i32o4i.append(logical_shape_oihw[3])
    physical_shape_oihw8i32o4i.extend(filter_block_shape)
    return physical_shape_oihw8i32o4i
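
# Example: get_packed_filter_shape([128, 64, 1, 1]) -> [4, 2, 1, 1, 8, 32, 4]
# (128 output channels block by 32o; 64 input channels block by 8i * 4i = 32)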


def build_and_run(inputs, func, target, target_host, *args, **kwargs):
@@ -93,26 +111,10 @@ def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, o
)


def verify_conv2d(output, ref_output, dtype):
    # nhwc8h8w32c
    if len(output.shape) == 7:
        # nhwc8h8w32c -> nhwc
        output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(
            output.shape[0],
            output.shape[1] * output.shape[4],
            output.shape[2] * output.shape[5],
            output.shape[3] * output.shape[6],
        )

    # nhwhwc
    else:
        # nhwhwc -> nhwc
        output = output.transpose(0, 1, 3, 2, 4, 5).reshape(
            output.shape[0],
            output.shape[1] * output.shape[3],
            output.shape[2] * output.shape[4],
            output.shape[5],
        )
def conv2d_verify(output, ref_output, dtype):
    # nhwc8h8w32c -> nhwc
    logical_output_shape = get_logical_shape(output.shape)
    output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(logical_output_shape)

    # slice output to match ref_output shape
    # e.g. 8x8 spatial 3x3 filter = 6x6 ref output
@@ -129,3 +131,64 @@ def verify_conv2d(output, ref_output, dtype):
elif dtype == "float32":
tol = {"rtol": 1e-4, "atol": 2e-4}
tvm.testing.assert_allclose(output, ref_output, **tol)


def conv2d_compute(X, filt, pad, stride, dilation):
    block_shape = get_block_shape()
    block_H, block_W, block_C = block_shape
    filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape()
    filter_Ci = filter_Cio * filter_Cii

    shape_filter = filt.shape
    kernel_size = tuple(shape_filter[2:4])
    out_channels = shape_filter[0] * shape_filter[5]

    logical_input_shape = get_logical_shape(X.shape)
    logical_output_shape = get_conv2d_nhwc_shape(
        logical_input_shape,
        kernel_size,
        stride,
        pad,
        dilation,
        out_channels,
    )

    output_shape = get_packed_shape(logical_output_shape)
    n, ho, wo, ko, hi, wi, ki = output_shape
    rh = te.reduce_axis((0, kernel_size[0]), name="rh")
    rw = te.reduce_axis((0, kernel_size[1]), name="rw")
    rc = te.reduce_axis((0, logical_input_shape[3]), name="rc")

    def compute(n, ho, wo, ko, hi, wi, ki):
        # map the packed output coordinate plus reduction offsets back to
        # packed input coordinates
        h = ho * block_H + hi
        h_contig = h * stride[0] + rh
        h_block_id = h_contig // block_H
        h_block_offset = h_contig % block_H

        w = wo * block_W + wi
        w_contig = w * stride[1] + rw
        w_block_id = w_contig // block_W
        w_block_offset = w_contig % block_W

        c_block_id = rc // block_C
        c_block_offset = rc % block_C

        # decompose the input-channel reduction axis into the 8i32o4i
        # filter block coordinates: rco (outer), rcio (8i), rcii (4i)
        rco = rc // filter_Ci
        rcio = (rc % filter_Ci) // filter_Cii
        rcii = rc % filter_Cii

        return te.sum(
            X[
                n,
                h_block_id,
                w_block_id,
                c_block_id,
                h_block_offset,
                w_block_offset,
                c_block_offset,
            ]
            * filt[ko, rco, rh, rw, rcio, ki, rcii],
            axis=[rh, rw, rc],
        )

    return output_shape, compute
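
# A minimal usage sketch (an assumption, not part of this module: pad, stride
# and dilation are taken to be (h, w) tuples; shapes match the 1x1 / 64-in /
# 128-out case from test_conv2d_blocked.md):
#
#   X = te.placeholder((1, 8, 8, 2, 8, 8, 32), dtype="float32", name="X")
#   filt = te.placeholder((4, 2, 1, 1, 8, 32, 4), dtype="float32", name="filt")
#   shape, compute = conv2d_compute(X, filt, pad=(0, 0), stride=(1, 1), dilation=(1, 1))
#   Y = te.compute(shape, compute, name="Y")
#   s = te.create_schedule(Y.op)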
74 changes: 36 additions & 38 deletions tests/python/contrib/test_hexagon/test_conv2d_blocked.md
@@ -23,24 +23,23 @@ This is a baseline 1x1 conv2d schedule for Hexagon.

## Command

    pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-1-1-0-float32-1-1-1-64-64-128-llvm]"
    pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[1-64-64-0-1-1-128-1-1-float32-llvm]"

## Parameters

| Parameter | Value |
| --------- | ----------- |
| Batch | 1 |
| Filter | 1x1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Output Ch | 128 |
| Stride | 1 |
| Padding | 0 |
| Layout | NHWC8h8w32c |
| Parameter | Value |
| --------- | ----- |
| Batch | 1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Padding | 0 |
| Stride | 1 |
| Filter | 1x1 |
| Output Ch | 128 |
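
Under this schedule's `8h8w32c` activation and `8i32o4i` filter blocking, these
parameters imply the packed shapes below (a sketch using the helpers in
`infrastructure.py`; values shown are the folded integer results):

    get_packed_shape([1, 64, 64, 64])         # -> [1, 8, 8, 2, 8, 8, 32]
    get_packed_filter_shape([128, 64, 1, 1])  # -> [4, 2, 1, 1, 8, 32, 4]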

## Assumptions

* Pattern matching for microkernels is not sensitive to cache reads and writes between the outer height (ho) and outer width (wo) loops.
* n/a

## To Do

@@ -174,26 +173,25 @@ The key changes in TIR versus the above are...

## Command

    pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-1-1-0-float32-2-2-1-64-64-128-llvm]"
    pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[1-64-64-0-1-1-128-2-2-float32-llvm]"

## Parameters

| Parameter | Value |
| --------- | ----------- |
| Batch | 1 |
| Filter | 1x1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Output Ch | 128 |
| Stride | 1 |
| Padding | 0 |
| Layout | NHWC8h8w32c |
| k_split | 2 |
| h_split | 2 |
| Parameter | Value |
| --------- | ----- |
| Batch | 1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Padding | 0 |
| Stride | 1 |
| Filter | 1x1 |
| Output Ch | 128 |
| k_split | 2 |
| h_split | 2 |
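
A sketch of how the `k_split` / `h_split` factors could be applied with the
`te` schedule API (an illustration under assumed axis naming; the actual
schedule lives in test_conv2d_blocked.py):

    s = te.create_schedule(Y.op)
    n, ho, wo, ko, hi, wi, ki = s[Y].op.axis
    koo, koi = s[Y].split(ko, nparts=2)  # k_split = 2
    hoo, hoi = s[Y].split(ho, nparts=2)  # h_split = 2
    s[Y].reorder(n, koo, hoo, koi, hoi, wo, hi, wi, ki)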

## Assumptions

* n/a - With the loop splits on `ko` and `ho`, the compute schedule is now over `ko.inner`, `ho.inner`, `wo`, etc., which should fit the pattern matching for microkernels.
* n/a

## To Do

@@ -350,21 +348,21 @@ The `if` statement above indicates NOT to prefetch the vertically adjacent slice

## Command

    pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-3-1-0-float32-2-2-1-64-64-128-llvm]"
    pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[1-64-64-0-1-3-128-2-2-float32-llvm]"

## Parameters

| Parameter | Value |
| --------- | ----------- |
| Batch | 1 |
| Filter | 3x3 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Output Ch | 128 |
| Stride | 1 |
| Padding | 0 |
| Layout | NHWC8h8w32c |
| h_split | 2 |
| Parameter | Value |
| --------- | ----- |
| Batch | 1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Padding | 0 |
| Stride | 1 |
| Filter | 3x3 |
| Output Ch | 128 |
| k_split | 2 |
| h_split | 2 |
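
With a 3x3 filter and no padding, the logical output is 1x62x62x128
(64 - 3 + 1 = 62), which still packs to full 8x8x32 blocks (a sketch using the
helper in `infrastructure.py`):

    get_packed_shape([1, 62, 62, 128])  # -> [1, 8, 8, 4, 8, 8, 32]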

## Assumptions
