Skip to content

Commit 37612b0

Browse files
yzhliu
authored and wweic committed
[topi] add ARM v8.2 udot (uint8) support (apache#3978)
* [topi] add ARM v8.2 udot (uint8) support * fix test case * fix common conv2d schedule * add back fp32_time in test * fix lint * fix doc, add support for int32_lanes=4, signed int * fix lint * add ic_bn % 4 checker in schedule
1 parent 437755d commit 37612b0

File tree

9 files changed

+633
-171
lines changed

9 files changed

+633
-171
lines changed

topi/python/topi/arm_cpu/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from . import conv2d
44
from . import depthwise_conv2d
55
from . import conv2d_transpose
6+
from . import conv2d_int8
67
from . import bitserial_conv2d
78
from . import bitserial_dense
89
from . import injective
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
18+
"""Conv2D int8 schedule on ARM"""
19+
20+
import tvm
21+
from tvm import autotvm
22+
from .. import generic, tag
23+
from ..util import get_const_tuple
24+
from ..nn.conv2d import conv2d_NCHWc_int8
25+
from ..generic import conv2d as conv2d_generic
26+
from .. import nn
27+
from ..nn.conv2d import _get_workload as _get_conv2d_workload
28+
from .tensor_intrin import dot_int8_int8_int32
29+
30+
31+
def _get_default_config(cfg, data, kernel, strides, padding, out_dtype):
    """
    Fill ``cfg`` with a fallback int8 schedule for the given workload.

    Derives the conv2d workload from the placeholders, then applies the
    1x1-specific fallback when the kernel is 1x1 and the common conv2d
    fallback otherwise. Both use 2 int32 lanes with 4 int8 elements each.
    """
    wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype)
    # Pick the fallback matching the kernel spatial shape.
    if wkl.hkernel == 1 and wkl.wkernel == 1:
        fallback = conv2d_generic.fallback_schedule_cpu_1x1_int8
    else:
        fallback = conv2d_generic.fallback_schedule_cpu_common_int8
    fallback(cfg, wkl, int32_lanes=2, num_int8_elements=4)
44+
45+
@autotvm.register_topi_compute(conv2d_NCHWc_int8, ['arm_cpu'], 'direct')
def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides,
                                 padding, dilation, layout, out_layout, out_dtype):
    """Declare an int8 conv2d in NCHWc layout for arm_cpu.

    ``layout`` and ``out_layout`` are not consumed here; they are kept in
    the signature for debug convenience when dumping autotvm workloads.
    """
    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn

    # Kernel is 7-dim in the int8 layout; only the output-channel blocking
    # and the spatial extents are needed here.
    oc_chunk, _, kh, kw, _, oc_bn, _ = get_const_tuple(kernel.shape)
    num_filter = oc_chunk * oc_bn

    # If no config was set, we can fallback to NCHW config.
    if cfg.is_fallback:
        nchw_data = tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype)
        nchw_kernel = tvm.placeholder((num_filter, in_channel, kh, kw),
                                      dtype=kernel.dtype)
        _get_default_config(cfg, nchw_data, nchw_kernel, strides, padding, out_dtype)

    return nn.conv2d_NCHWc_int8_compute(data, kernel, strides, padding,
                                        dilation, layout, out_layout, out_dtype)
70+
71+
@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc_int8, ['arm_cpu'], ['direct'])
def _schedule_conv2d_NCHWc_int8(cfg, outs):
    """Create schedule for tensors.

    Walks the compute graph rooted at ``outs[0]``, inlining broadcast-like
    ops and dispatching the conv stage to the generic int8 CPU schedule
    (1x1 or common) tensorized with the ARM v8.2 dot-product intrinsic.
    """
    s = tvm.create_schedule([x.op for x in outs])
    scheduled_ops = []

    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops:
                    traverse(tensor.op)

        if 'conv2d_NCHWc_int8' in op.tag:
            conv_out = op.output(0)
            kernel = conv_out.op.input_tensors[1]
            data_vec = conv_out.op.input_tensors[0]
            # Step back from the vectorized data to its producer unless the
            # producer is a pad stage (or data_vec is a plain placeholder).
            data = data_vec.op.input_tensors[0] \
                if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \
                else data_vec
            # If the input itself is a pad stage, unwrap once more to reach
            # the raw input tensor.
            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                data_pad = data
                data = data_pad.op.input_tensors[0]

            args = [s, cfg, data_vec, conv_out, outs[0]]
            # int8 conv kernel is 7-dim
            _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape)
            # Select udot (unsigned) vs sdot (signed) based on input dtype.
            dtype = "uint" if data.dtype == "uint8" else "int"
            if kh == 1 and kw == 1:
                conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(
                    *args, int32_lanes=4, intrin=dot_int8_int8_int32(int32_lanes=4, dtype=dtype))
            else:
                conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(
                    *args, int32_lanes=4, intrin=dot_int8_int8_int32(int32_lanes=4, dtype=dtype))

        scheduled_ops.append(op)

    traverse(outs[0].op)
    return s
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
18+
"""Conv2D int8 schedule on ARM"""
19+
20+
import tvm
21+
22+
def dot_int8_int8_int32(int32_lanes, dtype='uint'):
    """
    Int8 dot product by every 4 elements using ARM v8.2 udot/sdot.
    This function takes two arrays of int8 datatype -- data[4] and
    kernel[int32_lanes][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[int32_lanes] of int32/uint32
    datatype (matching *dtype*).
    The pseudo code is as follows.

    .. code-block:: c

        void dot_int8_int8_int32(int8 data[4], int8 kernel[int32_lanes][4],
                                 int32 output[int32_lanes]){
            for (int i = 0; i < int32_lanes; i++){
                out[i] = 0;
                for (int k = 0; k < 4; k++){
                    out[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in a vector register and
    the data[4] is broadcasted to another vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Parameters
    ----------
    int32_lanes: int
        How many int32/uint32 to produce
    dtype: str, optional, {"uint", "int"}
        Whether it works on unsigned int or signed int

    Returns
    -------
    intrin : TensorIntrin
        The ARM uint8 TensorIntrin that can be used in tensorizing schedule
    """
    num_int8_elements = 4  # 4 int8 elements in int32

    # Declared computation the intrinsic replaces: a reduction of 4 int8
    # products per output lane, accumulated in 32 bits.
    data = tvm.placeholder((num_int8_elements,), dtype='%s8' % dtype, name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='%s8' % dtype, name='kernel')

    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute((int32_lanes,),
                    lambda i: tvm.sum(data[k].astype('%s32' % dtype) *
                                      kernel[i, k].astype('%s32' % dtype),
                                      axis=k), name="C")

    # Buffer declarations so the intrinsic can be matched at arbitrary
    # offsets; the kernel buffer's outer stride is left symbolic.
    a_buffer = tvm.decl_buffer(data.shape, dtype='%s8' % dtype, name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape, dtype='%s8' % dtype, name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('s'), 1])

    def _intrin_func(ins, outs):
        # Emit the LLVM IR for one of the three intrinsic bodies.
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                # reset: zero the int32 accumulator vector
                ib.emit(outs[0].vstore(0, tvm.const(0, '%s32x%d' % (dtype, int32_lanes))))
                return ib.get()

            dtype_a = '%s8x%d' % (dtype, num_int8_elements)
            dtype_b = '%s8x%d' % (dtype, int32_lanes * num_int8_elements)
            dtype_c = '%s32x%d' % (dtype, int32_lanes)

            # Load data[4] as one 32-bit scalar, then splat it across the
            # lanes so each output lane sees the same 4 int8 values.
            a_int8 = ins[0].vload([0], dtype_a)
            re_int32 = tvm.call_pure_intrin('%s32' % dtype, 'reinterpret', a_int8)
            # broadcast a
            vec_ai32 = re_int32.astype(dtype_c)

            vec_a = tvm.call_pure_intrin(dtype_b, 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], dtype_b)
            vec_c = outs[0].vload([0], dtype_c)

            # udot/sdot accumulate into vec_c: c[i] += dot(a[i*4:i*4+4], b[i*4:i*4+4])
            inst = 'udot' if dtype == 'uint' else 'sdot'
            inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % (
                inst, int32_lanes, int32_lanes * num_int8_elements)
            vdot = tvm.call_llvm_intrin(dtype_c,
                                        inst,
                                        tvm.const(2, 'uint32'),
                                        vec_c, vec_a, vec_b)
            ib.emit(outs[0].vstore(0, vdot))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})

0 commit comments

Comments
 (0)