Skip to content

Commit 9164809

Browse files
kevinthesunvinx13
authored and committed
Improve x86 roi align (#3296)
* Improve roi_align performance for x86 * Change test
1 parent 88163ec commit 9164809

File tree

3 files changed

+219
-0
lines changed

3 files changed

+219
-0
lines changed

topi/python/topi/x86/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@
1313
from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc
1414
from .dense import _schedule_dense, _schedule_dense_pack, _schedule_dense_nopack
1515
from .batch_matmul import schedule_batch_matmul
16+
from .roi_align import roi_align_nchw

topi/python/topi/x86/roi_align.py

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements
18+
"""ROI align operator for intel cpu"""
19+
import tvm
20+
21+
from tvm import hybrid
22+
from ..vision.rcnn import roi_align_nchw
23+
24+
25+
@hybrid.script
def roi_align_nchw_ir(data, rois, pooled_size, spatial_scale, sample_ratio):
    """Hybrid routine for ROI align operator in NCHW layout.

    For every ROI the bilinear-interpolation corner indices and weights of
    each sampling point are computed once and then reused for all channels.

    Parameters
    ----------
    data : tvm.Tensor or numpy NDArray
        4-D with shape [batch, channel, height, width]

    rois : tvm.Tensor or numpy NDArray
        2-D with shape [num_roi, 5]. The last dimension should be in format of
        [batch_index, w_start, h_start, w_end, h_end]

    pooled_size : tvm ConstExpr
        [out_height, out_width]

    spatial_scale : tvm.const
        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
        of total stride in convolutional layers, which should be in range (0.0, 1.0]

    sample_ratio : tvm.const
        Number of sampling points per bin along each axis. A value <= 0 selects
        an adaptive grid of ceil(roi_size / pooled_size) points per axis.

    Returns
    -------
    output : tvm.Tensor or numpy NDArray
        4-D with shape [num_roi, channel, out_height, out_width]
    """
    channels = data.shape[1]
    height = data.shape[2]
    width = data.shape[3]
    num_rois = rois.shape[0]
    pooled_size_h = pooled_size[0]
    pooled_size_w = pooled_size[1]
    output = output_tensor((num_rois, channels, pooled_size_h, pooled_size_w), data.dtype)
    # Per-ROI scratch buffers: one slot of 4 weights / 4 corner indices per sampling point.
    # NOTE(review): capacity assumes roi_bin_grid_h * roi_bin_grid_w <= height * width;
    # nothing below enforces it (very large ROIs or sample_ratio) -- TODO confirm.
    max_num_pc_index = height * width * pooled_size_h * pooled_size_w
    w_pc = allocate((num_rois, max_num_pc_index, 4), data.dtype)
    pos_pc = allocate((num_rois, max_num_pc_index, 4), "int32")

    for n in parallel(num_rois):
        roi_batch_index = int32(rois[n, 0])
        roi_start_w = rois[n, 1] * spatial_scale
        roi_start_h = rois[n, 2] * spatial_scale
        roi_end_w = rois[n, 3] * spatial_scale
        roi_end_h = rois[n, 4] * spatial_scale

        # Force ROIs to be at least 1x1 in feature-map coordinates.
        roi_h = max(roi_end_h - roi_start_h, 1.0)
        roi_w = max(roi_end_w - roi_start_w, 1.0)

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        roi_bin_grid_h = sample_ratio
        roi_bin_grid_w = roi_bin_grid_h
        rounded_bin_h = int32(bin_h) * 1.0
        rounded_bin_w = int32(bin_w) * 1.0
        if sample_ratio <= 0:
            # Cannot use ceil function since hybrid script
            # doesn't support Call as indexing, so emulate it:
            # truncate, then add one if anything was cut off.
            roi_bin_grid_h = int32(bin_h)
            roi_bin_grid_w = int32(bin_w)
            if rounded_bin_h < bin_h:
                roi_bin_grid_h += 1
            if rounded_bin_w < bin_w:
                roi_bin_grid_w += 1

        # Number of sampling points averaged into each output element.
        count = roi_bin_grid_h * roi_bin_grid_w

        # Pre-calculate indices and weights shared by all channels.
        # This is the key point of optimization.
        pre_calc_index = 0
        iy_upper = roi_bin_grid_h
        ix_upper = roi_bin_grid_w
        for ph in range(pooled_size_h):
            for pw in range(pooled_size_w):
                for iy in range(iy_upper):
                    yy = roi_start_h + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
                    for ix in range(ix_upper):
                        xx = roi_start_w + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
                        x = xx
                        y = yy
                        if y < -1.0 or y > height or x < -1.0 or x > width:
                            # Sample lies outside the feature map: contributes zero.
                            for i in range(4):
                                w_pc[n, pre_calc_index, i] = 0.0
                                pos_pc[n, pre_calc_index, i] = 0
                        else:
                            if y < 0.0:
                                y = 0.0
                            if x < 0.0:
                                x = 0.0

                            y_low = int32(y)
                            x_low = int32(x)
                            x_high = x_low + 1
                            y_high = y_low + 1

                            # Clamp to the last row/column so both corners stay in range.
                            if y_low >= height - 1:
                                y_high = height - 1
                                y_low = y_high
                                y = float32(y_low)

                            if x_low >= width - 1:
                                x_high = width - 1
                                x_low = x_high
                                x = float32(x_low)

                            # Bilinear interpolation weights of the four corners.
                            ly = y - y_low
                            lx = x - x_low
                            hy = 1.0 - ly
                            hx = 1.0 - lx
                            w1 = hy * hx
                            w2 = hy * lx
                            w3 = ly * hx
                            w4 = ly * lx

                            pos_pc[n, pre_calc_index, 0] = x_low
                            pos_pc[n, pre_calc_index, 1] = x_high
                            pos_pc[n, pre_calc_index, 2] = y_low
                            pos_pc[n, pre_calc_index, 3] = y_high
                            w_pc[n, pre_calc_index, 0] = w1
                            w_pc[n, pre_calc_index, 1] = w2
                            w_pc[n, pre_calc_index, 2] = w3
                            w_pc[n, pre_calc_index, 3] = w4

                        pre_calc_index += 1

        # Replay the cached samples for every channel, in the same order they were stored.
        for c in range(channels):
            pre_calc_index = 0
            for ph in range(pooled_size_h):
                for pw in range(pooled_size_w):
                    output_val = 0.0
                    for iy in range(roi_bin_grid_h):
                        for ix in range(roi_bin_grid_w):
                            output_val += w_pc[n, pre_calc_index, 0] \
                                          * data[roi_batch_index, c,
                                                 pos_pc[n, pre_calc_index, 2],
                                                 pos_pc[n, pre_calc_index, 0]] \
                                          + w_pc[n, pre_calc_index, 1] \
                                          * data[roi_batch_index, c,
                                                 pos_pc[n, pre_calc_index, 2],
                                                 pos_pc[n, pre_calc_index, 1]] \
                                          + w_pc[n, pre_calc_index, 2] \
                                          * data[roi_batch_index, c,
                                                 pos_pc[n, pre_calc_index, 3],
                                                 pos_pc[n, pre_calc_index, 0]] \
                                          + w_pc[n, pre_calc_index, 3] \
                                          * data[roi_batch_index, c,
                                                 pos_pc[n, pre_calc_index, 3],
                                                 pos_pc[n, pre_calc_index, 1]]
                            pre_calc_index += 1

                    # Average over the sampling points of this bin.
                    output_val /= count
                    output[n, c, ph, pw] = output_val

    return output
182+
183+
184+
@roi_align_nchw.register("cpu")
def roi_align_nchw_cpu(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
    """CPU implementation of the ROI align operator in NCHW layout.

    Normalizes the Python-level arguments into TVM expressions and hands
    them to the hybrid-script kernel ``roi_align_nchw_ir``.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, height, width]

    rois : tvm.Tensor
        2-D with shape [num_roi, 5]. The last dimension should be in format of
        [batch_index, w_start, h_start, w_end, h_end]

    pooled_size : int or list/tuple of two ints
        output size, or [out_height, out_width]. A single int is used for
        both the output height and width.

    spatial_scale : float
        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
        of total stride in convolutional layers, which should be in range (0.0, 1.0]

    sample_ratio : int
        Optional sampling ratio of ROI align, using adaptive size by default.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [num_roi, channel, out_height, out_width]
    """
    # Expand a scalar size into an (out_height, out_width) pair.
    size_pair = pooled_size if isinstance(pooled_size, (tuple, list)) \
        else (pooled_size, pooled_size)
    return roi_align_nchw_ir(
        data,
        rois,
        tvm.convert(size_pair),
        tvm.const(spatial_scale, "float32"),
        tvm.const(sample_ratio, "int32"),
    )

topi/tests/python/test_topi_vision.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ def check_device(device):
282282
def test_roi_align():
    """Run the ROI align verification over several configurations."""
    # Positional arguments are forwarded unchanged to verify_roi_align;
    # presumably the last two are spatial_scale and sample_ratio -- TODO
    # confirm against the verify_roi_align signature.
    cases = (
        (1, 16, 32, 64, 7, 1.0, -1),
        (4, 16, 32, 64, 7, 0.5, 2),
        (1, 32, 32, 80, 8, 0.0625, 2),
    )
    for case in cases:
        verify_roi_align(*case)
285286

286287

287288
def verify_roi_pool(batch, in_channel, in_size, num_roi, pooled_size, spatial_scale):

0 commit comments

Comments
 (0)