Commit 972f019

anijain2305 authored and tmoreau89 committed
[TOPI][x86] Cascade lake support. (#4123)
* [TOPI][x86] Cascade lake support.
* Jenkins test debug 1.
* Testing cascade lake alone.
1 parent 3185e4a commit 972f019

File tree: 10 files changed, +112 additions, -80 deletions


python/tvm/relay/qnn/op/legalizations.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def _is_int8_hw_support(target):
     Checks to ensure that we can use Intel DLBoost instructions - Check if the target is skylake
     and above.
     """
-    supported_arches = {'-mcpu=skylake-avx512',}
+    supported_arches = {'-mcpu=skylake-avx512', '-mcpu=cascadelake'}
     return supported_arches.intersection(set(target.options))

     # Collect the dtypes.

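With this change, the QNN legalization pass treats Cascade Lake like Skylake when deciding whether Intel DLBoost fast-int8 instructions are usable. A minimal sketch of how the check behaves (assuming targets created via tvm.target.create, as elsewhere in this commit; a non-empty intersection means fast int8 is available):

import tvm

supported_arches = {'-mcpu=skylake-avx512', '-mcpu=cascadelake'}
for opts in ("llvm -mcpu=cascadelake", "llvm -mcpu=core-avx2"):
    target = tvm.target.create(opts)
    # Non-empty intersection only when the target's -mcpu is supported.
    print(bool(supported_arches.intersection(set(target.options))))
# prints True, then False
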
python/tvm/target.py

Lines changed: 10 additions & 0 deletions
@@ -128,6 +128,16 @@ def model(self):
             return opt.value[7:]
         return 'unknown'

+    @property
+    def mcpu(self):
+        """Returns the mcpu from the target if it exists."""
+        mcpu = ''
+        if self.options is not None:
+            for opt in self.options:
+                if 'mcpu' in opt:
+                    mcpu = opt.split('=')[1]
+        return mcpu
+
     def __enter__(self):
         _api_internal._EnterTargetScope(self)
         return self

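The new mcpu property gives one place to parse the -mcpu option that several TOPI files below previously re-derived by hand. A usage sketch (assuming tvm.target.create, which this repo's tests use):

import tvm

# The property returns the value after '-mcpu=', or '' when absent.
assert tvm.target.create("llvm -mcpu=cascadelake").mcpu == "cascadelake"
assert tvm.target.create("llvm").mcpu == ""
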
tests/python/contrib/test_gemm_acc16.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 # pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
 import tvm
 import numpy as np
-from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int16
+from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16


 def benchmark_fc_int8_acc16():
@@ -40,7 +40,7 @@ def verify(target="llvm -mcpu=skylake-avx512"):
         ctx = tvm.context(target, 0)
         X = tvm.placeholder((m, k), name='X', dtype="uint8")
         W = tvm.placeholder((n, k), name='W', dtype="int8")
-        pc = dot_16x1x16_int8_int8_int16()
+        pc = dot_16x1x16_uint8_int8_int16()
         ak = tvm.reduce_axis((0, k), name='k')

         packedW = tvm.placeholder((n//128, 128*(k//2), 2), name='packedW', dtype="int8")

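The rename reflects the intrinsic's real signature: uint8 activations times int8 weights, accumulated in int16. A NumPy reference sketch of the computation, with the kernel[4][32][2] shape from the tensor_intrin.py docstring flattened to [128][2] (an assumption made here for brevity):

import numpy as np

data = np.random.randint(0, 256, size=(2,)).astype('uint8')          # uint8 data[2]
kernel = np.random.randint(-128, 128, size=(128, 2)).astype('int8')  # int8 kernel[4*32][2]
output = kernel.astype('int16') @ data.astype('int16')               # int16 output[4*32]
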
tests/python/contrib/test_gemm_acc32_vnni.py

Lines changed: 3 additions & 3 deletions
@@ -18,8 +18,8 @@

 import tvm
 import numpy as np
-from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int32_vnni
-from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int32
+from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake
+from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32
 import pytest


@@ -46,7 +46,7 @@ def verify(target="llvm -mcpu=cascadelake"):
             return

         ctx = tvm.context(target, 0)
-        pc = dot_16x1x16_int8_int8_int32_vnni()
+        pc = dot_16x1x16_uint8_int8_int32_cascadelake()
         ak = tvm.reduce_axis((0, k), name='k')
         packedW = tvm.placeholder(
             (n // 16, 16 * (k // 4), 4), name='packedW', dtype="int8")

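Same motivation here, plus the _vnni suffix becomes _cascadelake to match the dispatch scheme introduced in tensor_intrin.py below. A NumPy reference sketch of the 16x1x16 uint8/int8 dot product with int32 accumulation (shapes taken from the docstring below):

import numpy as np

data = np.random.randint(0, 256, size=(4,)).astype('uint8')         # uint8 data[4]
kernel = np.random.randint(-128, 128, size=(16, 4)).astype('int8')  # int8 kernel[16][4]
output = kernel.astype('int32') @ data.astype('int32')              # int32 output[16]
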
tests/python/relay/test_op_level2.py

Lines changed: 62 additions & 48 deletions
@@ -576,57 +576,71 @@ def _compile(ic, oc, target, data_layout, kernel_layout, dtypes):
         assembly = lib.get_source("asm")
         return assembly

-    # compile conv2d for x86 (skylake) and test assembly contains *pmadd* instructions
-    target = "llvm -mcpu=skylake-avx512"
-    name = "llvm.x86.avx512.pmaddubs.w.512"
-    llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(name)
-    if llvm_id != 0:
-        fast_int8_dtypes = ('uint8', 'int8', 'int32')
-        # Sweep the input channels to check int8 robustness
-        for ic in range(1, 24):
-            asm = _compile(ic=ic, oc=32, target=target, data_layout="NCHW", kernel_layout='OIHW',
-                           dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
-
-        for ic in range(1, 24):
-            asm = _compile(ic=ic, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                           dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
-
-
-        # Sweep the output channels to check int8 robustness
-        for oc in range(2, 24):
-            asm = _compile(ic=16, oc=oc, target=target, data_layout="NCHW", kernel_layout='OIHW',
+    def _has_fast_int8_instructions(asm, target):
+        if 'skylake-avx512' in target:
+            return "pmaddubs" in asm
+        elif 'cascadelake' in target:
+            return "vpdpbusd" in asm
+        else:
+            assert False, "Target should be Skylake or Cascadelake"
+
+    # compile conv2d for x86 (skylake, cascadelake) and test assembly contains *pmadd* instructions
+    targets = ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"]
+    llvm_version = tvm.codegen.llvm_version_major()
+    for target in targets:
+        if llvm_version >= 8:
+            fast_int8_dtypes = ('uint8', 'int8', 'int32')
+            # Sweep the input channels to check int8 robustness
+            # Input channels should be a multiple of 4 internally.
+            for ic in [1, 4, 6]:
+                asm = _compile(ic=ic, oc=32, target=target, data_layout="NCHW",
+                               kernel_layout='OIHW',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+            for ic in [1, 4, 6]:
+                asm = _compile(ic=ic, oc=32, target=target, data_layout="NHWC",
+                               kernel_layout='HWIO',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+
+            # Sweep the output channels to check int8 robustness
+            # Output channels should be a multiple of 16 internally.
+            for oc in [4, 16, 20]:
+                asm = _compile(ic=16, oc=oc, target=target, data_layout="NCHW",
+                               kernel_layout='OIHW',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+            for oc in [4, 16, 20]:
+                asm = _compile(ic=16, oc=oc, target=target, data_layout="NHWC",
+                               kernel_layout='HWIO',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+            # Check that both non-divisible oc and ic work
+            asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW',
                            dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
+            assert _has_fast_int8_instructions(asm, target)

-        for oc in range(2, 24):
-            asm = _compile(ic=16, oc=oc, target=target, data_layout="NHWC", kernel_layout='HWIO',
+            asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
                            dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
-
-        # Check that both non-divisible oc and ic work
-        asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW',
-                       dtypes=fast_int8_dtypes)
-        assert "pmaddubs" in asm
-
-        asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                       dtypes=fast_int8_dtypes)
-        assert "pmaddubs" in asm
-
-        # Ensure that code is generated when datatypes are not HW supported.
-        dtypes = ('int8', 'int8', 'int32')
-        asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                       dtypes=dtypes)
-        # Check that intrinisic is not present in the assembly.
-        assert "pmaddubs" not in asm
-
-        # Ensure that code is generated when datatypes are not HW supported.
-        dtypes = ('uint8', 'uint8', 'int32')
-        asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                       dtypes=dtypes)
-        # Check that intrinisic is not present in the assembly.
-        assert "pmaddubs" not in asm
+            assert _has_fast_int8_instructions(asm, target)
+
+            # Ensure that code is generated when datatypes are not HW supported.
+            dtypes = ('int8', 'int8', 'int32')
+            asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
+                           dtypes=dtypes)
+            # Check that intrinisic is not present in the assembly.
+            assert not _has_fast_int8_instructions(asm, target)
+
+            # Ensure that code is generated when datatypes are not HW supported.
+            dtypes = ('uint8', 'uint8', 'int32')
+            asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
+                           dtypes=dtypes)
+            # Check that intrinisic is not present in the assembly.
+            assert not _has_fast_int8_instructions(asm, target)

     # Check that a vectorized instruction is generated for older Intel
     # generations, because we default to NCHWc layout.

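The helper looks for different opcodes because the two CPUs reach an int32 result differently: Skylake-AVX512 uses vpmaddubsw (uint8 x int8 products, pairwise add, saturating int16) followed by a widening step, while Cascade Lake's VNNI vpdpbusd accumulates four uint8 x int8 products directly into int32. A NumPy sketch of the int16 saturation hazard that vpdpbusd sidesteps:

import numpy as np

data = np.array([255, 255], dtype=np.uint8)    # worst-case activations
kernel = np.array([127, 127], dtype=np.int8)   # worst-case weights

exact = int(data.astype(np.int32) @ kernel.astype(np.int32))  # 64770
int16_max = np.iinfo(np.int16).max                            # 32767
print(exact > int16_max)  # True: the pairwise int16 sum would saturate
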
topi/python/topi/x86/conv2d_avx_1x1.py

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@
 from ..nn.util import infer_pad, get_pad_tuple
 from ..generic import conv2d as conv2d_generic
 from ..util import get_const_tuple, simplify
-from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .tensor_intrin import dot_16x1x16_uint8_int8_int32
 from .util import get_fp32_len

 def _fallback_schedule(cfg, wkl):
@@ -183,7 +183,7 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
 def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
     return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last,
                                                            int32_lanes=16,
-                                                           intrin=dot_16x1x16_int8_int8_int32())
+                                                           intrin=dot_16x1x16_uint8_int8_int32())


 def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, out_dtype):
@@ -282,7 +282,7 @@ def _schedule_conv_nhwc_pack_int8(s, cfg, data, conv_out, last):
     ic_f_outer, ic_s_outer = s[C].split(ic_outer, factor=ic_factor)
     s[C].reorder(oc_outer, oh, ow, ic_f_outer, ic_s_outer, kh, kw, oc_inner, ic_inner)

-    pc = dot_16x1x16_int8_int8_int32()
+    pc = dot_16x1x16_uint8_int8_int32()
     s[C].tensorize(oc_inner, pc)

     if C != O:

topi/python/topi/x86/conv2d_avx_common.py

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@
 from ..nn.util import infer_pad
 from ..generic import conv2d as conv2d_generic
 from ..util import get_const_tuple
-from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .tensor_intrin import dot_16x1x16_uint8_int8_int32
 from .util import get_fp32_len

 def _fallback_schedule(cfg, wkl):
@@ -209,4 +209,4 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
 def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
     return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last,
                                                               int32_lanes=16,
-                                                              intrin=dot_16x1x16_int8_int8_int32())
+                                                              intrin=dot_16x1x16_uint8_int8_int32())

topi/python/topi/x86/conv2d_int8.py

Lines changed: 5 additions & 7 deletions
@@ -57,16 +57,14 @@ def _is_int8_hw_support(data_dtype, kernel_dtype):
     is_dtype_support = data_dtype == 'uint8' and kernel_dtype == 'int8'

     # 2) Check LLVM support
-    llvm_intrin_fast_int8 = "llvm.x86.avx512.pmaddubs.w.512"
-    llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(llvm_intrin_fast_int8)
-    is_llvm_support = llvm_id != 0
+    llvm_version = tvm.codegen.llvm_version_major()
+    is_llvm_support = llvm_version >= 8

     # 3) Check target
-    target = tvm.target.current_target()
+    mcpu = tvm.target.current_target().mcpu
     is_target_support = False
-    for opt in target.options:
-        if opt == '-mcpu=skylake-avx512':
-            is_target_support = True
+    if mcpu == 'skylake-avx512' or mcpu == 'cascadelake':
+        is_target_support = True

     return is_dtype_support and is_llvm_support and is_target_support

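The gate is now three independent checks: dtypes, LLVM version, and -mcpu. A runnable sketch mirroring the logic (has_fast_int8 is a hypothetical stand-in for the private helper; it must run inside a target scope because the helper reads tvm.target.current_target()):

import tvm

def has_fast_int8(data_dtype, kernel_dtype):
    # 1) dtype check, 2) LLVM >= 8, 3) supported -mcpu: all must hold.
    is_dtype_support = data_dtype == 'uint8' and kernel_dtype == 'int8'
    is_llvm_support = tvm.codegen.llvm_version_major() >= 8
    mcpu = tvm.target.current_target().mcpu
    return is_dtype_support and is_llvm_support and mcpu in ('skylake-avx512', 'cascadelake')

with tvm.target.create("llvm -mcpu=cascadelake"):
    print(has_fast_int8('uint8', 'int8'))  # True when the local LLVM is >= 8
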
topi/python/topi/x86/tensor_intrin.py

Lines changed: 21 additions & 9 deletions
@@ -19,15 +19,27 @@
 import tvm


-def dot_16x1x16_int8_int8_int32():
+def dot_16x1x16_uint8_int8_int32():
+    """Dispatch the most optimized intrin depending on the target"""
+    mcpu = tvm.target.current_target().mcpu
+
+    assert mcpu in ("skylake-avx512", "cascadelake"), \
+        "An old Intel machine that does not have fast Int8 support."
+    if mcpu == "skylake-avx512":
+        return dot_16x1x16_uint8_int8_int32_skylake()
+    # cascadelake
+    return dot_16x1x16_uint8_int8_int32_cascadelake()
+
+
+def dot_16x1x16_uint8_int8_int32_skylake():
     """
     Int8 dot product by every 4 elements using AVX512 Skylake instructions.
-    This function takes two arrays of int8 datatype -- data[4] and
+    This function takes two arrays of uint8 and int8 datatype -- data[4] and
     kernel[16][4] -- and computes a dot product of data[4] with every
     4 elements of kernels, resulting in output[16] of int32 datatype.
     The pseudo code is as follows.
     .. code-block:: c
-        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
+        void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4],
                 int32 output[16]){
             for (int i = 0; i < 16; i++){
                 output[i] = 0;
@@ -100,15 +112,15 @@ def _instr(index):
     return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})


-def dot_16x1x16_int8_int8_int16():
+def dot_16x1x16_uint8_int8_int16():
     """
     Int8 dot product by every 2 elements using AVX512 Skylake instructions.
-    This function takes two arrays of int8 datatype -- data[2] and
+    This function takes two arrays of uint8 and int8 datatype -- data[2] and
     kernel[4][32][2] -- and computes a dot product of data[2] with every
     2 elements of kernels, resulting in output[4][32] of int16 datatype.
     The pseudo code is as follows.
     .. code-block:: c
-        void dot_16x1x16_int8_int8_int16(int8 data[2], int8 kernel[32*4][2],
+        void dot_16x1x16_uint8_int8_int16(uint8 data[2], int8 kernel[32*4][2],
                 int16 output[32*4]){
             for (int i = 0; i< 4; i++){
                 for (int j = 0; j < 32; j++){
@@ -182,15 +194,15 @@ def _instr(index):
     return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})


-def dot_16x1x16_int8_int8_int32_vnni():
+def dot_16x1x16_uint8_int8_int32_cascadelake():
     """
     Int8 dot product by every 4 elements using AVX512VNNI Cascade Lake instructions.
-    This function takes two arrays of int8 datatype -- data[4] and
+    This function takes two arrays of uint8 and int8 datatype -- data[4] and
     kernel[16][4] -- and computes a dot product of data[4] with every
     4 elements of kernels, resulting in output[16] of int32 datatype.
     The pseudo code is as follows.
     .. code-block:: c
-        void dot_16x1x16_int8_int8_int32_vnni(int8 data[4], int8 kernel[16][4],
+        void dot_16x1x16_uint8_int8_int32_cascadelake(uint8 data[4], int8 kernel[16][4],
                 int32 output[16]){
             for (int i = 0; i < 16; i++){
                 output[i] = 0;

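Callers such as conv2d_avx_1x1.py and conv2d_avx_common.py now import only the dispatching front end. A usage sketch (the dispatcher reads the current target scope, and its assert rejects CPUs without fast int8):

import tvm
from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32

with tvm.target.create("llvm -mcpu=skylake-avx512"):
    intrin = dot_16x1x16_uint8_int8_int32()  # skylake variant (pmaddubs path)

with tvm.target.create("llvm -mcpu=cascadelake"):
    intrin = dot_16x1x16_uint8_int8_int32()  # cascadelake variant (vpdpbusd path)
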
topi/python/topi/x86/util.py

Lines changed: 3 additions & 5 deletions
@@ -19,10 +19,8 @@
 import tvm

 def get_fp32_len():
+    mcpu = tvm.target.current_target().mcpu
     fp32_vec_len = 8
-    target = tvm.target.current_target()
-    if target is not None:
-        for opt in target.options:
-            if opt == '-mcpu=skylake-avx512':
-                fp32_vec_len = 16
+    if mcpu == 'skylake-avx512' or mcpu == 'cascadelake':
+        fp32_vec_len = 16
     return fp32_vec_len

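With the mcpu property, the vector-width lookup reduces to a single comparison. A usage sketch (both AVX-512 CPUs have 512-bit registers, i.e. 16 fp32 lanes; other CPUs keep the default of 8):

import tvm
from topi.x86.util import get_fp32_len

with tvm.target.create("llvm -mcpu=cascadelake"):
    assert get_fp32_len() == 16  # 512 bits / 32-bit floats
with tvm.target.create("llvm -mcpu=core-avx2"):
    assert get_fp32_len() == 8   # default fallback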