Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 27 additions & 22 deletions python/tvm/contrib/ethosu/cascader/device_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def _get_input_block(
input_shape: _Shape,
dtype: str,
op_type: str,
is_partkernel: bool,
partkernel: bool,
stride_h: int,
stride_w: int,
dilated_kernel_h: int,
Expand All @@ -310,7 +310,7 @@ def _get_input_block(

if op_type == "ethosu_conv2d":
if dtype == "int8":
if is_partkernel:
if partkernel:
depth = self._align(min(32, input_shape.depth), 8)
else:
depth = self._align(min(16, input_shape.depth), 8)
Expand All @@ -336,7 +336,7 @@ def get_kernel_steps(
dilated_kernel_h: int,
dilated_kernel_w: int,
ifm_dtype: str,
is_partkernel: bool = False,
partkernel: bool = False,
) -> List[int]:
"""Calculate the total number of subkernels and their sizes

Expand All @@ -351,7 +351,7 @@ def get_kernel_steps(
Width of dilated kernel
ifm_dtype: str
Datatype of the Input Feature Map tensor (IFM)
is_partkernel: bool
partkernel: bool
Flag showing whether part-kernel first traversal is used

Returns
Expand All @@ -368,7 +368,7 @@ def get_kernel_steps(
kernel_steps = []
for y, x in subkernels:
subkernel_elements = x * y
if op_type == "ethosu_conv2d" and is_partkernel:
if op_type == "ethosu_conv2d" and partkernel:
# Part-kernel-first traversal conv2d
divisor = 4 if ifm_dtype == "int8" else 2
kernel_steps.append(int(_round_up_div(subkernel_elements, divisor)))
Expand Down Expand Up @@ -509,29 +509,31 @@ def get_elementwise_block_config(
banks_available -= 2

# Split the block in half until it fits into SHRAM
max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
if output_layout == "NHCWB16":
split_order = (a for a in [1, 3, 2])
output_block = [
output_shape[0],
min(output_shape[1], self._max_block_shape.height),
min(output_shape[2] * output_shape[4], self._max_block_shape.depth),
min(output_shape[3], self._max_block_shape.width),
_round_up(min(output_shape[1], max_height), self._micro_block.height),
min(output_shape[2] * output_shape[4], max_depth),
_round_up(min(output_shape[3], max_width), self._micro_block.width),
16,
]
else:
split_order = (a for a in [1, 2, 3])
output_block = [
output_shape[0],
min(output_shape[1], self._max_block_shape.height),
min(output_shape[2], self._max_block_shape.width),
min(output_shape[3], self._max_block_shape.depth),
_round_up(min(output_shape[1], max_height), self._micro_block.height),
_round_up(min(output_shape[2], max_width), self._micro_block.width),
_round_up(min(output_shape[3], max_depth), self._micro_block.depth),
]
split_axis = next(split_order)

offset = [0] * len(output_block)
stripes = [1] * len(output_block)
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
while True:
# Create stripe config for output block
offset = [0] * len(output_block)
stripes = [1] * len(output_block)
order = [1, 2, 4, 3, 0] if output_layout == "NHCWB16" else [1, 2, 3, 4]
output_stripe_config = StripeConfig(
output_block, output_block, output_block, order, stripes, offset
)
Expand Down Expand Up @@ -564,10 +566,12 @@ def get_elementwise_block_config(
block_config.append(BlockConfig(output_block, output_block, 0, output_cycles))
break

if output_block[split_axis] == 1:
if output_block[split_axis] == self._micro_block.as_list()[split_axis]:
split_axis = next(split_order)

output_block[split_axis] = _round_up_div(output_block[split_axis], 2)
output_block[split_axis] = _round_up(
_round_up_div(output_block[split_axis], 2), self._micro_block.as_list()[split_axis]
)

return block_config

Expand Down Expand Up @@ -670,9 +674,9 @@ def get_valid_block_configs(

# Input block depth has additional limitations for operators that require full input depth
input_block_depth = 0
is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
if op_type == "ethosu_conv2d":
if is_partkernel:
if partkernel:
input_block_depth = min(ifm_channels, 16)
else:
input_block_depth = min(ifm_channels, 32)
Expand Down Expand Up @@ -745,7 +749,8 @@ def get_valid_block_configs(
kernel_h,
kernel_w,
ifm_channels,
is_partkernel,
"int8",
partkernel,
)
block_config = BlockConfig(
input_block_shape.as_list(), output_block, compute_cycles, output_cycles
Expand All @@ -767,15 +772,15 @@ def _estimate_compute_cycles_per_block(
kernel_w: int,
input_channels: int,
ifm_dtype: str,
is_partkernel: bool = False,
partkernel: bool = False,
) -> Tuple[int, int]:
# Calculate the amount of micro blocks per block, per axis
num_quantum_x = _round_up_div(block_shape.width, self._micro_block.width)
num_quantum_y = _round_up_div(block_shape.height, self._micro_block.height)
num_quantum_z = _round_up_div(block_shape.depth, self._micro_block.depth)
num_quantum_xy = num_quantum_x * num_quantum_y

kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, is_partkernel)
kernel_steps = self.get_kernel_steps(op_type, kernel_h, kernel_w, ifm_dtype, partkernel)

wd_cycles = self._get_weight_decoder_cycles(op_type)
delay_cycles = self._get_delay_cycles(op_type, ifm_dtype)
Expand All @@ -794,7 +799,7 @@ def _estimate_compute_cycles_per_block(
elif subkernel_steps > 1:
compute_cycles += delay_cycles * (subkernel_steps - 1) * num_quantum_z

if is_partkernel:
if partkernel:
compute_cycles *= _round_up_div(input_block_shape.depth, 8)

if op_type == "ethosu_conv2d":
Expand Down
24 changes: 13 additions & 11 deletions src/contrib/ethosu/cascader/parts/ethosu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri
BlockConfig best_block_config;
float best_cost = std::numeric_limits<float>::infinity();
std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();

for (const auto& block_config : valid_block_configs_) {
std::vector<int> output_block = block_config->GetOutputBlockShape();
Expand All @@ -86,7 +88,7 @@ const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stri
mul_reduce(output_stripe_shape);

// Single buffering hardware optimization
if (mul_reduce(output_stripe_shape) <= 2 * mul_reduce(output_block)) {
if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
relative_cost /= 2;
}

Expand All @@ -107,25 +109,25 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out
std::vector<int64_t> bytes_per_input =
GetBytesRead(block_shape, output_stripe_config->GetShape());

int elements_per_block = mul_reduce(block_shape);
int bytes_per_output = elements_per_block;
float num_blocks = 1.0f;
for (size_t i = 0; i < block_shape.size(); i++) {
if (buffer_mode == BufferMode::RECOMPUTE) {
num_blocks *= static_cast<float>(output_stripe_config->GetShape()[i] *
output_stripe_config->GetStripes()[i]) /
block_shape[i];
num_blocks *= std::max(static_cast<float>(output_stripe_config->GetShape()[i]) /
block_shape[i] * output_stripe_config->GetStripes()[i],
1.0f);
} else {
num_blocks *=
std::max(static_cast<float>(output_stripe_config->GetExtent()[i]) / block_shape[i], 1.0f);
std::max(static_cast<float>(output_tensor_->GetShape()[i]) / block_shape[i], 1.0f);
}
}
float num_stripes = mul_reduce(output_stripe_config->GetStripes()) - 1.0f;

float num_stripes = mul_reduce(output_stripe_config->GetStripes());
std::vector<int64_t> read_bytes;
for (int block_bytes : bytes_per_input) {
read_bytes.push_back((num_blocks + num_stripes) * block_bytes);
for (int64_t stripe_bytes : bytes_per_input) {
read_bytes.push_back(num_stripes * stripe_bytes);
}
int64_t write_bytes = (num_blocks + num_stripes) * bytes_per_output;
int64_t write_bytes =
num_blocks * mul_reduce(block_shape) * output_tensor_->GetDataType().bytes();

int block_output_cycles = block_config->GetOutputCycles();
int block_compute_cycles = block_config->GetComputeCycles();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
((1, 7, 10, 16), (1, 7, 1, 10, 16)),
((1, 7, 6, 16), (1, 7, 1, 6, 16)),
# Pooling
((1, 1, 2, 80), (1, 1, 5, 2, 16)),
((1, 1, 2, 16), (1, 1, 1, 2, 16)),
((1, 10, 6, 16), (1, 10, 1, 6, 16)),
],
),
Expand All @@ -225,7 +225,7 @@
((1, 8, 20, 16), (1, 8, 1, 20, 16)),
((1, 14, 6, 16), (1, 14, 1, 6, 16)),
# Pooling
((1, 2, 2, 48), (1, 2, 3, 2, 16)),
((1, 2, 2, 16), (1, 2, 1, 2, 16)),
((1, 10, 12, 16), (1, 10, 1, 12, 16)),
],
),
Expand Down
2 changes: 2 additions & 0 deletions tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def test_ethosu_part():
)
input_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8")
part.set_input(0, input_tensor)
output_tensor = cs.Tensor(shape=[1, 66, 74, 16], dtype="int8")
part.set_output(output_tensor)

assert part.get_stripe_align_hint() == output_quantum
# Check that the performance model runs, don't verify output
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def test_conv_performance(
)
part.set_input(0, cs.Tensor(in_shape, "int8"))
part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8"))
part.set_output(cs.Tensor(out_shape, "int8"))

stripes = [1] * len(output_quantum)
offset = [0] * len(output_quantum)
Expand Down