From feef0a013907189668c1874543a4d8d8508e4541 Mon Sep 17 00:00:00 2001 From: Prithviraj-R Date: Mon, 15 Jul 2024 02:43:16 -0700 Subject: [PATCH] [CM Conv] Enable SIMD16 Support for DPAS Convolutions --- tools/common_lib/src/conv.h | 165 +++--- tools/common_lib/src/layers_utils.h | 1 + .../kernels/conv_nchw_dpas_fp16.cpp | 533 ++++++++++++++++++ .../cross_runner/kernels/reorder_weights.cpp | 233 ++++---- 4 files changed, 753 insertions(+), 179 deletions(-) create mode 100644 tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp diff --git a/tools/common_lib/src/conv.h b/tools/common_lib/src/conv.h index fed0457..4800b8c 100644 --- a/tools/common_lib/src/conv.h +++ b/tools/common_lib/src/conv.h @@ -342,7 +342,7 @@ class ConvolutionBaseDispatcher : public NodeDispatcher bool managed_weights = false; bool algo_winograd = false; bool transposed = false; - + bool use_constant_buffer = false; bool dump_weights = false; bool use_dnnl_for_reference_calculations = false; @@ -362,11 +362,11 @@ class ConvolutionBaseDispatcher : public NodeDispatcher opts->add_flag("--no_bias", params.no_bias); opts->add_flag("--allow_fp16_computations", params.allow_fp16_computations); opts->add_flag("--managed_weights", params.managed_weights); - opts->add_option("--activation", params.activation); + opts->add_option("--activation", params.activation.type); opts->add_flag("--algo_winograd", params.algo_winograd); opts->add_flag("--transposed", params.transposed); opts->add_flag("--dnnl_reference", params.use_dnnl_for_reference_calculations)->default_val(false); - + opts->add_flag("--use_constant_buffer", params.use_constant_buffer); opts->add_flag("--dump_weights", params.dump_weights); } }; @@ -391,9 +391,12 @@ class ConvolutionBaseDispatcher : public NodeDispatcher assert(params_.groups >= 1); const auto output_shape = get_output_shape(); - prepare_constant_data(); + if (use_constant()) + { + prepare_constant_data(); + } - if (!params_.no_bias) + if (use_bias()) { bias_data_ = 
std::vector(output_shape.c * get_data_type_bytes_width(params_.dt)); } @@ -449,8 +452,11 @@ class ConvolutionBaseDispatcher : public NodeDispatcher bias_buffer_ = create_buffer(d3d12_device, tensor_bias_bytes_width, D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); } - constant_buffer_ = create_buffer(d3d12_device, tensor_constant_bytes_width, - D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + if (use_constant()) + { + constant_buffer_ = create_buffer(d3d12_device, tensor_constant_bytes_width, + D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + } output_buffer_ = create_buffer(d3d12_device, tensor_out_bytes_width, D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); @@ -469,7 +475,10 @@ class ConvolutionBaseDispatcher : public NodeDispatcher std::memcpy(upload_mapped_ptr + memcopy_offset, bias_data_.data(), tensor_bias_bytes_width); memcopy_offset += tensor_bias_bytes_width; } - std::memcpy(upload_mapped_ptr + memcopy_offset, constant_data_.data(), tensor_constant_bytes_width); + if (use_constant()) + { + std::memcpy(upload_mapped_ptr + memcopy_offset, constant_data_.data(), tensor_constant_bytes_width); + } // unmap memory upload_buffer_->Unmap(0, nullptr); @@ -483,7 +492,10 @@ class ConvolutionBaseDispatcher : public NodeDispatcher cmd_list->CopyBufferRegion(bias_buffer_.Get(), 0, upload_buffer_.Get(), memcopy_offset, tensor_bias_bytes_width); memcopy_offset += tensor_bias_bytes_width; } - cmd_list->CopyBufferRegion(constant_buffer_.Get(), 0, upload_buffer_.Get(), memcopy_offset, tensor_constant_bytes_width); + if (use_constant()) + { + cmd_list->CopyBufferRegion(constant_buffer_.Get(), 0, upload_buffer_.Get(), memcopy_offset, tensor_constant_bytes_width); + } std::vector barriers; 
barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(input_buffer_.Get(), @@ -495,8 +507,11 @@ class ConvolutionBaseDispatcher : public NodeDispatcher barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(bias_buffer_.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); } - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(constant_buffer_.Get(), - D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + if (use_constant()) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(constant_buffer_.Get(), + D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + } cmd_list->ResourceBarrier(static_cast(barriers.size()), barriers.data()); } @@ -602,7 +617,10 @@ class ConvolutionBaseDispatcher : public NodeDispatcher { return !params_.no_bias; } - + inline bool use_constant() const + { + return params_.use_constant_buffer; + } std::vector get_dnnl_result(std::size_t reference_dispatch_iterations) const { const auto output_shape = get_output_shape(); @@ -1043,7 +1061,7 @@ class ConvolutionUmdD3d12Dispatcher : public ConvolutionBaseDispatcher input_memory_desc_ = to_dnnl_mem_desc(params_.input_shape, params_.input_layout, params_.dt); output_memory_desc_ = to_dnnl_mem_desc(get_output_shape(), params_.output_layout, params_.dt); - if (!params_.no_bias) + if (use_bias()) { bias_memory_desc_.emplace(to_dnnl_mem_desc(TensorShape{ get_output_shape().c, 0, 0, 0}, DataLayout::eO, params_.dt)); } @@ -1253,12 +1271,12 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher wr_params.k_size = params_.filter_shape.w; wr_params.input_layout = DataLayout::eOIYX; - /*if (params_.dt == DataType::eFp16 && params_.filter_shape.w == 1 && params_.filter_shape.h == 1) + if (params_.dt == DataType::eFp16) { wr_params.output_layout = DataLayout::eIO_i8_o8_i2; } else if (params_.dt == DataType::eFp16 && params_.filter_shape.w != 1 && params_.filter_shape.h != 1) - {*/ + { if (cm_params_.block_oc == 8) { 
wr_params.output_layout = DataLayout::eOYXI_o8; @@ -1267,7 +1285,7 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { wr_params.output_layout = DataLayout::eOYXI_o16; } - //} + } weights_reorder_.emplace(WeightsReorder(std::move(wr_params), filter_buffer_, constant_buffer_, intc_ext, d3d12_device, cmd_list)); } @@ -1276,11 +1294,14 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { // input, filter std::vector desc_list = { DescType::eSrv, DescType::eSrv }; - if (!params_.no_bias) + if (use_bias()) + { + desc_list.push_back(DescType::eSrv); + } + if (constant_buffer_) { desc_list.push_back(DescType::eSrv); } - desc_list.push_back(DescType::eSrv); // output desc_list.push_back(DescType::eUav); root_signature_ = create_root_signature(d3d12_device_, desc_list); @@ -1307,31 +1328,36 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher build_options += pre_jit + name + between_name_and_value + value_str + post_jit; }; - add_define("DT", static_cast(params_.dt)); - //add_define("INPUT_WIDTH", params_.input_shape.w); - //add_define("INPUT_HEIGHT", params_.input_shape.h); - //add_define("INPUT_CHANNELS", params_.input_shape.c); - - //add_define("OUTPUT_WIDTH", output_shape_.w); - //add_define("OUTPUT_HEIGHT", output_shape_.h); - //("OUTPUT_CHANNELS", output_shape_.c); - - //add_define("BATCH", params_.input_shape.n); - //add_define("INPUT_PAD", params_.in_pad); - //add_define("OUTPUT_PAD", params_.out_pad); + if (params_.allow_fp16_computations) + { + add_define("DT_ACCU", "half"); + } + else + { + add_define("DT_ACCU", "float"); + } + add_define("INPUT_WIDTH", params_.input_shape.w); + add_define("INPUT_HEIGHT", params_.input_shape.h); + add_define("INPUT_CHANNELS", params_.input_shape.c); + add_define("OUTPUT_WIDTH", output_shape_.w); + add_define("OUTPUT_HEIGHT", output_shape_.h); + add_define("OUTPUT_CHANNELS", output_shape_.c); + add_define("BATCH", params_.input_shape.n); + add_define("INPUT_PAD", params_.in_pad); 
+ add_define("OUTPUT_PAD", params_.out_pad); add_define("USE_BIAS", !params_.no_bias); add_define("KERNEL_SIZE", params_.filter_shape.h); add_define("STRIDE_W", params_.stride.w); - //add_define("STRIDE_H", params_.stride.h); - - //add_define("SLICE_IC", cm_params_.slice_ic); - //add_define("BLOCK_W", cm_params_.block_w); - //add_define("BLOCK_H", cm_params_.block_h); - //add_define("BLOCK_OC", cm_params_.block_oc); - //add_define("BLOCK_BATCH", cm_params_.block_batch); - - //add_define("WEIGHTS_IN_OPTIMAL_FORMAT", cm_params.reorder_weights); - + add_define("STRIDE_H", params_.stride.h); + add_define("SLICE_IC", cm_params_.slice_ic); + add_define("BLOCK_W", cm_params_.block_w); + add_define("BLOCK_H", cm_params_.block_h); + add_define("BLOCK_OC", cm_params_.block_oc); + add_define("BLOCK_BATCH", cm_params_.block_batch); + add_define("WEIGHTS_IN_OPTIMAL_FORMAT", cm_params.reorder_weights); + add_define("INPUT_LAYOUT", (params_.input_layout == DataLayout::eNCHW) ? 0 : 1); + add_define("USE_RELU", (params_.activation.type == ActivationType::eRelu) ? 1 : 0); + add_define("WEI_OFFSET", 0); // Kernel uses this compile time flag as base offset to weights surface in the actual driver mode, So forcing it to Zero in cross-runner // kernel compilation const auto dump_asm_str = cm_params_.dump_asm ? " -mdump_asm" : ""; const auto large_grf_str = cm_params_.large_grf ? 
" -Qxcm_doubleGRF" : ""; @@ -1348,16 +1374,8 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher auto kernel_source_content = [](const auto kernel_size) { - std::string path = ""; - if (false/*kernel_size == 1*/) - { - path = "conv_1x1_nchw_fp16.cpp"; - } - else - { - path = "conv_nchw_fp16.cpp"; - } - + std::string path = "conv_nchw_dpas_fp16.cpp"; + std::fstream file(path); if (!file.is_open()) { @@ -1380,7 +1398,7 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { // input, weights, output std::uint32_t descriptor_count = 4; - if (!params_.no_bias) + if (use_bias()) { descriptor_count++; } @@ -1416,7 +1434,10 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { resources_list.push_back({ DescType::eSrv, bias_buffer_.Get() }); } - resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + if (constant_buffer_) + { + resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + } const auto tensor_out_bytes_width = output_buffer_->GetDesc().Width; resources_list.push_back({ DescType::eUav, output_buffer_.Get() }); @@ -1452,7 +1473,8 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher const uint32_t out_ch_size = static_cast(std::ceil(params_.filter_shape.n / (double)(cm_params_.block_oc))); const auto gws_x = cm_params_.slice_ic * (round_up_next_multiple(output_shape_.w, cm_params_.block_w) / cm_params_.block_w); const auto gws_y = round_up_next_multiple(output_shape_.h, cm_params_.block_h) / cm_params_.block_h; - const auto gws_z = (params_.input_shape.n / cm_params_.block_batch) * out_ch_size; + const auto execsize = 2; // BMG = 2, DG2 = 1 + const auto gws_z = ((params_.input_shape.n / cm_params_.block_batch) * out_ch_size) / execsize; assert(gws_x % cm_params_.lws[0] == 0); assert(gws_y % cm_params_.lws[1] == 0); @@ -1503,12 +1525,13 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher std::uint32_t gws_z = 0; if (output_layout == DataLayout::eIO_i8_o8_i2) 
{ - const std::uint32_t ic_chunks_per_hw_thread = 8; - const std::uint32_t exec_size = 8; + const std::uint32_t exec_size = 16; // BMG = 16, DG2 = 8 const std::uint32_t dpas_depth = 8; + const std::uint32_t ic_chunks_per_hw_thread = 2; const std::uint32_t out_dt_size = get_data_type_bytes_width(output_dt); + const std::uint32_t ic_multipler = (ic_chunks_per_hw_thread * dpas_depth * out_dt_size); gws_x = oc / exec_size; - gws_y = ic / (ic_chunks_per_hw_thread * dpas_depth * out_dt_size); + gws_y = (ic % ic_multipler == 0) ? ic / ic_multipler : (ic / ic_multipler) + ic % ic_multipler; gws_z = 1; } else if (output_layout == DataLayout::eOYXI_o8) @@ -1572,23 +1595,17 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher build_options += pre_jit + name + between_name_and_value + value_str + post_jit; }; - if (params_.input_dt == DataType::eFp16 && params_.output_dt == DataType::eFp16) - { - add_define("DT", "half"); - } - else - { - add_define("DT", "float"); - } - //add_define("WEI_OFFSET", 0); - //add_define("IC", params_.ic); - //add_define("OC", params_.oc); + add_define("INPUT_TYPE", "half"); + add_define("OUTPUT_TYPE", "half"); + add_define("WEI_OFFSET", 0); + add_define("IC", params_.ic); + add_define("OC", params_.oc); add_define("K_SIZE", params_.k_size); - /*for (std::int32_t i = static_cast(DataLayout::eWeightsLayoutStart) + 1; i < static_cast(DataLayout::eCount); i++) + for (std::int32_t i = static_cast(DataLayout::eWeightsLayoutStart) + 1; i < static_cast(DataLayout::eCount); i++) { add_define("LAYOUT_" + data_layout_name(static_cast(i)), i); - }*/ + } add_define("INPUT_LAYOUT", static_cast(params_.input_layout)); add_define("OUTPUT_LAYOUT", static_cast(params_.output_layout)); @@ -1607,11 +1624,11 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher // kernel compilation const auto dump_asm_str = " -mdump_asm"; const auto print_reg_str = " -mCM_printregusage"; - + const auto large_grf_str = " -Qxcm_doubleGRF"; const auto lws_x 
= " -DLWS_SIZE_X=" + std::to_string(params_.lws[0]); const auto lws_y = " -DLWS_SIZE_Y=" + std::to_string(params_.lws[1]); const auto lws_z = " -DLWS_SIZE_Z=" + std::to_string(params_.lws[2]); - const auto build_options_final = " -I \" \" " + build_options + dump_asm_str + print_reg_str + lws_x + lws_y + lws_z; + const auto build_options_final = " -I \" \" " + build_options + dump_asm_str + large_grf_str + print_reg_str + lws_x + lws_y + lws_z; CD3DX12_SHADER_BYTECODE byte_code; byte_code.pShaderBytecode = kernel_source_content.data(); @@ -1655,7 +1672,6 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { assert(input_buffer_); assert(output_buffer_); - assert(constant_buffer_); const auto desc_heap_incrs_size = d3d12_device_->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // i.e. add weights reorder @@ -1666,7 +1682,10 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher std::vector> resources_list; resources_list.reserve(get_total_descriptor_count()); resources_list.push_back({ DescType::eSrv, input_buffer_.Get() }); - resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + if (constant_buffer_) + { + resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + } resources_list.push_back({ DescType::eUav, output_buffer_.Get() }); gpu_handles_ = create_resource_views_and_handles(d3d12_device_, resources_list, base_cpu_handle, base_gpu_handle); diff --git a/tools/common_lib/src/layers_utils.h b/tools/common_lib/src/layers_utils.h index c3e909d..4c5c534 100644 --- a/tools/common_lib/src/layers_utils.h +++ b/tools/common_lib/src/layers_utils.h @@ -190,6 +190,7 @@ inline std::string data_layout_name(DataLayout l) case DataLayout::eNHWC: return "NHWC"; case DataLayout::eCHW: return "CHW"; case DataLayout::eW: return "W"; + case DataLayout::eO: return "O"; case DataLayout::eOIYX: return "OIYX"; case DataLayout::eIO_i8_o8_i2: return "IO_i8_o8_i2"; case DataLayout::eOYXI_o8: return 
"OYXI_o8"; diff --git a/tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp b/tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp new file mode 100644 index 0000000..170434c --- /dev/null +++ b/tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp @@ -0,0 +1,533 @@ +/*========================== begin_copyright_notice ============================ + +INTEL CONFIDENTIAL + +Copyright (C) 2023 Intel Corporation + +This software and the related documents are Intel copyrighted materials, +and your use of them is governed by the express license under which they were +provided to you ("License"). Unless the License provides otherwise, +you may not use, modify, copy, publish, distribute, disclose or transmit this +software or the related documents without Intel's prior written permission. + +This software and the related documents are provided as is, with no express or +implied warranties, other than those that are expressly stated in the License. + +============================= end_copyright_notice ===========================*/ + +#include +#include + +#if !CM_HAS_DPAS +#error [Error_device_no_dpas] Kernel designed to use dpas. Current device does not support dpas. +#endif + +#if !CM_HAS_LSC +#error [Error_device_no_lsc] Kernel designed to use lsc. Current device does not support lsc. 
+#endif + +#if BLOCK_W > 8 +#error [Error_kernel_config_unsupported_block_w] Kernel designed to work with block_w in range: <1; 8>; +#endif + +#if BLOCK_OC != 8 && BLOCK_OC != 16 && BLOCK_OC != 32 && BLOCK_OC != 40 && BLOCK_OC != 64 && BLOCK_OC != 80 +#error [Error_kernel_config_unsupported_block_oc] Kernel designed to work with block_oc equal to 8 or 16 or 32 or 40 or 64 or 80; +#endif + +#define DPAS_DEPTH 8 +#if(CM_GENX >= 1280) +#define EXEC_SIZE 16 +#else +#define EXEC_SIZE 8 +#endif +#define OUTPUT_CHANNEL_MULTIPLIER (EXEC_SIZE/DPAS_DEPTH) +#define BLOCK_H 1 +#define DT_OUT half +#define DT_IN half +#define DT_IN_SIZE 2 +#define DT_WEIGHTS half +// accumulator on DG2 has to be float for half dt inputs +#define DT_ACCU float + +#define DPAS_INPUT_CHANNELS (DPAS_DEPTH * sizeof(DT_IN)) +#define DPAS_OUTPUT_CHANNELS EXEC_SIZE +#define DPAS_RC BLOCK_W + +// currently fixed at 1; can be tuned for larger input channel sizes in any future case +#define SLICE_IC 1 + +#define CONV_LOOP_COUNT ((INPUT_CHANNELS/DPAS_INPUT_CHANNELS) / SLICE_IC) + +#define WEIGHTS_REG_SIZE (DPAS_INPUT_CHANNELS * DPAS_OUTPUT_CHANNELS) + +#define INPUT_NCHW_PLANE_SIZE (INPUT_WIDTH * INPUT_HEIGHT * sizeof(DT_IN)) +#define OUTPUT_NCHW_PLANE_SIZE (OUTPUT_WIDTH * OUTPUT_HEIGHT * sizeof(DT_OUT)) + +#define INPUT_NHWC_PLANE_SIZE (INPUT_CHANNELS * sizeof(DT_IN)) +#define OUTPUT_NHWC_PLANE_SIZE (OUTPUT_CHANNELS * sizeof(DT_OUT)) + +#define NCHW 0 +#define NHWC 1 + +#define LOAD_3x3_BLOCK_SIZE (BLOCK_W + 2) +#define LOAD_3x3_BLOCK_START 0 +#define LOAD_3x3_BLOCK_END 9 + +#if(INPUT_LAYOUT == NHWC) +#define OUTPUT_DPAS_OFFSET DPAS_OUTPUT_CHANNELS * sizeof(DT_OUT) +#else +#define OUTPUT_DPAS_OFFSET (DPAS_OUTPUT_CHANNELS * OUTPUT_HEIGHT * OUTPUT_WIDTH) * sizeof(DT_OUT) +#endif + +static const uint32_t init_linear_offsets_16[] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; + +template +_GENX_ inline vector load_3x3_input(SurfaceIndex surface [[type("buffer_t")]], int
input_offset, int w_chunk_id) +{ +#if(INPUT_LAYOUT == NHWC) + const uint32_t LOAD_W_WIDTH = DPAS_INPUT_CHANNELS; +#else + const uint32_t LOAD_W_WIDTH = LOAD_W * STRIDE_W; +#endif + const uint32_t LOAD_W_BYTES_WIDTH = LOAD_W_WIDTH * sizeof(DT_IN); + const uint32_t LOAD_W_DWORDS = LOAD_W_BYTES_WIDTH / sizeof(uint32_t); + vector data_out; + vector load_offsets(init_linear_offsets_16); + const int current_kw = w_chunk_id * BLOCK_W * STRIDE_W; + const float left_pad = (current_kw == LOAD_3x3_BLOCK_START) ? 0.0f : 1.0f; + const float right_pad = ((current_kw + LOAD_3x3_BLOCK_END) > INPUT_WIDTH) ? 0.0f : 1.0f; +#if(INPUT_LAYOUT == NHWC) + load_offsets += input_offset - INPUT_CHANNELS * sizeof(DT_IN); + #pragma unroll + for(int i = 0; i < LOAD_W + 2; i++) + { + vector load_chunk = cm_load(surface, load_offsets); + if( i == LOAD_3x3_BLOCK_START ) + { + load_chunk *= left_pad; + } + if( i == LOAD_3x3_BLOCK_END ) + { + load_chunk *= right_pad; + } + data_out.select(i * DPAS_INPUT_CHANNELS) = load_chunk.select(); + load_offsets += INPUT_NHWC_PLANE_SIZE; + } +#else + load_offsets += input_offset - INPUT_PAD * sizeof(DT_IN); + #pragma unroll + for(int i = 0; i < DPAS_INPUT_CHANNELS; i++) + { + vector load_chunk = cm_load(surface, load_offsets); + load_chunk[LOAD_3x3_BLOCK_START] *= left_pad; + load_chunk[LOAD_3x3_BLOCK_END] *= right_pad; + data_out.select(i) = load_chunk.select(); + load_offsets += INPUT_NCHW_PLANE_SIZE; + } +#endif + return data_out; +} + +template +_GENX_ inline vector load_1x1_input(SurfaceIndex surface [[type("buffer_t")]], uint byte_offset) +{ +#if(INPUT_LAYOUT == NHWC) + const uint32_t LOAD_W_WIDTH = DPAS_INPUT_CHANNELS; +#else + const uint32_t LOAD_W_WIDTH = LOAD_W * STRIDE_W; +#endif + const uint32_t LOAD_W_BYTES_WIDTH = LOAD_W_WIDTH * sizeof(DT_IN); + const uint32_t LOAD_W_DWORDS = LOAD_W_BYTES_WIDTH / sizeof(uint32_t); + vector data_out; + vector load_chunk; +#if(INPUT_LAYOUT == NHWC) + #pragma unroll + for(int i = 0; i < LOAD_W; i++) + { + load_chunk 
= cm_load(surface, byte_offset); + data_out.select(i * DPAS_INPUT_CHANNELS) = load_chunk.format().select(); + byte_offset += INPUT_NHWC_PLANE_SIZE; + } +#else + #pragma unroll + for(int i = 0; i < DPAS_INPUT_CHANNELS; i++) + { + load_chunk = cm_load(surface, byte_offset); + data_out.select(i) = load_chunk.format().select(); + byte_offset += INPUT_NCHW_PLANE_SIZE; + } +#endif + return data_out; +} + +_GENX_ inline vector load_filter_nchw_data(SurfaceIndex surface [[type("buffer_t")]], uint32_t byte_offset) +{ +#if WEIGHTS_IN_OPTIMAL_FORMAT + vector data_out; + //vector_ref data_load_view = data_out.select<128,1>(0).format(); + data_out.select<128,1>(0).format() = cm_load(surface, byte_offset); + #if EXEC_SIZE == 16 + data_out.select<128,1>(128).format() = cm_load(surface, byte_offset + 256); + #endif + return data_out; +#else + #error Kernel only supports reordered weight layouts. +#endif +} + +_GENX_ inline vector load_bias(SurfaceIndex surface [[type("buffer_t")]], uint32_t byte_offset) +{ + vector data_out; +#if BLOCK_OC == 40 + data_out.select<32 * OUTPUT_CHANNEL_MULTIPLIER,1>(OUTPUT_CHANNEL_MULTIPLIER * 0 ).format() = cm_load(surface, byte_offset); + data_out.select<8 * OUTPUT_CHANNEL_MULTIPLIER,1>(OUTPUT_CHANNEL_MULTIPLIER * 32).format() = cm_load(surface, byte_offset + 64 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC == 80 + data_out.select<64 * OUTPUT_CHANNEL_MULTIPLIER,1>(0 * OUTPUT_CHANNEL_MULTIPLIER).format() = cm_load(surface, byte_offset); + data_out.select<16 * OUTPUT_CHANNEL_MULTIPLIER,1>(64 * OUTPUT_CHANNEL_MULTIPLIER).format() = cm_load(surface, byte_offset + 128 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC != 40 && BLOCK_OC != 80 + data_out.format() = cm_load(surface, byte_offset); +#endif + return data_out; +} + +template +_GENX_ inline void store_output(SurfaceIndex surface [[type("buffer_t")]], vector_ref grf_chunk, uint32_t byte_offset) +{ + uint32_t offsets = byte_offset; +#if(INPUT_LAYOUT == NHWC) + #pragma unroll + for(int i = 
0; i < STORE_W; i++) + { + vector grf_chunk_store = grf_chunk.select(i * DPAS_OUTPUT_CHANNELS); + cm_store(surface, offsets, grf_chunk_store.format()); + offsets += OUTPUT_NHWC_PLANE_SIZE; + } +#else + #pragma unroll + for(int i = 0; i < DPAS_OUTPUT_CHANNELS; i++) + { + vector grf_chunk_store = grf_chunk.select(i); + cm_store(surface, offsets, grf_chunk_store.format()); + offsets += OUTPUT_NCHW_PLANE_SIZE; + } +#endif +} + +extern "C" _GENX_MAIN_ void conv_nchw_dpas_fp16( + SurfaceIndex surface_input [[type("buffer_t")]], + SurfaceIndex surface_weights [[type("buffer_t")]], + SurfaceIndex surface_output [[type("buffer_t")]] +#if USE_BIAS + ,SurfaceIndex surface_bias [[type("buffer_t")]] +#endif +) +{ + const uint32_t thg_0 = (cm_group_id(0) * cm_local_size(0) + cm_local_id(0)); + const uint w_chunk_id = thg_0 / SLICE_IC; + const uint slice_ic_id = thg_0 % SLICE_IC; + const uint h_chunk_id = cm_group_id(1) * cm_local_size(1) + cm_local_id(1); + const uint thread_id_2 = (cm_group_id(2) * cm_local_size(2) + cm_local_id(2)); + + const uint THREADS_FOR_OC = (OUTPUT_CHANNELS / BLOCK_OC) / OUTPUT_CHANNEL_MULTIPLIER; + const uint batch_id = (thread_id_2 / THREADS_FOR_OC); + const uint oc_chunk_id = (thread_id_2 % THREADS_FOR_OC) * (BLOCK_OC / DPAS_DEPTH); + +#if(INPUT_LAYOUT == NHWC) + const uint32_t input_row_offset_size = BLOCK_H * STRIDE_H * INPUT_WIDTH * INPUT_CHANNELS; + const uint32_t input_dpas_ic_offset_size = DPAS_INPUT_CHANNELS; + const uint32_t input_batch_offset = batch_id * INPUT_WIDTH * INPUT_HEIGHT * INPUT_CHANNELS; + const uint32_t input_w_chunk_offset = w_chunk_id * BLOCK_W * STRIDE_W * INPUT_CHANNELS; + const uint32_t input_h_chunk_offset = h_chunk_id * BLOCK_H * STRIDE_H * INPUT_WIDTH * INPUT_CHANNELS; +#else + const uint32_t input_row_offset_size = BLOCK_H * STRIDE_H * INPUT_WIDTH; + const uint32_t input_dpas_ic_offset_size = INPUT_HEIGHT * DPAS_INPUT_CHANNELS * INPUT_WIDTH; + const uint32_t input_batch_offset = batch_id * INPUT_WIDTH * INPUT_HEIGHT * 
INPUT_CHANNELS; + const uint32_t input_w_chunk_offset = w_chunk_id * BLOCK_W * STRIDE_W; + const uint32_t input_h_chunk_offset = h_chunk_id * input_row_offset_size; +#endif + const uint32_t input_slice_ic_chunk_offset = slice_ic_id * CONV_LOOP_COUNT * input_dpas_ic_offset_size; + uint32_t input_offset = (input_batch_offset + input_slice_ic_chunk_offset + input_h_chunk_offset + input_w_chunk_offset) * sizeof(DT_IN); + +#if WEIGHTS_IN_OPTIMAL_FORMAT + #if KERNEL_SIZE == 1 + const uint32_t weights_oc_chunk_offset = EXEC_SIZE * DPAS_INPUT_CHANNELS * sizeof(DT_WEIGHTS); + const uint32_t weights_ic_offset_size = OUTPUT_CHANNELS * DPAS_INPUT_CHANNELS * sizeof(DT_WEIGHTS); + #elif KERNEL_SIZE == 3 + const uint32_t weights_oc_chunk_offset = DPAS_OUTPUT_CHANNELS * INPUT_CHANNELS * sizeof(DT_WEIGHTS) * KERNEL_SIZE * KERNEL_SIZE; + const uint32_t weights_ic_offset_size = DPAS_INPUT_CHANNELS * EXEC_SIZE * sizeof(DT_WEIGHTS) * KERNEL_SIZE * KERNEL_SIZE; + #else + #error unsupported Kernel Size + #endif +#else + #error Kernel only supports reordered weight layouts. 
+#endif + + uint32_t weights_offset_0 = WEI_OFFSET + oc_chunk_id * weights_oc_chunk_offset + (slice_ic_id * CONV_LOOP_COUNT * weights_ic_offset_size); + uint32_t weights_offset_1 = weights_offset_0 + weights_oc_chunk_offset; + uint32_t weights_offset_2 = weights_offset_1 + weights_oc_chunk_offset; + uint32_t weights_offset_3 = weights_offset_2 + weights_oc_chunk_offset; + uint32_t weights_offset_4 = weights_offset_3 + weights_oc_chunk_offset; + uint32_t weights_offset_5 = weights_offset_4 + weights_oc_chunk_offset; + uint32_t weights_offset_6 = weights_offset_5 + weights_oc_chunk_offset; + uint32_t weights_offset_7 = weights_offset_6 + weights_oc_chunk_offset; + uint32_t weights_offset_8 = weights_offset_7 + weights_oc_chunk_offset; + uint32_t weights_offset_9 = weights_offset_8 + weights_oc_chunk_offset; + + const uint ACCU_REG_SIZE = BLOCK_W * DPAS_OUTPUT_CHANNELS; + vector accu_row_0_oc_0 = 0; + vector accu_row_0_oc_1 = 0; + vector accu_row_0_oc_2 = 0; + vector accu_row_0_oc_3 = 0; + vector accu_row_0_oc_4 = 0; + vector accu_row_0_oc_5 = 0; + vector accu_row_0_oc_6 = 0; + vector accu_row_0_oc_7 = 0; + vector accu_row_0_oc_8 = 0; + vector accu_row_0_oc_9 = 0; + + // todo debug performance with pragma unroll + //#pragma unroll + for(int i = 0; i < CONV_LOOP_COUNT; i++) + { + #if KERNEL_SIZE == 1 + vector input_row_0 = load_1x1_input(surface_input, input_offset); + + vector weights_0 = load_filter_nchw_data(surface_weights, weights_offset_0); + #if BLOCK_OC >= 16 + vector weights_1 = load_filter_nchw_data(surface_weights, weights_offset_1); + #endif + #if BLOCK_OC >= 32 + vector weights_2 = load_filter_nchw_data(surface_weights, weights_offset_2); + vector weights_3 = load_filter_nchw_data(surface_weights, weights_offset_3); + #endif + #if BLOCK_OC >= 40 + vector weights_4 = load_filter_nchw_data(surface_weights, weights_offset_4); + #endif + #if BLOCK_OC >= 64 + vector weights_5 = load_filter_nchw_data(surface_weights, weights_offset_5); + vector weights_6 = 
load_filter_nchw_data(surface_weights, weights_offset_6); + vector weights_7 = load_filter_nchw_data(surface_weights, weights_offset_7); + #endif + #if BLOCK_OC == 80 + vector weights_8 = load_filter_nchw_data(surface_weights, weights_offset_8); + vector weights_9 = load_filter_nchw_data(surface_weights, weights_offset_9); + #endif + + accu_row_0_oc_0 = cm_dpas(accu_row_0_oc_0, weights_0.format(), input_row_0.format()); + #if BLOCK_OC >= 16 + accu_row_0_oc_1 = cm_dpas(accu_row_0_oc_1, weights_1.format(), input_row_0.format()); + #endif + #if BLOCK_OC >= 32 + accu_row_0_oc_2 = cm_dpas(accu_row_0_oc_2, weights_2.format(), input_row_0.format()); + accu_row_0_oc_3 = cm_dpas(accu_row_0_oc_3, weights_3.format(), input_row_0.format()); + #endif + #if BLOCK_OC >= 40 + accu_row_0_oc_4 = cm_dpas(accu_row_0_oc_4, weights_4.format(), input_row_0.format()); + #endif + #if BLOCK_OC >= 64 + accu_row_0_oc_5 = cm_dpas(accu_row_0_oc_5, weights_5.format(), input_row_0.format()); + accu_row_0_oc_6 = cm_dpas(accu_row_0_oc_6, weights_6.format(), input_row_0.format()); + accu_row_0_oc_7 = cm_dpas(accu_row_0_oc_7, weights_7.format(), input_row_0.format()); + #endif + #if BLOCK_OC == 80 + accu_row_0_oc_8 = cm_dpas(accu_row_0_oc_8, weights_8.format(), input_row_0.format()); + accu_row_0_oc_9 = cm_dpas(accu_row_0_oc_9, weights_9.format(), input_row_0.format()); + #endif + #elif KERNEL_SIZE == 3 + #pragma unroll + for(int kh = -INPUT_PAD; kh < KERNEL_SIZE-INPUT_PAD; kh++) + { + int input_load_offset_kh = input_offset + (kh * input_row_offset_size * sizeof(DT_IN)); + if(h_chunk_id + kh < 0 || h_chunk_id + kh >= INPUT_HEIGHT) { continue; }; + vector input_row_0 = load_3x3_input(surface_input, input_load_offset_kh, w_chunk_id); + + #pragma unroll + for(int kw = 0; kw < KERNEL_SIZE; kw++) + { + uint32_t kernel_index = ((kh + INPUT_PAD) * KERNEL_SIZE + kw) * sizeof(DT_WEIGHTS); + vector weights_0 = load_filter_nchw_data(surface_weights, weights_offset_0 + (kernel_index * WEIGHTS_REG_SIZE)); + + 
#if BLOCK_OC >= 16 + vector weights_1 = load_filter_nchw_data(surface_weights, weights_offset_1 + (kernel_index * WEIGHTS_REG_SIZE)); + #endif + #if BLOCK_OC >= 32 + vector weights_2 = load_filter_nchw_data(surface_weights, weights_offset_2 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_3 = load_filter_nchw_data(surface_weights, weights_offset_3 + kernel_index * WEIGHTS_REG_SIZE); + #endif + #if BLOCK_OC >= 40 + vector weights_4 = load_filter_nchw_data(surface_weights, weights_offset_4 + kernel_index * WEIGHTS_REG_SIZE); + #endif + #if BLOCK_OC >= 64 + vector weights_5 = load_filter_nchw_data(surface_weights, weights_offset_5 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_6 = load_filter_nchw_data(surface_weights, weights_offset_6 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_7 = load_filter_nchw_data(surface_weights, weights_offset_7 + kernel_index * WEIGHTS_REG_SIZE); + #endif + #if BLOCK_OC == 80 + vector weights_8 = load_filter_nchw_data(surface_weights, weights_offset_8 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_9 = load_filter_nchw_data(surface_weights, weights_offset_9 + kernel_index * WEIGHTS_REG_SIZE); + #endif + + accu_row_0_oc_0 = cm_dpas(accu_row_0_oc_0, weights_0.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #if BLOCK_OC >= 16 + accu_row_0_oc_1 = cm_dpas(accu_row_0_oc_1, weights_1.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC >= 32 + accu_row_0_oc_2 = cm_dpas(accu_row_0_oc_2, weights_2.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_3 = cm_dpas(accu_row_0_oc_3, weights_3.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC >= 40 + accu_row_0_oc_4 = cm_dpas(accu_row_0_oc_4, weights_4.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC >= 64 + accu_row_0_oc_5 = cm_dpas(accu_row_0_oc_5, weights_5.format(), 
input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_6 = cm_dpas(accu_row_0_oc_6, weights_6.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_7 = cm_dpas(accu_row_0_oc_7, weights_7.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC == 80 + accu_row_0_oc_8 = cm_dpas(accu_row_0_oc_8, weights_8.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_9 = cm_dpas(accu_row_0_oc_9, weights_9.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + } + } + #else + #error unsupported Kernel Size + #endif + input_offset += (input_dpas_ic_offset_size * sizeof(DT_IN)); + weights_offset_0 += weights_ic_offset_size; + weights_offset_1 += weights_ic_offset_size; + weights_offset_2 += weights_ic_offset_size; + weights_offset_3 += weights_ic_offset_size; + weights_offset_4 += weights_ic_offset_size; + weights_offset_5 += weights_ic_offset_size; + weights_offset_6 += weights_ic_offset_size; + weights_offset_7 += weights_ic_offset_size; + weights_offset_8 += weights_ic_offset_size; + weights_offset_9 += weights_ic_offset_size; + } + + vector output_row_0_oc_0 = vector(accu_row_0_oc_0); +#if BLOCK_OC >= 16 + vector output_row_0_oc_1 = vector(accu_row_0_oc_1); +#endif + +#if BLOCK_OC >= 32 + vector output_row_0_oc_2 = vector(accu_row_0_oc_2); + vector output_row_0_oc_3 = vector(accu_row_0_oc_3); +#endif +#if BLOCK_OC >= 40 + vector output_row_0_oc_4 = vector(accu_row_0_oc_4); +#endif +#if BLOCK_OC >= 64 + vector output_row_0_oc_5 = vector(accu_row_0_oc_5); + vector output_row_0_oc_6 = vector(accu_row_0_oc_6); + vector output_row_0_oc_7 = vector(accu_row_0_oc_7); +#endif +#if BLOCK_OC == 80 + vector output_row_0_oc_8 = vector(accu_row_0_oc_8); + vector output_row_0_oc_9 = vector(accu_row_0_oc_9); +#endif + +#if USE_BIAS + vector bias = load_bias(surface_bias, oc_chunk_id * EXEC_SIZE * sizeof(DT_OUT)); + #pragma unroll 
+ for(int bw = 0; bw < BLOCK_W; bw++) + { + output_row_0_oc_0.select(bw * EXEC_SIZE) += bias.select(0 * OUTPUT_CHANNEL_MULTIPLIER); +#if BLOCK_OC >= 16 + output_row_0_oc_1.select(bw * EXEC_SIZE) += bias.select(8 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 32 + output_row_0_oc_2.select(bw * EXEC_SIZE) += bias.select(16 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_3.select(bw * EXEC_SIZE) += bias.select(24 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 40 + output_row_0_oc_4.select(bw * EXEC_SIZE) += bias.select(32 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 64 + output_row_0_oc_5.select(bw * EXEC_SIZE) += bias.select(40 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_6.select(bw * EXEC_SIZE) += bias.select(48 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_7.select(bw * EXEC_SIZE) += bias.select(56 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 80 + output_row_0_oc_8.select(bw * EXEC_SIZE) += bias.select(64 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_9.select(bw * EXEC_SIZE) += bias.select(72 * OUTPUT_CHANNEL_MULTIPLIER); +#endif + } +#endif + +#if USE_RELU + output_row_0_oc_0 = cm_max(output_row_0_oc_0, 0); + #if BLOCK_OC >= 16 + output_row_0_oc_1 = cm_max(output_row_0_oc_1, 0); + #endif + #if BLOCK_OC >= 32 + output_row_0_oc_2 = cm_max(output_row_0_oc_2, 0); + output_row_0_oc_3 = cm_max(output_row_0_oc_3, 0); + #endif + #if BLOCK_OC >= 40 + output_row_0_oc_4 = cm_max(output_row_0_oc_4, 0); + #endif + #if BLOCK_OC >= 64 + output_row_0_oc_5 = cm_max(output_row_0_oc_5, 0); + output_row_0_oc_6 = cm_max(output_row_0_oc_6, 0); + output_row_0_oc_7 = cm_max(output_row_0_oc_7, 0); + #endif + #if BLOCK_OC == 80 + output_row_0_oc_8 = cm_max(output_row_0_oc_8, 0); + output_row_0_oc_9 = cm_max(output_row_0_oc_9, 0); + #endif +#endif + +#if(INPUT_LAYOUT == NHWC) + const uint output_oc_chunk_offset = oc_chunk_id * DPAS_OUTPUT_CHANNELS; + const uint output_w_chunk_offset = w_chunk_id * BLOCK_W * OUTPUT_CHANNELS; + const uint 
output_h_chunk_offset = h_chunk_id * BLOCK_H * OUTPUT_WIDTH * OUTPUT_CHANNELS; +#else + const uint output_oc_chunk_offset = oc_chunk_id * DPAS_OUTPUT_CHANNELS * OUTPUT_HEIGHT * OUTPUT_WIDTH; + const uint output_w_chunk_offset = w_chunk_id * BLOCK_W; + const uint output_h_chunk_offset = h_chunk_id * BLOCK_H * OUTPUT_WIDTH; +#endif + const uint output_batch_offset = batch_id * OUTPUT_HEIGHT * OUTPUT_WIDTH * OUTPUT_CHANNELS; + uint32_t output_offset = (output_batch_offset + output_oc_chunk_offset + output_h_chunk_offset + output_w_chunk_offset) * sizeof(DT_OUT); + + store_output(surface_output, output_row_0_oc_0, output_offset); + +#if BLOCK_OC >= 16 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_1, output_offset); +#endif + +#if BLOCK_OC >= 32 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_2, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_3, output_offset); +#endif + +#if BLOCK_OC >= 40 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_4, output_offset); +#endif + +#if BLOCK_OC >= 64 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_5, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_6, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_7, output_offset); +#endif + +#if BLOCK_OC == 80 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_8, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_9, output_offset); +#endif +} \ No newline at end of file diff --git a/tools/cross_runner/kernels/reorder_weights.cpp b/tools/cross_runner/kernels/reorder_weights.cpp index b4a8021..858e40b 100644 --- a/tools/cross_runner/kernels/reorder_weights.cpp +++ 
b/tools/cross_runner/kernels/reorder_weights.cpp @@ -18,138 +18,159 @@ implied warranties, other than those that are expressly stated in the License. #include #include -#define LAYOUT_OIYX 1001 -#define LAYOUT_IO_i8_o8_i2 1002 -#define LAYOUT_OYXI_o8 1003 -#define LAYOUT_OYXI_o16 1004 - #if !CM_HAS_LSC #error [Error_device_no_lsc] Kernel designed to use lsc. Current device does not support lsc. #endif -#define DPAS_DEPTH 8 #if(CM_GENX >= 1280) #define DPAS_EXEC_SIZE 16 #else #define DPAS_EXEC_SIZE 8 #endif -#if OUTPUT_LAYOUT == LAYOUT_OYXI_o8 -#define SIMD_SIZE 8 -#elif OUTPUT_LAYOUT == LAYOUT_OYXI_o16 -#define SIMD_SIZE 16 -#endif - -#define WEIGHT_TYPE_SIZE sizeof(DT) - -static const uint32_t weights_init_linear_offsets[] = { - 0 * WEIGHT_TYPE_SIZE, - 1 * WEIGHT_TYPE_SIZE, - 2 * WEIGHT_TYPE_SIZE, - 3 * WEIGHT_TYPE_SIZE, - 4 * WEIGHT_TYPE_SIZE, - 5 * WEIGHT_TYPE_SIZE, - 6 * WEIGHT_TYPE_SIZE, - 7 * WEIGHT_TYPE_SIZE, - 8 * WEIGHT_TYPE_SIZE, - 9 * WEIGHT_TYPE_SIZE, - 10 * WEIGHT_TYPE_SIZE, - 11 * WEIGHT_TYPE_SIZE, - 12 * WEIGHT_TYPE_SIZE, - 13 * WEIGHT_TYPE_SIZE, - 14 * WEIGHT_TYPE_SIZE, - 15 * WEIGHT_TYPE_SIZE - }; +#define WEIGHT_TYPE_SIZE sizeof(INPUT_TYPE) +#define DPAS_DEPTH 8 +#define DPAS_LOAD_SIZE (DPAS_DEPTH * WEIGHT_TYPE_SIZE) +#define DPAS_STORE_SIZE (DPAS_EXEC_SIZE * WEIGHT_TYPE_SIZE) +#define DPAS_STORE_BLOCK (DPAS_EXEC_SIZE * DPAS_LOAD_SIZE) +#define MAX_STORE_SIZE (DPAS_EXEC_SIZE * DPAS_DEPTH) +#define MAX_STORE_BYTES 128 -extern "C" _GENX_MAIN_ void weights_reorder(SurfaceIndex surface_input [[type("buffer_t")]], SurfaceIndex surface_constants [[type("buffer_t")]], SurfaceIndex surface_output [[type("buffer_t")]]) +extern "C" _GENX_MAIN_ void weights_reorder(SurfaceIndex surface_input [[type("buffer_t")]], SurfaceIndex surface_output [[type("buffer_t")]]) { - const uint thread_id_0 = cm_group_id(0) * cm_local_size(0) + cm_local_id(0); - const uint thread_id_1 = cm_group_id(1) * cm_local_size(1) + cm_local_id(1); - const uint thread_id_2 = 
cm_group_id(2) * cm_local_size(2) + cm_local_id(2); + const uint32_t thread_id_0 = cm_group_id(0) * cm_local_size(0) + cm_local_id(0); + const uint32_t thread_id_1 = cm_group_id(1) * cm_local_size(1) + cm_local_id(1); + const uint32_t thread_id_2 = cm_group_id(2) * cm_local_size(2) + cm_local_id(2); - vector constants = cm_load(surface_constants, 0); - const uint32_t IC = constants[7]; - const uint32_t OC = constants[8]; - const uint32_t filter_layout_is_nhwc = constants[15]; - const uint32_t weights_ic_offset = (K_SIZE * K_SIZE * WEIGHT_TYPE_SIZE); + const uint32_t oc = thread_id_0 * DPAS_EXEC_SIZE; + +#if INPUT_LAYOUT == LAYOUT_OIYX && OUTPUT_LAYOUT == LAYOUT_IO_i8_o8_i2 && K_SIZE == 3 + const uint32_t ic_chunks_per_hw_thread = DPAS_DEPTH/(WEIGHT_TYPE_SIZE * WEIGHT_TYPE_SIZE); + const uint32_t ic_per_hw_thread = (DPAS_LOAD_SIZE * ic_chunks_per_hw_thread); + const uint32_t data_load_size = ic_per_hw_thread/WEIGHT_TYPE_SIZE; -#if INPUT_LAYOUT == LAYOUT_OIYX && OUTPUT_LAYOUT == LAYOUT_IO_i8_o8_i2 - const uint32_t ic_chunk_size = DPAS_DEPTH * (sizeof(uint32_t)/ sizeof(DT)); - const uint32_t ic_chunks_per_hw_thread = 8; - const uint32_t ic_per_hw_thread = (ic_chunk_size * ic_chunks_per_hw_thread); - const uint32_t ic_per_hw_thread_packed = (ic_per_hw_thread * sizeof(DT)) / sizeof(uint32_t); - const uint32_t int_block = (sizeof(uint32_t) / sizeof(DT)); - const uint32_t dpas_input_channels = DPAS_DEPTH * int_block; - - const uint32_t oc = thread_id_0 * DPAS_EXEC_SIZE; - const uint32_t ic = thread_id_1 * ic_per_hw_thread; - - const uint32_t chunks_count = DPAS_EXEC_SIZE; + const uint32_t ic = thread_id_1 * ic_per_hw_thread; + uint32_t input_offset = (oc * IC + ic) * K_SIZE * K_SIZE * sizeof(INPUT_TYPE); + uint32_t output_offset = WEI_OFFSET + ((oc * IC) + (ic * DPAS_EXEC_SIZE)) * K_SIZE * K_SIZE * sizeof(INPUT_TYPE); - // load - matrix data_input_typed; - uint32_t input_offset = (oc * IC + ic) * sizeof(DT); - #pragma unroll - for(int i = 0; i < chunks_count; i++) + 
matrix data_input_typed_0; + matrix data_input_typed_1; + matrix data_input_typed_2; + matrix data_input_typed_3; + matrix data_input_typed_4; + matrix data_input_typed_5; + matrix data_input_typed_6; + matrix data_input_typed_7; + matrix data_input_typed_8; + #pragma unroll + for(int i = 0; i < DPAS_EXEC_SIZE; i++) { - data_input_typed.row(i) = cm_load(surface_input, input_offset); - input_offset += IC * sizeof(DT); + uint32_t load_offset = input_offset + i * IC * K_SIZE * K_SIZE * sizeof(INPUT_TYPE); + + vector load_line; + load_line.select(0 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 0 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(1 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 1 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(2 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 2 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(3 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 3 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(4 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 4 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(5 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 5 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(6 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 6 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(7 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 7 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(8 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 8 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + + data_input_typed_0.row(i) = load_line.select(0); + data_input_typed_1.row(i) = load_line.select(1); + data_input_typed_2.row(i) = load_line.select(2); + data_input_typed_3.row(i) = load_line.select(3); + data_input_typed_4.row(i) = load_line.select(4); + data_input_typed_5.row(i) = 
load_line.select(5); + data_input_typed_6.row(i) = load_line.select(6); + data_input_typed_7.row(i) = load_line.select(7); + data_input_typed_8.row(i) = load_line.select(8); } - matrix_ref data_input = data_input_typed.format(); - - uint32_t output_offset = (oc * dpas_input_channels + ic * OC) * sizeof(DT); - vector data_out; + + vector data_out_0 = 0; + vector data_out_1 = 0; + vector data_out_2 = 0; + vector data_out_3 = 0; + vector data_out_4 = 0; + vector data_out_5 = 0; + vector data_out_6 = 0; + vector data_out_7 = 0; + vector data_out_8 = 0; #pragma unroll for(int i = 0; i < ic_chunks_per_hw_thread; i++) { - #pragma unroll - for(int j = 0; j < DPAS_EXEC_SIZE; j++) + #pragma unroll + for(int j = 0; j < DPAS_DEPTH; j++) { - data_out.select(j * dpas_input_channels) = data_input.select(0, int_block * j + i * dpas_input_channels); + data_out_0.select(j * DPAS_STORE_SIZE) = data_input_typed_0.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_1.select(j * DPAS_STORE_SIZE) = data_input_typed_1.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_2.select(j * DPAS_STORE_SIZE) = data_input_typed_2.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_3.select(j * DPAS_STORE_SIZE) = data_input_typed_3.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_4.select(j * DPAS_STORE_SIZE) = data_input_typed_4.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_5.select(j * DPAS_STORE_SIZE) = data_input_typed_5.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_6.select(j * DPAS_STORE_SIZE) = data_input_typed_6.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_7.select(j * DPAS_STORE_SIZE) = data_input_typed_7.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_8.select(j * DPAS_STORE_SIZE) = data_input_typed_8.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); } - const uint32_t packed_size = (DPAS_EXEC_SIZE * dpas_input_channels)/2; - cm_store(surface_output, 
output_offset, data_out.format()); - output_offset += OC * dpas_input_channels * sizeof(DT); + + cm_store(surface_output, output_offset + (0 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_0.select(0).format()); + cm_store(surface_output, output_offset + (1 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_1.select(0).format()); + cm_store(surface_output, output_offset + (2 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_2.select(0).format()); + cm_store(surface_output, output_offset + (3 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_3.select(0).format()); + cm_store(surface_output, output_offset + (4 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_4.select(0).format()); + cm_store(surface_output, output_offset + (5 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_5.select(0).format()); + cm_store(surface_output, output_offset + (6 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_6.select(0).format()); + cm_store(surface_output, output_offset + (7 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_7.select(0).format()); + cm_store(surface_output, output_offset + (8 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_8.select(0).format()); + + #if DPAS_EXEC_SIZE == 16 + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (0 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_0.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (1 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_1.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (2 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_2.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (3 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_3.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (4 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_4.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (5 * MAX_STORE_SIZE * 
sizeof(uint32_t)), data_out_5.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (6 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_6.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (7 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_7.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (8 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_8.select(MAX_STORE_BYTES).format()); + #endif + output_offset += DPAS_EXEC_SIZE * K_SIZE * K_SIZE * DPAS_LOAD_SIZE * sizeof(OUTPUT_TYPE); } -#elif INPUT_LAYOUT == LAYOUT_OIYX && (OUTPUT_LAYOUT == LAYOUT_OYXI_o8 || OUTPUT_LAYOUT == LAYOUT_OYXI_o16) +#elif INPUT_LAYOUT == LAYOUT_OIYX && OUTPUT_LAYOUT == LAYOUT_IO_i8_o8_i2 && K_SIZE == 1 + const uint32_t ic_chunks_per_hw_thread = DPAS_DEPTH/WEIGHT_TYPE_SIZE; + const uint32_t ic_per_hw_thread = (DPAS_LOAD_SIZE * ic_chunks_per_hw_thread); + const uint32_t data_load_size = ic_per_hw_thread/WEIGHT_TYPE_SIZE; + + const uint32_t ic = thread_id_1 * ic_per_hw_thread; + uint32_t input_offset = (oc * IC + ic) * sizeof(INPUT_TYPE); + uint32_t output_offset = WEI_OFFSET + (oc * DPAS_LOAD_SIZE + ic * OC) * sizeof(INPUT_TYPE); - const uint32_t oc = thread_id_0 * SIMD_SIZE; - const uint32_t ic = thread_id_1; - const uint32_t kh = thread_id_2; - const uint32_t weights_oc_offset = IC * weights_ic_offset; - const uint32_t chunks_count = SIMD_SIZE; - const uint32_t max_dt_size = sizeof(float)/WEIGHT_TYPE_SIZE; - const uint32_t LOAD_SIZE = ((K_SIZE + SIMD_SIZE) >> 4) << 4; - matrix data_input; - - vector offsets(weights_init_linear_offsets); + matrix data_input; + #pragma unroll + for(int i = 0; i < DPAS_EXEC_SIZE; i++) + { + data_input.row(i).format() = cm_load(surface_input, input_offset); + input_offset += IC * sizeof(INPUT_TYPE); + } - if (filter_layout_is_nhwc) // nhwc - { - offsets *= IC; - offsets += oc * weights_oc_offset + ic * WEIGHT_TYPE_SIZE + kh * IC * K_SIZE 
* WEIGHT_TYPE_SIZE; - } - else // nchw - { - offsets += oc * weights_oc_offset + ic * weights_ic_offset + kh * K_SIZE * WEIGHT_TYPE_SIZE; - } + vector data_out = 0; + #pragma unroll + for(int i = 0; i < ic_chunks_per_hw_thread; i++) + { + #pragma unroll + for(int j = 0; j < DPAS_DEPTH; j++) + { + data_out.select(j * DPAS_STORE_SIZE) = data_input.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + } - #pragma unroll - for(int i = 0; i < chunks_count; i++) - { - vector data_load = cm_load(surface_input, offsets); - data_input.select<1, 1, K_SIZE, 1>(i, 0) = data_load.select(); - offsets += K_SIZE * K_SIZE * IC * WEIGHT_TYPE_SIZE; - } - - uint32_t ouput_offset = (oc * K_SIZE * K_SIZE * IC + ic * SIMD_SIZE + kh * K_SIZE * IC * SIMD_SIZE) * WEIGHT_TYPE_SIZE; - #pragma unroll - for(int kw = 0; kw < K_SIZE; kw++) - { - vector data_out = data_input.select(0, kw); - cm_store(surface_output, ouput_offset, data_out.format()); - ouput_offset += IC * SIMD_SIZE * WEIGHT_TYPE_SIZE; - } + cm_store(surface_output, output_offset, data_out.select(0).format()); + #if DPAS_EXEC_SIZE == 16 + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK, data_out.select(MAX_STORE_BYTES).format()); + #endif + output_offset += OC * DPAS_LOAD_SIZE * sizeof(OUTPUT_TYPE); + } #else #error Not supported layouts. #endif