From feef0a013907189668c1874543a4d8d8508e4541 Mon Sep 17 00:00:00 2001 From: Prithviraj-R Date: Mon, 15 Jul 2024 02:43:16 -0700 Subject: [PATCH] [CM Conv] Enable SIMD16 Support for DPAS Convolutions --- tools/common_lib/src/conv.h | 165 +++--- tools/common_lib/src/layers_utils.h | 1 + .../kernels/conv_nchw_dpas_fp16.cpp | 533 ++++++++++++++++++ .../cross_runner/kernels/reorder_weights.cpp | 233 ++++---- 4 files changed, 753 insertions(+), 179 deletions(-) create mode 100644 tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp diff --git a/tools/common_lib/src/conv.h b/tools/common_lib/src/conv.h index fed0457..4800b8c 100644 --- a/tools/common_lib/src/conv.h +++ b/tools/common_lib/src/conv.h @@ -342,7 +342,7 @@ class ConvolutionBaseDispatcher : public NodeDispatcher bool managed_weights = false; bool algo_winograd = false; bool transposed = false; - + bool use_constant_buffer = false; bool dump_weights = false; bool use_dnnl_for_reference_calculations = false; @@ -362,11 +362,11 @@ class ConvolutionBaseDispatcher : public NodeDispatcher opts->add_flag("--no_bias", params.no_bias); opts->add_flag("--allow_fp16_computations", params.allow_fp16_computations); opts->add_flag("--managed_weights", params.managed_weights); - opts->add_option("--activation", params.activation); + opts->add_option("--activation", params.activation.type); opts->add_flag("--algo_winograd", params.algo_winograd); opts->add_flag("--transposed", params.transposed); opts->add_flag("--dnnl_reference", params.use_dnnl_for_reference_calculations)->default_val(false); - + opts->add_flag("--use_constant_buffer", params.use_constant_buffer); opts->add_flag("--dump_weights", params.dump_weights); } }; @@ -391,9 +391,12 @@ class ConvolutionBaseDispatcher : public NodeDispatcher assert(params_.groups >= 1); const auto output_shape = get_output_shape(); - prepare_constant_data(); + if (use_constant()) + { + prepare_constant_data(); + } - if (!params_.no_bias) + if (use_bias()) { bias_data_ = 
std::vector(output_shape.c * get_data_type_bytes_width(params_.dt)); } @@ -449,8 +452,11 @@ class ConvolutionBaseDispatcher : public NodeDispatcher bias_buffer_ = create_buffer(d3d12_device, tensor_bias_bytes_width, D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); } - constant_buffer_ = create_buffer(d3d12_device, tensor_constant_bytes_width, - D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + if (use_constant()) + { + constant_buffer_ = create_buffer(d3d12_device, tensor_constant_bytes_width, + D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + } output_buffer_ = create_buffer(d3d12_device, tensor_out_bytes_width, D3D12_HEAP_TYPE_DEFAULT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); @@ -469,7 +475,10 @@ class ConvolutionBaseDispatcher : public NodeDispatcher std::memcpy(upload_mapped_ptr + memcopy_offset, bias_data_.data(), tensor_bias_bytes_width); memcopy_offset += tensor_bias_bytes_width; } - std::memcpy(upload_mapped_ptr + memcopy_offset, constant_data_.data(), tensor_constant_bytes_width); + if (use_constant()) + { + std::memcpy(upload_mapped_ptr + memcopy_offset, constant_data_.data(), tensor_constant_bytes_width); + } // unmap memory upload_buffer_->Unmap(0, nullptr); @@ -483,7 +492,10 @@ class ConvolutionBaseDispatcher : public NodeDispatcher cmd_list->CopyBufferRegion(bias_buffer_.Get(), 0, upload_buffer_.Get(), memcopy_offset, tensor_bias_bytes_width); memcopy_offset += tensor_bias_bytes_width; } - cmd_list->CopyBufferRegion(constant_buffer_.Get(), 0, upload_buffer_.Get(), memcopy_offset, tensor_constant_bytes_width); + if (use_constant()) + { + cmd_list->CopyBufferRegion(constant_buffer_.Get(), 0, upload_buffer_.Get(), memcopy_offset, tensor_constant_bytes_width); + } std::vector barriers; 
barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(input_buffer_.Get(), @@ -495,8 +507,11 @@ class ConvolutionBaseDispatcher : public NodeDispatcher barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(bias_buffer_.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); } - barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(constant_buffer_.Get(), - D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + if (use_constant()) + { + barriers.push_back(CD3DX12_RESOURCE_BARRIER::Transition(constant_buffer_.Get(), + D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + } cmd_list->ResourceBarrier(static_cast(barriers.size()), barriers.data()); } @@ -602,7 +617,10 @@ class ConvolutionBaseDispatcher : public NodeDispatcher { return !params_.no_bias; } - + inline bool use_constant() const + { + return params_.use_constant_buffer; + } std::vector get_dnnl_result(std::size_t reference_dispatch_iterations) const { const auto output_shape = get_output_shape(); @@ -1043,7 +1061,7 @@ class ConvolutionUmdD3d12Dispatcher : public ConvolutionBaseDispatcher input_memory_desc_ = to_dnnl_mem_desc(params_.input_shape, params_.input_layout, params_.dt); output_memory_desc_ = to_dnnl_mem_desc(get_output_shape(), params_.output_layout, params_.dt); - if (!params_.no_bias) + if (use_bias()) { bias_memory_desc_.emplace(to_dnnl_mem_desc(TensorShape{ get_output_shape().c, 0, 0, 0}, DataLayout::eO, params_.dt)); } @@ -1253,12 +1271,12 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher wr_params.k_size = params_.filter_shape.w; wr_params.input_layout = DataLayout::eOIYX; - /*if (params_.dt == DataType::eFp16 && params_.filter_shape.w == 1 && params_.filter_shape.h == 1) + if (params_.dt == DataType::eFp16) { wr_params.output_layout = DataLayout::eIO_i8_o8_i2; } else if (params_.dt == DataType::eFp16 && params_.filter_shape.w != 1 && params_.filter_shape.h != 1) - {*/ + { if (cm_params_.block_oc == 8) { 
wr_params.output_layout = DataLayout::eOYXI_o8; @@ -1267,7 +1285,7 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { wr_params.output_layout = DataLayout::eOYXI_o16; } - //} + } weights_reorder_.emplace(WeightsReorder(std::move(wr_params), filter_buffer_, constant_buffer_, intc_ext, d3d12_device, cmd_list)); } @@ -1276,11 +1294,14 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { // input, filter std::vector desc_list = { DescType::eSrv, DescType::eSrv }; - if (!params_.no_bias) + if (use_bias()) + { + desc_list.push_back(DescType::eSrv); + } + if (constant_buffer_) { desc_list.push_back(DescType::eSrv); } - desc_list.push_back(DescType::eSrv); // output desc_list.push_back(DescType::eUav); root_signature_ = create_root_signature(d3d12_device_, desc_list); @@ -1307,31 +1328,36 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher build_options += pre_jit + name + between_name_and_value + value_str + post_jit; }; - add_define("DT", static_cast(params_.dt)); - //add_define("INPUT_WIDTH", params_.input_shape.w); - //add_define("INPUT_HEIGHT", params_.input_shape.h); - //add_define("INPUT_CHANNELS", params_.input_shape.c); - - //add_define("OUTPUT_WIDTH", output_shape_.w); - //add_define("OUTPUT_HEIGHT", output_shape_.h); - //("OUTPUT_CHANNELS", output_shape_.c); - - //add_define("BATCH", params_.input_shape.n); - //add_define("INPUT_PAD", params_.in_pad); - //add_define("OUTPUT_PAD", params_.out_pad); + if (params_.allow_fp16_computations) + { + add_define("DT_ACCU", "half"); + } + else + { + add_define("DT_ACCU", "float"); + } + add_define("INPUT_WIDTH", params_.input_shape.w); + add_define("INPUT_HEIGHT", params_.input_shape.h); + add_define("INPUT_CHANNELS", params_.input_shape.c); + add_define("OUTPUT_WIDTH", output_shape_.w); + add_define("OUTPUT_HEIGHT", output_shape_.h); + add_define("OUTPUT_CHANNELS", output_shape_.c); + add_define("BATCH", params_.input_shape.n); + add_define("INPUT_PAD", params_.in_pad); 
+ add_define("OUTPUT_PAD", params_.out_pad); add_define("USE_BIAS", !params_.no_bias); add_define("KERNEL_SIZE", params_.filter_shape.h); add_define("STRIDE_W", params_.stride.w); - //add_define("STRIDE_H", params_.stride.h); - - //add_define("SLICE_IC", cm_params_.slice_ic); - //add_define("BLOCK_W", cm_params_.block_w); - //add_define("BLOCK_H", cm_params_.block_h); - //add_define("BLOCK_OC", cm_params_.block_oc); - //add_define("BLOCK_BATCH", cm_params_.block_batch); - - //add_define("WEIGHTS_IN_OPTIMAL_FORMAT", cm_params.reorder_weights); - + add_define("STRIDE_H", params_.stride.h); + add_define("SLICE_IC", cm_params_.slice_ic); + add_define("BLOCK_W", cm_params_.block_w); + add_define("BLOCK_H", cm_params_.block_h); + add_define("BLOCK_OC", cm_params_.block_oc); + add_define("BLOCK_BATCH", cm_params_.block_batch); + add_define("WEIGHTS_IN_OPTIMAL_FORMAT", cm_params.reorder_weights); + add_define("INPUT_LAYOUT", (params_.input_layout == DataLayout::eNCHW) ? 0 : 1); + add_define("USE_RELU", (params_.activation.type == ActivationType::eRelu) ? 1 : 0); + add_define("WEI_OFFSET", 0); // Kernel uses this compile time flag as base offset to weights surface in the actual driver mode, So forcing it to Zero in cross-runner // kernel compilation const auto dump_asm_str = cm_params_.dump_asm ? " -mdump_asm" : ""; const auto large_grf_str = cm_params_.large_grf ? 
" -Qxcm_doubleGRF" : ""; @@ -1348,16 +1374,8 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher auto kernel_source_content = [](const auto kernel_size) { - std::string path = ""; - if (false/*kernel_size == 1*/) - { - path = "conv_1x1_nchw_fp16.cpp"; - } - else - { - path = "conv_nchw_fp16.cpp"; - } - + std::string path = "conv_nchw_dpas_fp16.cpp"; + std::fstream file(path); if (!file.is_open()) { @@ -1380,7 +1398,7 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { // input, weights, output std::uint32_t descriptor_count = 4; - if (!params_.no_bias) + if (use_bias()) { descriptor_count++; } @@ -1416,7 +1434,10 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { resources_list.push_back({ DescType::eSrv, bias_buffer_.Get() }); } - resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + if (constant_buffer_) + { + resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + } const auto tensor_out_bytes_width = output_buffer_->GetDesc().Width; resources_list.push_back({ DescType::eUav, output_buffer_.Get() }); @@ -1452,7 +1473,8 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher const uint32_t out_ch_size = static_cast(std::ceil(params_.filter_shape.n / (double)(cm_params_.block_oc))); const auto gws_x = cm_params_.slice_ic * (round_up_next_multiple(output_shape_.w, cm_params_.block_w) / cm_params_.block_w); const auto gws_y = round_up_next_multiple(output_shape_.h, cm_params_.block_h) / cm_params_.block_h; - const auto gws_z = (params_.input_shape.n / cm_params_.block_batch) * out_ch_size; + const auto execsize = 2; // BMG = 2, DG2 = 1 + const auto gws_z = ((params_.input_shape.n / cm_params_.block_batch) * out_ch_size) / execsize; assert(gws_x % cm_params_.lws[0] == 0); assert(gws_y % cm_params_.lws[1] == 0); @@ -1503,12 +1525,13 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher std::uint32_t gws_z = 0; if (output_layout == DataLayout::eIO_i8_o8_i2) 
{ - const std::uint32_t ic_chunks_per_hw_thread = 8; - const std::uint32_t exec_size = 8; + const std::uint32_t exec_size = 16; // BMG = 16, DG2 = 8 const std::uint32_t dpas_depth = 8; + const std::uint32_t ic_chunks_per_hw_thread = 2; const std::uint32_t out_dt_size = get_data_type_bytes_width(output_dt); + const std::uint32_t ic_multipler = (ic_chunks_per_hw_thread * dpas_depth * out_dt_size); gws_x = oc / exec_size; - gws_y = ic / (ic_chunks_per_hw_thread * dpas_depth * out_dt_size); + gws_y = (ic % ic_multipler == 0) ? ic / ic_multipler : (ic / ic_multipler) + ic % ic_multipler; gws_z = 1; } else if (output_layout == DataLayout::eOYXI_o8) @@ -1572,23 +1595,17 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher build_options += pre_jit + name + between_name_and_value + value_str + post_jit; }; - if (params_.input_dt == DataType::eFp16 && params_.output_dt == DataType::eFp16) - { - add_define("DT", "half"); - } - else - { - add_define("DT", "float"); - } - //add_define("WEI_OFFSET", 0); - //add_define("IC", params_.ic); - //add_define("OC", params_.oc); + add_define("INPUT_TYPE", "half"); + add_define("OUTPUT_TYPE", "half"); + add_define("WEI_OFFSET", 0); + add_define("IC", params_.ic); + add_define("OC", params_.oc); add_define("K_SIZE", params_.k_size); - /*for (std::int32_t i = static_cast(DataLayout::eWeightsLayoutStart) + 1; i < static_cast(DataLayout::eCount); i++) + for (std::int32_t i = static_cast(DataLayout::eWeightsLayoutStart) + 1; i < static_cast(DataLayout::eCount); i++) { add_define("LAYOUT_" + data_layout_name(static_cast(i)), i); - }*/ + } add_define("INPUT_LAYOUT", static_cast(params_.input_layout)); add_define("OUTPUT_LAYOUT", static_cast(params_.output_layout)); @@ -1607,11 +1624,11 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher // kernel compilation const auto dump_asm_str = " -mdump_asm"; const auto print_reg_str = " -mCM_printregusage"; - + const auto large_grf_str = " -Qxcm_doubleGRF"; const auto lws_x 
= " -DLWS_SIZE_X=" + std::to_string(params_.lws[0]); const auto lws_y = " -DLWS_SIZE_Y=" + std::to_string(params_.lws[1]); const auto lws_z = " -DLWS_SIZE_Z=" + std::to_string(params_.lws[2]); - const auto build_options_final = " -I \" \" " + build_options + dump_asm_str + print_reg_str + lws_x + lws_y + lws_z; + const auto build_options_final = " -I \" \" " + build_options + dump_asm_str + large_grf_str + print_reg_str + lws_x + lws_y + lws_z; CD3DX12_SHADER_BYTECODE byte_code; byte_code.pShaderBytecode = kernel_source_content.data(); @@ -1655,7 +1672,6 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher { assert(input_buffer_); assert(output_buffer_); - assert(constant_buffer_); const auto desc_heap_incrs_size = d3d12_device_->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); // i.e. add weights reorder @@ -1666,7 +1682,10 @@ class ConvolutionCmDispatcher : public ConvolutionBaseDispatcher std::vector> resources_list; resources_list.reserve(get_total_descriptor_count()); resources_list.push_back({ DescType::eSrv, input_buffer_.Get() }); - resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + if (constant_buffer_) + { + resources_list.push_back({ DescType::eSrv, constant_buffer_.Get() }); + } resources_list.push_back({ DescType::eUav, output_buffer_.Get() }); gpu_handles_ = create_resource_views_and_handles(d3d12_device_, resources_list, base_cpu_handle, base_gpu_handle); diff --git a/tools/common_lib/src/layers_utils.h b/tools/common_lib/src/layers_utils.h index c3e909d..4c5c534 100644 --- a/tools/common_lib/src/layers_utils.h +++ b/tools/common_lib/src/layers_utils.h @@ -190,6 +190,7 @@ inline std::string data_layout_name(DataLayout l) case DataLayout::eNHWC: return "NHWC"; case DataLayout::eCHW: return "CHW"; case DataLayout::eW: return "W"; + case DataLayout::eO: return "O"; case DataLayout::eOIYX: return "OIYX"; case DataLayout::eIO_i8_o8_i2: return "IO_i8_o8_i2"; case DataLayout::eOYXI_o8: return 
"OYXI_o8"; diff --git a/tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp b/tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp new file mode 100644 index 0000000..170434c --- /dev/null +++ b/tools/cross_runner/kernels/conv_nchw_dpas_fp16.cpp @@ -0,0 +1,533 @@ +/*========================== begin_copyright_notice ============================ + +INTEL CONFIDENTIAL + +Copyright (C) 2023 Intel Corporation + +This software and the related documents are Intel copyrighted materials, +and your use of them is governed by the express license under which they were +provided to you ("License"). Unless the License provides otherwise, +you may not use, modify, copy, publish, distribute, disclose or transmit this +software or the related documents without Intel's prior written permission. + +This software and the related documents are provided as is, with no express or +implied warranties, other than those that are expressly stated in the License. + +============================= end_copyright_notice ===========================*/ + +#include +#include + +#if !CM_HAS_DPAS +#error [Error_device_no_dpas] Kernel designed to use dpas. Current device does not support dpas. +#endif + +#if !CM_HAS_LSC +#error [Error_device_no_lsc] Kernel designed to use lsc. Current device does not support lsc. 
+#endif + +#if BLOCK_W > 8 +#error [Error_kernel_config_unsupported_block_w] Kernel designed to work with block_w in range: <1; 8>; +#endif + +#if BLOCK_OC != 8 && BLOCK_OC != 16 && BLOCK_OC != 32 && BLOCK_OC != 40 && BLOCK_OC != 64 && BLOCK_OC != 80 +#error [Error_kernel_config_unsupported_block_oc] Kernel designed to work with block_oc equal to 8 or 16 or 32 or 40 or 64 or 80; +#endif + +#define DPAS_DEPTH 8 +#if(CM_GENX >= 1280) +#define EXEC_SIZE 16 +#else +#define EXEC_SIZE 8 +#endif +#define OUTPUT_CHANNEL_MULTIPLIER (EXEC_SIZE/DPAS_DEPTH) +#define BLOCK_H 1 +#define DT_OUT half +#define DT_IN half +#define DT_IN_SIZE 2 +#define DT_WEIGHTS half +// accumulator on DG2 has to be float for half dt inputs +#define DT_ACCU float + +#define DPAS_INPUT_CHANNELS (DPAS_DEPTH * sizeof(DT_IN)) +#define DPAS_OUTPUT_CHANNELS EXEC_SIZE +#define DPAS_RC BLOCK_W + +// currently fixed at 1; can be tuned for larger input channel sizes in any future case +#define SLICE_IC 1 + +#define CONV_LOOP_COUNT ((INPUT_CHANNELS/DPAS_INPUT_CHANNELS) / SLICE_IC) + +#define WEIGHTS_REG_SIZE (DPAS_INPUT_CHANNELS * DPAS_OUTPUT_CHANNELS) + +#define INPUT_NCHW_PLANE_SIZE (INPUT_WIDTH * INPUT_HEIGHT * sizeof(DT_IN)) +#define OUTPUT_NCHW_PLANE_SIZE (OUTPUT_WIDTH * OUTPUT_HEIGHT * sizeof(DT_OUT)) + +#define INPUT_NHWC_PLANE_SIZE (INPUT_CHANNELS * sizeof(DT_IN)) +#define OUTPUT_NHWC_PLANE_SIZE (OUTPUT_CHANNELS * sizeof(DT_OUT)) + +#define NCHW 0 +#define NHWC 1 + +#define LOAD_3x3_BLOCK_SIZE (BLOCK_W + 2) +#define LOAD_3x3_BLOCK_START 0 +#define LOAD_3x3_BLOCK_END 9 + +#if(INPUT_LAYOUT == NHWC) +#define OUTPUT_DPAS_OFFSET DPAS_OUTPUT_CHANNELS * sizeof(DT_OUT) +#else +#define OUTPUT_DPAS_OFFSET (DPAS_OUTPUT_CHANNELS * OUTPUT_HEIGHT * OUTPUT_WIDTH) * sizeof(DT_OUT) +#endif + +static const uint32_t init_linear_offsets_16[] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; + +template +_GENX_ inline vector load_3x3_input(SurfaceIndex surface [[type("buffer_t")]], int
input_offset, int w_chunk_id) +{ +#if(INPUT_LAYOUT == NHWC) + const uint32_t LOAD_W_WIDTH = DPAS_INPUT_CHANNELS; +#else + const uint32_t LOAD_W_WIDTH = LOAD_W * STRIDE_W; +#endif + const uint32_t LOAD_W_BYTES_WIDTH = LOAD_W_WIDTH * sizeof(DT_IN); + const uint32_t LOAD_W_DWORDS = LOAD_W_BYTES_WIDTH / sizeof(uint32_t); + vector data_out; + vector load_offsets(init_linear_offsets_16); + const int current_kw = w_chunk_id * BLOCK_W * STRIDE_W; + const float left_pad = (current_kw == LOAD_3x3_BLOCK_START) ? 0.0f : 1.0f; + const float right_pad = ((current_kw + LOAD_3x3_BLOCK_END) > INPUT_WIDTH) ? 0.0f : 1.0f; +#if(INPUT_LAYOUT == NHWC) + load_offsets += input_offset - INPUT_CHANNELS * sizeof(DT_IN); + #pragma unroll + for(int i = 0; i < LOAD_W + 2; i++) + { + vector load_chunk = cm_load(surface, load_offsets); + if( i == LOAD_3x3_BLOCK_START ) + { + load_chunk *= left_pad; + } + if( i == LOAD_3x3_BLOCK_END ) + { + load_chunk *= right_pad; + } + data_out.select(i * DPAS_INPUT_CHANNELS) = load_chunk.select(); + load_offsets += INPUT_NHWC_PLANE_SIZE; + } +#else + load_offsets += input_offset - INPUT_PAD * sizeof(DT_IN); + #pragma unroll + for(int i = 0; i < DPAS_INPUT_CHANNELS; i++) + { + vector load_chunk = cm_load(surface, load_offsets); + load_chunk[LOAD_3x3_BLOCK_START] *= left_pad; + load_chunk[LOAD_3x3_BLOCK_END] *= right_pad; + data_out.select(i) = load_chunk.select(); + load_offsets += INPUT_NCHW_PLANE_SIZE; + } +#endif + return data_out; +} + +template +_GENX_ inline vector load_1x1_input(SurfaceIndex surface [[type("buffer_t")]], uint byte_offset) +{ +#if(INPUT_LAYOUT == NHWC) + const uint32_t LOAD_W_WIDTH = DPAS_INPUT_CHANNELS; +#else + const uint32_t LOAD_W_WIDTH = LOAD_W * STRIDE_W; +#endif + const uint32_t LOAD_W_BYTES_WIDTH = LOAD_W_WIDTH * sizeof(DT_IN); + const uint32_t LOAD_W_DWORDS = LOAD_W_BYTES_WIDTH / sizeof(uint32_t); + vector data_out; + vector load_chunk; +#if(INPUT_LAYOUT == NHWC) + #pragma unroll + for(int i = 0; i < LOAD_W; i++) + { + load_chunk 
= cm_load(surface, byte_offset); + data_out.select(i * DPAS_INPUT_CHANNELS) = load_chunk.format().select(); + byte_offset += INPUT_NHWC_PLANE_SIZE; + } +#else + #pragma unroll + for(int i = 0; i < DPAS_INPUT_CHANNELS; i++) + { + load_chunk = cm_load(surface, byte_offset); + data_out.select(i) = load_chunk.format().select(); + byte_offset += INPUT_NCHW_PLANE_SIZE; + } +#endif + return data_out; +} + +_GENX_ inline vector load_filter_nchw_data(SurfaceIndex surface [[type("buffer_t")]], uint32_t byte_offset) +{ +#if WEIGHTS_IN_OPTIMAL_FORMAT + vector data_out; + //vector_ref data_load_view = data_out.select<128,1>(0).format(); + data_out.select<128,1>(0).format() = cm_load(surface, byte_offset); + #if EXEC_SIZE == 16 + data_out.select<128,1>(128).format() = cm_load(surface, byte_offset + 256); + #endif + return data_out; +#else + #error Kernel only supports reordered weight layouts. +#endif +} + +_GENX_ inline vector load_bias(SurfaceIndex surface [[type("buffer_t")]], uint32_t byte_offset) +{ + vector data_out; +#if BLOCK_OC == 40 + data_out.select<32 * OUTPUT_CHANNEL_MULTIPLIER,1>(OUTPUT_CHANNEL_MULTIPLIER * 0 ).format() = cm_load(surface, byte_offset); + data_out.select<8 * OUTPUT_CHANNEL_MULTIPLIER,1>(OUTPUT_CHANNEL_MULTIPLIER * 32).format() = cm_load(surface, byte_offset + 64 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC == 80 + data_out.select<64 * OUTPUT_CHANNEL_MULTIPLIER,1>(0 * OUTPUT_CHANNEL_MULTIPLIER).format() = cm_load(surface, byte_offset); + data_out.select<16 * OUTPUT_CHANNEL_MULTIPLIER,1>(64 * OUTPUT_CHANNEL_MULTIPLIER).format() = cm_load(surface, byte_offset + 128 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC != 40 && BLOCK_OC != 80 + data_out.format() = cm_load(surface, byte_offset); +#endif + return data_out; +} + +template +_GENX_ inline void store_output(SurfaceIndex surface [[type("buffer_t")]], vector_ref grf_chunk, uint32_t byte_offset) +{ + uint32_t offsets = byte_offset; +#if(INPUT_LAYOUT == NHWC) + #pragma unroll + for(int i = 
0; i < STORE_W; i++) + { + vector grf_chunk_store = grf_chunk.select(i * DPAS_OUTPUT_CHANNELS); + cm_store(surface, offsets, grf_chunk_store.format()); + offsets += OUTPUT_NHWC_PLANE_SIZE; + } +#else + #pragma unroll + for(int i = 0; i < DPAS_OUTPUT_CHANNELS; i++) + { + vector grf_chunk_store = grf_chunk.select(i); + cm_store(surface, offsets, grf_chunk_store.format()); + offsets += OUTPUT_NCHW_PLANE_SIZE; + } +#endif +} + +extern "C" _GENX_MAIN_ void conv_nchw_dpas_fp16( + SurfaceIndex surface_input [[type("buffer_t")]], + SurfaceIndex surface_weights [[type("buffer_t")]], + SurfaceIndex surface_output [[type("buffer_t")]] +#if USE_BIAS + ,SurfaceIndex surface_bias [[type("buffer_t")]] +#endif +) +{ + const uint32_t thg_0 = (cm_group_id(0) * cm_local_size(0) + cm_local_id(0)); + const uint w_chunk_id = thg_0 / SLICE_IC; + const uint slice_ic_id = thg_0 % SLICE_IC; + const uint h_chunk_id = cm_group_id(1) * cm_local_size(1) + cm_local_id(1); + const uint thread_id_2 = (cm_group_id(2) * cm_local_size(2) + cm_local_id(2)); + + const uint THREADS_FOR_OC = (OUTPUT_CHANNELS / BLOCK_OC) / OUTPUT_CHANNEL_MULTIPLIER; + const uint batch_id = (thread_id_2 / THREADS_FOR_OC); + const uint oc_chunk_id = (thread_id_2 % THREADS_FOR_OC) * (BLOCK_OC / DPAS_DEPTH); + +#if(INPUT_LAYOUT == NHWC) + const uint32_t input_row_offset_size = BLOCK_H * STRIDE_H * INPUT_WIDTH * INPUT_CHANNELS; + const uint32_t input_dpas_ic_offset_size = DPAS_INPUT_CHANNELS; + const uint32_t input_batch_offset = batch_id * INPUT_WIDTH * INPUT_HEIGHT * INPUT_CHANNELS; + const uint32_t input_w_chunk_offset = w_chunk_id * BLOCK_W * STRIDE_W * INPUT_CHANNELS; + const uint32_t input_h_chunk_offset = h_chunk_id * BLOCK_H * STRIDE_H * INPUT_WIDTH * INPUT_CHANNELS; +#else + const uint32_t input_row_offset_size = BLOCK_H * STRIDE_H * INPUT_WIDTH; + const uint32_t input_dpas_ic_offset_size = INPUT_HEIGHT * DPAS_INPUT_CHANNELS * INPUT_WIDTH; + const uint32_t input_batch_offset = batch_id * INPUT_WIDTH * INPUT_HEIGHT * 
INPUT_CHANNELS; + const uint32_t input_w_chunk_offset = w_chunk_id * BLOCK_W * STRIDE_W; + const uint32_t input_h_chunk_offset = h_chunk_id * input_row_offset_size; +#endif + const uint32_t input_slice_ic_chunk_offset = slice_ic_id * CONV_LOOP_COUNT * input_dpas_ic_offset_size; + uint32_t input_offset = (input_batch_offset + input_slice_ic_chunk_offset + input_h_chunk_offset + input_w_chunk_offset) * sizeof(DT_IN); + +#if WEIGHTS_IN_OPTIMAL_FORMAT + #if KERNEL_SIZE == 1 + const uint32_t weights_oc_chunk_offset = EXEC_SIZE * DPAS_INPUT_CHANNELS * sizeof(DT_WEIGHTS); + const uint32_t weights_ic_offset_size = OUTPUT_CHANNELS * DPAS_INPUT_CHANNELS * sizeof(DT_WEIGHTS); + #elif KERNEL_SIZE == 3 + const uint32_t weights_oc_chunk_offset = DPAS_OUTPUT_CHANNELS * INPUT_CHANNELS * sizeof(DT_WEIGHTS) * KERNEL_SIZE * KERNEL_SIZE; + const uint32_t weights_ic_offset_size = DPAS_INPUT_CHANNELS * EXEC_SIZE * sizeof(DT_WEIGHTS) * KERNEL_SIZE * KERNEL_SIZE; + #else + #error unsupported Kernel Size + #endif +#else + #error Kernel only supports reordered weight layouts. 
+#endif + + uint32_t weights_offset_0 = WEI_OFFSET + oc_chunk_id * weights_oc_chunk_offset + (slice_ic_id * CONV_LOOP_COUNT * weights_ic_offset_size); + uint32_t weights_offset_1 = weights_offset_0 + weights_oc_chunk_offset; + uint32_t weights_offset_2 = weights_offset_1 + weights_oc_chunk_offset; + uint32_t weights_offset_3 = weights_offset_2 + weights_oc_chunk_offset; + uint32_t weights_offset_4 = weights_offset_3 + weights_oc_chunk_offset; + uint32_t weights_offset_5 = weights_offset_4 + weights_oc_chunk_offset; + uint32_t weights_offset_6 = weights_offset_5 + weights_oc_chunk_offset; + uint32_t weights_offset_7 = weights_offset_6 + weights_oc_chunk_offset; + uint32_t weights_offset_8 = weights_offset_7 + weights_oc_chunk_offset; + uint32_t weights_offset_9 = weights_offset_8 + weights_oc_chunk_offset; + + const uint ACCU_REG_SIZE = BLOCK_W * DPAS_OUTPUT_CHANNELS; + vector accu_row_0_oc_0 = 0; + vector accu_row_0_oc_1 = 0; + vector accu_row_0_oc_2 = 0; + vector accu_row_0_oc_3 = 0; + vector accu_row_0_oc_4 = 0; + vector accu_row_0_oc_5 = 0; + vector accu_row_0_oc_6 = 0; + vector accu_row_0_oc_7 = 0; + vector accu_row_0_oc_8 = 0; + vector accu_row_0_oc_9 = 0; + + // todo debug performance with pragma unroll + //#pragma unroll + for(int i = 0; i < CONV_LOOP_COUNT; i++) + { + #if KERNEL_SIZE == 1 + vector input_row_0 = load_1x1_input(surface_input, input_offset); + + vector weights_0 = load_filter_nchw_data(surface_weights, weights_offset_0); + #if BLOCK_OC >= 16 + vector weights_1 = load_filter_nchw_data(surface_weights, weights_offset_1); + #endif + #if BLOCK_OC >= 32 + vector weights_2 = load_filter_nchw_data(surface_weights, weights_offset_2); + vector weights_3 = load_filter_nchw_data(surface_weights, weights_offset_3); + #endif + #if BLOCK_OC >= 40 + vector weights_4 = load_filter_nchw_data(surface_weights, weights_offset_4); + #endif + #if BLOCK_OC >= 64 + vector weights_5 = load_filter_nchw_data(surface_weights, weights_offset_5); + vector weights_6 = 
load_filter_nchw_data(surface_weights, weights_offset_6); + vector weights_7 = load_filter_nchw_data(surface_weights, weights_offset_7); + #endif + #if BLOCK_OC == 80 + vector weights_8 = load_filter_nchw_data(surface_weights, weights_offset_8); + vector weights_9 = load_filter_nchw_data(surface_weights, weights_offset_9); + #endif + + accu_row_0_oc_0 = cm_dpas(accu_row_0_oc_0, weights_0.format(), input_row_0.format()); + #if BLOCK_OC >= 16 + accu_row_0_oc_1 = cm_dpas(accu_row_0_oc_1, weights_1.format(), input_row_0.format()); + #endif + #if BLOCK_OC >= 32 + accu_row_0_oc_2 = cm_dpas(accu_row_0_oc_2, weights_2.format(), input_row_0.format()); + accu_row_0_oc_3 = cm_dpas(accu_row_0_oc_3, weights_3.format(), input_row_0.format()); + #endif + #if BLOCK_OC >= 40 + accu_row_0_oc_4 = cm_dpas(accu_row_0_oc_4, weights_4.format(), input_row_0.format()); + #endif + #if BLOCK_OC >= 64 + accu_row_0_oc_5 = cm_dpas(accu_row_0_oc_5, weights_5.format(), input_row_0.format()); + accu_row_0_oc_6 = cm_dpas(accu_row_0_oc_6, weights_6.format(), input_row_0.format()); + accu_row_0_oc_7 = cm_dpas(accu_row_0_oc_7, weights_7.format(), input_row_0.format()); + #endif + #if BLOCK_OC == 80 + accu_row_0_oc_8 = cm_dpas(accu_row_0_oc_8, weights_8.format(), input_row_0.format()); + accu_row_0_oc_9 = cm_dpas(accu_row_0_oc_9, weights_9.format(), input_row_0.format()); + #endif + #elif KERNEL_SIZE == 3 + #pragma unroll + for(int kh = -INPUT_PAD; kh < KERNEL_SIZE-INPUT_PAD; kh++) + { + int input_load_offset_kh = input_offset + (kh * input_row_offset_size * sizeof(DT_IN)); + if(h_chunk_id + kh < 0 || h_chunk_id + kh >= INPUT_HEIGHT) { continue; }; + vector input_row_0 = load_3x3_input(surface_input, input_load_offset_kh, w_chunk_id); + + #pragma unroll + for(int kw = 0; kw < KERNEL_SIZE; kw++) + { + uint32_t kernel_index = ((kh + INPUT_PAD) * KERNEL_SIZE + kw) * sizeof(DT_WEIGHTS); + vector weights_0 = load_filter_nchw_data(surface_weights, weights_offset_0 + (kernel_index * WEIGHTS_REG_SIZE)); + + 
#if BLOCK_OC >= 16 + vector weights_1 = load_filter_nchw_data(surface_weights, weights_offset_1 + (kernel_index * WEIGHTS_REG_SIZE)); + #endif + #if BLOCK_OC >= 32 + vector weights_2 = load_filter_nchw_data(surface_weights, weights_offset_2 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_3 = load_filter_nchw_data(surface_weights, weights_offset_3 + kernel_index * WEIGHTS_REG_SIZE); + #endif + #if BLOCK_OC >= 40 + vector weights_4 = load_filter_nchw_data(surface_weights, weights_offset_4 + kernel_index * WEIGHTS_REG_SIZE); + #endif + #if BLOCK_OC >= 64 + vector weights_5 = load_filter_nchw_data(surface_weights, weights_offset_5 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_6 = load_filter_nchw_data(surface_weights, weights_offset_6 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_7 = load_filter_nchw_data(surface_weights, weights_offset_7 + kernel_index * WEIGHTS_REG_SIZE); + #endif + #if BLOCK_OC == 80 + vector weights_8 = load_filter_nchw_data(surface_weights, weights_offset_8 + kernel_index * WEIGHTS_REG_SIZE); + vector weights_9 = load_filter_nchw_data(surface_weights, weights_offset_9 + kernel_index * WEIGHTS_REG_SIZE); + #endif + + accu_row_0_oc_0 = cm_dpas(accu_row_0_oc_0, weights_0.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #if BLOCK_OC >= 16 + accu_row_0_oc_1 = cm_dpas(accu_row_0_oc_1, weights_1.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC >= 32 + accu_row_0_oc_2 = cm_dpas(accu_row_0_oc_2, weights_2.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_3 = cm_dpas(accu_row_0_oc_3, weights_3.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC >= 40 + accu_row_0_oc_4 = cm_dpas(accu_row_0_oc_4, weights_4.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC >= 64 + accu_row_0_oc_5 = cm_dpas(accu_row_0_oc_5, weights_5.format(), 
input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_6 = cm_dpas(accu_row_0_oc_6, weights_6.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_7 = cm_dpas(accu_row_0_oc_7, weights_7.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + #if BLOCK_OC == 80 + accu_row_0_oc_8 = cm_dpas(accu_row_0_oc_8, weights_8.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + accu_row_0_oc_9 = cm_dpas(accu_row_0_oc_9, weights_9.format(), input_row_0.select<128,1>(kw * DPAS_INPUT_CHANNELS).format()); + #endif + } + } + #else + #error unsupported Kernel Size + #endif + input_offset += (input_dpas_ic_offset_size * sizeof(DT_IN)); + weights_offset_0 += weights_ic_offset_size; + weights_offset_1 += weights_ic_offset_size; + weights_offset_2 += weights_ic_offset_size; + weights_offset_3 += weights_ic_offset_size; + weights_offset_4 += weights_ic_offset_size; + weights_offset_5 += weights_ic_offset_size; + weights_offset_6 += weights_ic_offset_size; + weights_offset_7 += weights_ic_offset_size; + weights_offset_8 += weights_ic_offset_size; + weights_offset_9 += weights_ic_offset_size; + } + + vector output_row_0_oc_0 = vector(accu_row_0_oc_0); +#if BLOCK_OC >= 16 + vector output_row_0_oc_1 = vector(accu_row_0_oc_1); +#endif + +#if BLOCK_OC >= 32 + vector output_row_0_oc_2 = vector(accu_row_0_oc_2); + vector output_row_0_oc_3 = vector(accu_row_0_oc_3); +#endif +#if BLOCK_OC >= 40 + vector output_row_0_oc_4 = vector(accu_row_0_oc_4); +#endif +#if BLOCK_OC >= 64 + vector output_row_0_oc_5 = vector(accu_row_0_oc_5); + vector output_row_0_oc_6 = vector(accu_row_0_oc_6); + vector output_row_0_oc_7 = vector(accu_row_0_oc_7); +#endif +#if BLOCK_OC == 80 + vector output_row_0_oc_8 = vector(accu_row_0_oc_8); + vector output_row_0_oc_9 = vector(accu_row_0_oc_9); +#endif + +#if USE_BIAS + vector bias = load_bias(surface_bias, oc_chunk_id * EXEC_SIZE * sizeof(DT_OUT)); + #pragma unroll 
+ for(int bw = 0; bw < BLOCK_W; bw++) + { + output_row_0_oc_0.select(bw * EXEC_SIZE) += bias.select(0 * OUTPUT_CHANNEL_MULTIPLIER); +#if BLOCK_OC >= 16 + output_row_0_oc_1.select(bw * EXEC_SIZE) += bias.select(8 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 32 + output_row_0_oc_2.select(bw * EXEC_SIZE) += bias.select(16 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_3.select(bw * EXEC_SIZE) += bias.select(24 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 40 + output_row_0_oc_4.select(bw * EXEC_SIZE) += bias.select(32 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 64 + output_row_0_oc_5.select(bw * EXEC_SIZE) += bias.select(40 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_6.select(bw * EXEC_SIZE) += bias.select(48 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_7.select(bw * EXEC_SIZE) += bias.select(56 * OUTPUT_CHANNEL_MULTIPLIER); +#endif +#if BLOCK_OC >= 80 + output_row_0_oc_8.select(bw * EXEC_SIZE) += bias.select(64 * OUTPUT_CHANNEL_MULTIPLIER); + output_row_0_oc_9.select(bw * EXEC_SIZE) += bias.select(72 * OUTPUT_CHANNEL_MULTIPLIER); +#endif + } +#endif + +#if USE_RELU + output_row_0_oc_0 = cm_max(output_row_0_oc_0, 0); + #if BLOCK_OC >= 16 + output_row_0_oc_1 = cm_max(output_row_0_oc_1, 0); + #endif + #if BLOCK_OC >= 32 + output_row_0_oc_2 = cm_max(output_row_0_oc_2, 0); + output_row_0_oc_3 = cm_max(output_row_0_oc_3, 0); + #endif + #if BLOCK_OC >= 40 + output_row_0_oc_4 = cm_max(output_row_0_oc_4, 0); + #endif + #if BLOCK_OC >= 64 + output_row_0_oc_5 = cm_max(output_row_0_oc_5, 0); + output_row_0_oc_6 = cm_max(output_row_0_oc_6, 0); + output_row_0_oc_7 = cm_max(output_row_0_oc_7, 0); + #endif + #if BLOCK_OC == 80 + output_row_0_oc_8 = cm_max(output_row_0_oc_8, 0); + output_row_0_oc_9 = cm_max(output_row_0_oc_9, 0); + #endif +#endif + +#if(INPUT_LAYOUT == NHWC) + const uint output_oc_chunk_offset = oc_chunk_id * DPAS_OUTPUT_CHANNELS; + const uint output_w_chunk_offset = w_chunk_id * BLOCK_W * OUTPUT_CHANNELS; + const uint 
output_h_chunk_offset = h_chunk_id * BLOCK_H * OUTPUT_WIDTH * OUTPUT_CHANNELS; +#else + const uint output_oc_chunk_offset = oc_chunk_id * DPAS_OUTPUT_CHANNELS * OUTPUT_HEIGHT * OUTPUT_WIDTH; + const uint output_w_chunk_offset = w_chunk_id * BLOCK_W; + const uint output_h_chunk_offset = h_chunk_id * BLOCK_H * OUTPUT_WIDTH; +#endif + const uint output_batch_offset = batch_id * OUTPUT_HEIGHT * OUTPUT_WIDTH * OUTPUT_CHANNELS; + uint32_t output_offset = (output_batch_offset + output_oc_chunk_offset + output_h_chunk_offset + output_w_chunk_offset) * sizeof(DT_OUT); + + store_output(surface_output, output_row_0_oc_0, output_offset); + +#if BLOCK_OC >= 16 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_1, output_offset); +#endif + +#if BLOCK_OC >= 32 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_2, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_3, output_offset); +#endif + +#if BLOCK_OC >= 40 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_4, output_offset); +#endif + +#if BLOCK_OC >= 64 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_5, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_6, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_7, output_offset); +#endif + +#if BLOCK_OC == 80 + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_8, output_offset); + + output_offset += OUTPUT_DPAS_OFFSET; + store_output(surface_output, output_row_0_oc_9, output_offset); +#endif +} \ No newline at end of file diff --git a/tools/cross_runner/kernels/reorder_weights.cpp b/tools/cross_runner/kernels/reorder_weights.cpp index b4a8021..858e40b 100644 --- a/tools/cross_runner/kernels/reorder_weights.cpp +++ 
b/tools/cross_runner/kernels/reorder_weights.cpp @@ -18,138 +18,159 @@ implied warranties, other than those that are expressly stated in the License. #include #include -#define LAYOUT_OIYX 1001 -#define LAYOUT_IO_i8_o8_i2 1002 -#define LAYOUT_OYXI_o8 1003 -#define LAYOUT_OYXI_o16 1004 - #if !CM_HAS_LSC #error [Error_device_no_lsc] Kernel designed to use lsc. Current device does not support lsc. #endif -#define DPAS_DEPTH 8 #if(CM_GENX >= 1280) #define DPAS_EXEC_SIZE 16 #else #define DPAS_EXEC_SIZE 8 #endif -#if OUTPUT_LAYOUT == LAYOUT_OYXI_o8 -#define SIMD_SIZE 8 -#elif OUTPUT_LAYOUT == LAYOUT_OYXI_o16 -#define SIMD_SIZE 16 -#endif - -#define WEIGHT_TYPE_SIZE sizeof(DT) - -static const uint32_t weights_init_linear_offsets[] = { - 0 * WEIGHT_TYPE_SIZE, - 1 * WEIGHT_TYPE_SIZE, - 2 * WEIGHT_TYPE_SIZE, - 3 * WEIGHT_TYPE_SIZE, - 4 * WEIGHT_TYPE_SIZE, - 5 * WEIGHT_TYPE_SIZE, - 6 * WEIGHT_TYPE_SIZE, - 7 * WEIGHT_TYPE_SIZE, - 8 * WEIGHT_TYPE_SIZE, - 9 * WEIGHT_TYPE_SIZE, - 10 * WEIGHT_TYPE_SIZE, - 11 * WEIGHT_TYPE_SIZE, - 12 * WEIGHT_TYPE_SIZE, - 13 * WEIGHT_TYPE_SIZE, - 14 * WEIGHT_TYPE_SIZE, - 15 * WEIGHT_TYPE_SIZE - }; +#define WEIGHT_TYPE_SIZE sizeof(INPUT_TYPE) +#define DPAS_DEPTH 8 +#define DPAS_LOAD_SIZE (DPAS_DEPTH * WEIGHT_TYPE_SIZE) +#define DPAS_STORE_SIZE (DPAS_EXEC_SIZE * WEIGHT_TYPE_SIZE) +#define DPAS_STORE_BLOCK (DPAS_EXEC_SIZE * DPAS_LOAD_SIZE) +#define MAX_STORE_SIZE (DPAS_EXEC_SIZE * DPAS_DEPTH) +#define MAX_STORE_BYTES 128 -extern "C" _GENX_MAIN_ void weights_reorder(SurfaceIndex surface_input [[type("buffer_t")]], SurfaceIndex surface_constants [[type("buffer_t")]], SurfaceIndex surface_output [[type("buffer_t")]]) +extern "C" _GENX_MAIN_ void weights_reorder(SurfaceIndex surface_input [[type("buffer_t")]], SurfaceIndex surface_output [[type("buffer_t")]]) { - const uint thread_id_0 = cm_group_id(0) * cm_local_size(0) + cm_local_id(0); - const uint thread_id_1 = cm_group_id(1) * cm_local_size(1) + cm_local_id(1); - const uint thread_id_2 = 
cm_group_id(2) * cm_local_size(2) + cm_local_id(2); + const uint32_t thread_id_0 = cm_group_id(0) * cm_local_size(0) + cm_local_id(0); + const uint32_t thread_id_1 = cm_group_id(1) * cm_local_size(1) + cm_local_id(1); + const uint32_t thread_id_2 = cm_group_id(2) * cm_local_size(2) + cm_local_id(2); - vector constants = cm_load(surface_constants, 0); - const uint32_t IC = constants[7]; - const uint32_t OC = constants[8]; - const uint32_t filter_layout_is_nhwc = constants[15]; - const uint32_t weights_ic_offset = (K_SIZE * K_SIZE * WEIGHT_TYPE_SIZE); + const uint32_t oc = thread_id_0 * DPAS_EXEC_SIZE; + +#if INPUT_LAYOUT == LAYOUT_OIYX && OUTPUT_LAYOUT == LAYOUT_IO_i8_o8_i2 && K_SIZE == 3 + const uint32_t ic_chunks_per_hw_thread = DPAS_DEPTH/(WEIGHT_TYPE_SIZE * WEIGHT_TYPE_SIZE); + const uint32_t ic_per_hw_thread = (DPAS_LOAD_SIZE * ic_chunks_per_hw_thread); + const uint32_t data_load_size = ic_per_hw_thread/WEIGHT_TYPE_SIZE; -#if INPUT_LAYOUT == LAYOUT_OIYX && OUTPUT_LAYOUT == LAYOUT_IO_i8_o8_i2 - const uint32_t ic_chunk_size = DPAS_DEPTH * (sizeof(uint32_t)/ sizeof(DT)); - const uint32_t ic_chunks_per_hw_thread = 8; - const uint32_t ic_per_hw_thread = (ic_chunk_size * ic_chunks_per_hw_thread); - const uint32_t ic_per_hw_thread_packed = (ic_per_hw_thread * sizeof(DT)) / sizeof(uint32_t); - const uint32_t int_block = (sizeof(uint32_t) / sizeof(DT)); - const uint32_t dpas_input_channels = DPAS_DEPTH * int_block; - - const uint32_t oc = thread_id_0 * DPAS_EXEC_SIZE; - const uint32_t ic = thread_id_1 * ic_per_hw_thread; - - const uint32_t chunks_count = DPAS_EXEC_SIZE; + const uint32_t ic = thread_id_1 * ic_per_hw_thread; + uint32_t input_offset = (oc * IC + ic) * K_SIZE * K_SIZE * sizeof(INPUT_TYPE); + uint32_t output_offset = WEI_OFFSET + ((oc * IC) + (ic * DPAS_EXEC_SIZE)) * K_SIZE * K_SIZE * sizeof(INPUT_TYPE); - // load - matrix data_input_typed; - uint32_t input_offset = (oc * IC + ic) * sizeof(DT); - #pragma unroll - for(int i = 0; i < chunks_count; i++) + 
matrix data_input_typed_0; + matrix data_input_typed_1; + matrix data_input_typed_2; + matrix data_input_typed_3; + matrix data_input_typed_4; + matrix data_input_typed_5; + matrix data_input_typed_6; + matrix data_input_typed_7; + matrix data_input_typed_8; + #pragma unroll + for(int i = 0; i < DPAS_EXEC_SIZE; i++) { - data_input_typed.row(i) = cm_load(surface_input, input_offset); - input_offset += IC * sizeof(DT); + uint32_t load_offset = input_offset + i * IC * K_SIZE * K_SIZE * sizeof(INPUT_TYPE); + + vector load_line; + load_line.select(0 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 0 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(1 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 1 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(2 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 2 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(3 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 3 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(4 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 4 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(5 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 5 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(6 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 6 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(7 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 7 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + load_line.select(8 * ic_per_hw_thread).format() = cm_load(surface_input, load_offset + 8 * ic_per_hw_thread * WEIGHT_TYPE_SIZE); + + data_input_typed_0.row(i) = load_line.select(0); + data_input_typed_1.row(i) = load_line.select(1); + data_input_typed_2.row(i) = load_line.select(2); + data_input_typed_3.row(i) = load_line.select(3); + data_input_typed_4.row(i) = load_line.select(4); + data_input_typed_5.row(i) = 
load_line.select(5); + data_input_typed_6.row(i) = load_line.select(6); + data_input_typed_7.row(i) = load_line.select(7); + data_input_typed_8.row(i) = load_line.select(8); } - matrix_ref data_input = data_input_typed.format(); - - uint32_t output_offset = (oc * dpas_input_channels + ic * OC) * sizeof(DT); - vector data_out; + + vector data_out_0 = 0; + vector data_out_1 = 0; + vector data_out_2 = 0; + vector data_out_3 = 0; + vector data_out_4 = 0; + vector data_out_5 = 0; + vector data_out_6 = 0; + vector data_out_7 = 0; + vector data_out_8 = 0; #pragma unroll for(int i = 0; i < ic_chunks_per_hw_thread; i++) { - #pragma unroll - for(int j = 0; j < DPAS_EXEC_SIZE; j++) + #pragma unroll + for(int j = 0; j < DPAS_DEPTH; j++) { - data_out.select(j * dpas_input_channels) = data_input.select(0, int_block * j + i * dpas_input_channels); + data_out_0.select(j * DPAS_STORE_SIZE) = data_input_typed_0.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_1.select(j * DPAS_STORE_SIZE) = data_input_typed_1.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_2.select(j * DPAS_STORE_SIZE) = data_input_typed_2.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_3.select(j * DPAS_STORE_SIZE) = data_input_typed_3.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_4.select(j * DPAS_STORE_SIZE) = data_input_typed_4.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_5.select(j * DPAS_STORE_SIZE) = data_input_typed_5.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_6.select(j * DPAS_STORE_SIZE) = data_input_typed_6.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_7.select(j * DPAS_STORE_SIZE) = data_input_typed_7.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + data_out_8.select(j * DPAS_STORE_SIZE) = data_input_typed_8.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); } - const uint32_t packed_size = (DPAS_EXEC_SIZE * dpas_input_channels)/2; - cm_store(surface_output, 
output_offset, data_out.format()); - output_offset += OC * dpas_input_channels * sizeof(DT); + + cm_store(surface_output, output_offset + (0 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_0.select(0).format()); + cm_store(surface_output, output_offset + (1 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_1.select(0).format()); + cm_store(surface_output, output_offset + (2 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_2.select(0).format()); + cm_store(surface_output, output_offset + (3 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_3.select(0).format()); + cm_store(surface_output, output_offset + (4 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_4.select(0).format()); + cm_store(surface_output, output_offset + (5 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_5.select(0).format()); + cm_store(surface_output, output_offset + (6 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_6.select(0).format()); + cm_store(surface_output, output_offset + (7 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_7.select(0).format()); + cm_store(surface_output, output_offset + (8 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_8.select(0).format()); + + #if DPAS_EXEC_SIZE == 16 + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (0 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_0.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (1 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_1.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (2 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_2.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (3 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_3.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (4 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_4.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (5 * MAX_STORE_SIZE * 
sizeof(uint32_t)), data_out_5.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (6 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_6.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (7 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_7.select(MAX_STORE_BYTES).format()); + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK + (8 * MAX_STORE_SIZE * sizeof(uint32_t)), data_out_8.select(MAX_STORE_BYTES).format()); + #endif + output_offset += DPAS_EXEC_SIZE * K_SIZE * K_SIZE * DPAS_LOAD_SIZE * sizeof(OUTPUT_TYPE); } -#elif INPUT_LAYOUT == LAYOUT_OIYX && (OUTPUT_LAYOUT == LAYOUT_OYXI_o8 || OUTPUT_LAYOUT == LAYOUT_OYXI_o16) +#elif INPUT_LAYOUT == LAYOUT_OIYX && OUTPUT_LAYOUT == LAYOUT_IO_i8_o8_i2 && K_SIZE == 1 + const uint32_t ic_chunks_per_hw_thread = DPAS_DEPTH/WEIGHT_TYPE_SIZE; + const uint32_t ic_per_hw_thread = (DPAS_LOAD_SIZE * ic_chunks_per_hw_thread); + const uint32_t data_load_size = ic_per_hw_thread/WEIGHT_TYPE_SIZE; + + const uint32_t ic = thread_id_1 * ic_per_hw_thread; + uint32_t input_offset = (oc * IC + ic) * sizeof(INPUT_TYPE); + uint32_t output_offset = WEI_OFFSET + (oc * DPAS_LOAD_SIZE + ic * OC) * sizeof(INPUT_TYPE); - const uint32_t oc = thread_id_0 * SIMD_SIZE; - const uint32_t ic = thread_id_1; - const uint32_t kh = thread_id_2; - const uint32_t weights_oc_offset = IC * weights_ic_offset; - const uint32_t chunks_count = SIMD_SIZE; - const uint32_t max_dt_size = sizeof(float)/WEIGHT_TYPE_SIZE; - const uint32_t LOAD_SIZE = ((K_SIZE + SIMD_SIZE) >> 4) << 4; - matrix data_input; - - vector offsets(weights_init_linear_offsets); + matrix data_input; + #pragma unroll + for(int i = 0; i < DPAS_EXEC_SIZE; i++) + { + data_input.row(i).format() = cm_load(surface_input, input_offset); + input_offset += IC * sizeof(INPUT_TYPE); + } - if (filter_layout_is_nhwc) // nhwc - { - offsets *= IC; - offsets += oc * weights_oc_offset + ic * WEIGHT_TYPE_SIZE + kh * IC * K_SIZE 
* WEIGHT_TYPE_SIZE; - } - else // nchw - { - offsets += oc * weights_oc_offset + ic * weights_ic_offset + kh * K_SIZE * WEIGHT_TYPE_SIZE; - } + vector data_out = 0; + #pragma unroll + for(int i = 0; i < ic_chunks_per_hw_thread; i++) + { + #pragma unroll + for(int j = 0; j < DPAS_DEPTH; j++) + { + data_out.select(j * DPAS_STORE_SIZE) = data_input.select(0, WEIGHT_TYPE_SIZE * j + i * DPAS_LOAD_SIZE); + } - #pragma unroll - for(int i = 0; i < chunks_count; i++) - { - vector data_load = cm_load(surface_input, offsets); - data_input.select<1, 1, K_SIZE, 1>(i, 0) = data_load.select(); - offsets += K_SIZE * K_SIZE * IC * WEIGHT_TYPE_SIZE; - } - - uint32_t ouput_offset = (oc * K_SIZE * K_SIZE * IC + ic * SIMD_SIZE + kh * K_SIZE * IC * SIMD_SIZE) * WEIGHT_TYPE_SIZE; - #pragma unroll - for(int kw = 0; kw < K_SIZE; kw++) - { - vector data_out = data_input.select(0, kw); - cm_store(surface_output, ouput_offset, data_out.format()); - ouput_offset += IC * SIMD_SIZE * WEIGHT_TYPE_SIZE; - } + cm_store(surface_output, output_offset, data_out.select(0).format()); + #if DPAS_EXEC_SIZE == 16 + cm_store(surface_output, output_offset + DPAS_STORE_BLOCK, data_out.select(MAX_STORE_BYTES).format()); + #endif + output_offset += OC * DPAS_LOAD_SIZE * sizeof(OUTPUT_TYPE); + } #else #error Not supported layouts. #endif