88
99#include < executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
1010
11+ #include < executorch/backends/vulkan/runtime/api/api.h>
12+ #include < executorch/backends/vulkan/runtime/graph/Logging.h>
13+
14+ #include < executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
1115#include < executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
1216#include < executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
1317#include < executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
1418
1519namespace vkcompute {
1620
21+ using api::utils::ivec3;
22+ using api::utils::uvec3;
23+
1724void add_copy_offset_node (
1825 ComputeGraph& graph,
1926 const ValueRef in,
20- const api::utils:: ivec3& range,
21- const api::utils:: ivec3& src_offset,
22- const api::utils:: ivec3& dst_offset,
27+ const ivec3& range,
28+ const ivec3& src_offset,
29+ const ivec3& dst_offset,
2330 const ValueRef out) {
2431 vTensorPtr t_in = graph.get_tensor (in);
2532 vTensorPtr t_out = graph.get_tensor (out);
2633
27- VK_CHECK_COND (check_memory_layout_is (*t_in, api::kChannelsPacked ));
28- VK_CHECK_COND (check_memory_layout_is (*t_out, api::kChannelsPacked ));
29-
3034 std::string kernel_name = " copy_offset" ;
3135 kernel_name.reserve (kShaderNameReserve );
3236 add_dtype_suffix (kernel_name, *t_out);
3337
34- api::utils:: uvec3 global_size = api::utils::make_uvec3 (range);
35- api::utils:: uvec3 local_size = adaptive_work_group_size (global_size);
38+ uvec3 global_size = api::utils::make_uvec3 (range);
39+ uvec3 local_size = adaptive_work_group_size (global_size);
3640
3741 const struct Block final {
38- api::utils:: ivec3 range;
42+ ivec3 range;
3943 int32_t unused0;
40- api::utils:: ivec3 src_offset;
44+ ivec3 src_offset;
4145 int32_t unused1;
42- api::utils:: ivec3 dst_offset;
46+ ivec3 dst_offset;
4347 int32_t unused2;
4448 } offset_params{
4549 range,
@@ -58,13 +62,166 @@ void add_copy_offset_node(
5862 global_size,
5963 local_size,
6064 // Inputs and Outputs
61- {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
65+ {
66+ {out, api::MemoryAccessType::WRITE},
67+ {in, api::MemoryAccessType::READ},
68+ },
6269 // Parameter buffers
63- {t_out->texture_limits_ubo (),
64- t_in->texture_limits_ubo (),
65- graph.create_params_buffer (offset_params)},
70+ {graph.create_params_buffer (offset_params)},
6671 // Specialization Constants
6772 {}));
6873}
6974
75+ void add_copy_channel_offset_node (
76+ ComputeGraph& graph,
77+ const ValueRef in,
78+ int32_t channel_range,
79+ int32_t src_channel_offset,
80+ int32_t dst_channel_offset,
81+ const ValueRef out) {
82+ vTensorPtr t_in = graph.get_tensor (in);
83+ vTensorPtr t_out = graph.get_tensor (out);
84+
85+ // Likely need to prepad these numbers.
86+ std::vector<int64_t > in_sizes = t_in->sizes ();
87+ std::vector<int64_t > out_sizes = t_out->sizes ();
88+
89+ VK_CHECK_COND (check_memory_layout_is (*t_in, api::kChannelsPacked ));
90+ VK_CHECK_COND (check_memory_layout_is (*t_out, api::kChannelsPacked ));
91+
92+ // NOTE: This function should be able to support 1d and 2d tensors when
93+ // range=1, src_offset=dst_offset=1.
94+ VK_CHECK_COND (t_in->dim () >= 3 , " Src dim should be at least 3" );
95+ VK_CHECK_COND (t_out->dim () >= 3 , " Dst dim should be at least 3" );
96+
97+ VK_CHECK_COND (
98+ dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
99+ " Source channel plus range should be less than or equal to input tensor's channel size" );
100+ VK_CHECK_COND (
101+ dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
102+ " Source channel and range should be less than or equal to input tensor's channel size" );
103+
104+ VK_CHECK_COND (channel_range >= 0 , " Channel range must be non-negative" );
105+ VK_CHECK_COND (
106+ src_channel_offset >= 0 , " Src channel offset must be non-negative" );
107+ VK_CHECK_COND (
108+ dst_channel_offset >= 0 , " Dst channel offset must be non-negative" );
109+
110+ std::string kernel_name = " copy_channel_offset" ;
111+ kernel_name.reserve (kShaderNameReserve );
112+ add_dtype_suffix (kernel_name, *t_out);
113+
114+ int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);
115+
116+ // Copy one batch at a time.
117+ for (int batch_idx = 0 ; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
118+ batch_idx++) {
119+ // Mapping the tensor NCHW coordinates into texture XYZ coordinates
120+ int32_t dst_first_z = dst_channel_offset / 4 ;
121+ int32_t dst_last_z = (dst_channel_offset + channel_range - 1 ) / 4 ;
122+
123+ // We copy the entire width and height dimension. For the channel dimension,
124+ // we use the z-dimension of the global_size to specify the texture range.
125+ // The shader combines the global invocation id and the dst_offset to get
126+ // the actual coordinate.
127+
128+ ivec3 dst_offset{
129+ 0 , 0 , dst_first_z + batch_idx * api::utils::div_up (out_channels, 4 )};
130+
131+ uvec3 global_size{
132+ dim_at<Dim4D::Width>(in_sizes),
133+ dim_at<Dim4D::Height>(in_sizes),
134+ api::utils::safe_downcast<uint32_t >(dst_last_z - dst_first_z + 1 )};
135+
136+ uvec3 local_size = adaptive_work_group_size (global_size);
137+
138+ const struct Block final {
139+ api::utils::ivec4 out_sizes;
140+ api::utils::ivec4 in_sizes;
141+ int32_t channel_range;
142+ int32_t src_channel_offset;
143+ int32_t dst_channel_offset;
144+ int32_t unused;
145+ ivec3 range;
146+ int32_t unused1;
147+ ivec3 dst_offset;
148+ int32_t unused2;
149+
150+ } channel_offset_params{
151+ api::utils::make_whcn_ivec4 (out_sizes),
152+ api::utils::make_whcn_ivec4 (in_sizes),
153+ channel_range,
154+ src_channel_offset,
155+ dst_channel_offset,
156+ 0 ,
157+ api::utils::make_ivec3 (global_size),
158+ 0 ,
159+ dst_offset,
160+ 0 ,
161+ };
162+
163+ auto shader = VK_KERNEL_FROM_STR (kernel_name);
164+
165+ graph.execute_nodes ().emplace_back (new ExecuteNode (
166+ graph,
167+ VK_KERNEL_FROM_STR (kernel_name),
168+ global_size,
169+ local_size,
170+ // Inputs and Outputs
171+ {
172+ {out, api::MemoryAccessType::WRITE},
173+ {out, api::MemoryAccessType::READ},
174+ {in, api::MemoryAccessType::READ},
175+ },
176+ // Parameter buffers
177+ {graph.create_params_buffer (channel_offset_params)},
178+ // Specialization Constants
179+ {}));
180+ }
181+ }
182+
183+ void add_copy_offset_node (
184+ ComputeGraph& graph,
185+ ValueRef in,
186+ ValueRef range_ref,
187+ ValueRef src_offset_ref,
188+ ValueRef dst_offset_ref,
189+ ValueRef out) {
190+ ivec3 range = api::utils::make_ivec3 (*graph.get_int_list (range_ref));
191+ ivec3 src_offset =
192+ api::utils::make_ivec3 (*graph.get_int_list (src_offset_ref));
193+ ivec3 dst_offset =
194+ api::utils::make_ivec3 (*graph.get_int_list (dst_offset_ref));
195+
196+ add_copy_offset_node (graph, in, range, src_offset, dst_offset, out);
197+ }
198+
199+ void copy_offset (ComputeGraph& graph, const std::vector<ValueRef>& args) {
200+ add_copy_offset_node (graph, args[0 ], args[1 ], args[2 ], args[3 ], args[4 ]);
201+ }
202+
203+ void copy_channel_offset (
204+ ComputeGraph& graph,
205+ const std::vector<ValueRef>& args) {
206+ ValueRef in = args[0 ];
207+ ValueRef channel_range_ref = args[1 ];
208+ ValueRef src_channel_offset_ref = args[2 ];
209+ ValueRef dst_channel_offset_ref = args[3 ];
210+ ValueRef out = args[4 ];
211+
212+ auto channel_range = graph.extract_scalar <int64_t >(channel_range_ref);
213+ auto src_channel_offset =
214+ graph.extract_scalar <int64_t >(src_channel_offset_ref);
215+ auto dst_channel_offset =
216+ graph.extract_scalar <int64_t >(dst_channel_offset_ref);
217+
218+ add_copy_channel_offset_node (
219+ graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
220+ }
221+
// Register both copy variants with the Vulkan operator registry so graph
// construction can resolve them by name.
REGISTER_OPERATORS {
  VK_REGISTER_OP(etvk.copy_offset, copy_offset);
  VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
}
226+
70227} // namespace vkcompute