88
99#include < executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
1010
11+ #include < executorch/backends/vulkan/runtime/api/api.h>
12+ #include < executorch/backends/vulkan/runtime/graph/Logging.h>
13+
14+ #include < executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
1115#include < executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
1216#include < executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
1317#include < executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
1418
1519namespace vkcompute {
1620
21+ using api::utils::ivec3;
22+ using api::utils::uvec3;
23+
1724void add_copy_offset_node (
1825 ComputeGraph& graph,
1926 const ValueRef in,
20- const api::utils:: ivec3& range,
21- const api::utils:: ivec3& src_offset,
22- const api::utils:: ivec3& dst_offset,
27+ const ivec3& range,
28+ const ivec3& src_offset,
29+ const ivec3& dst_offset,
2330 const ValueRef out) {
2431 vTensorPtr t_in = graph.get_tensor (in);
2532 vTensorPtr t_out = graph.get_tensor (out);
2633
27- VK_CHECK_COND (check_memory_layout_is (*t_in, api::kChannelsPacked ));
28- VK_CHECK_COND (check_memory_layout_is (*t_out, api::kChannelsPacked ));
29-
3034 std::string kernel_name = " copy_offset" ;
3135 kernel_name.reserve (kShaderNameReserve );
3236 add_dtype_suffix (kernel_name, *t_out);
3337
34- api::utils:: uvec3 global_size = api::utils::make_uvec3 (range);
35- api::utils:: uvec3 local_size = adaptive_work_group_size (global_size);
38+ uvec3 global_size = api::utils::make_uvec3 (range);
39+ uvec3 local_size = adaptive_work_group_size (global_size);
3640
3741 const struct Block final {
38- api::utils:: ivec3 range;
42+ ivec3 range;
3943 int32_t unused0;
40- api::utils:: ivec3 src_offset;
44+ ivec3 src_offset;
4145 int32_t unused1;
42- api::utils:: ivec3 dst_offset;
46+ ivec3 dst_offset;
4347 int32_t unused2;
4448 } offset_params{
4549 range,
@@ -58,13 +62,166 @@ void add_copy_offset_node(
5862 global_size,
5963 local_size,
6064 // Inputs and Outputs
61- {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
65+ {
66+ {out, api::MemoryAccessType::WRITE},
67+ {in, api::MemoryAccessType::READ},
68+ },
6269 // Parameter buffers
63- {t_out->texture_limits_ubo (),
64- t_in->texture_limits_ubo (),
65- graph.create_params_buffer (offset_params)},
70+ {graph.create_params_buffer (offset_params)},
6671 // Specialization Constants
6772 {}));
6873}
6974
75+ void add_copy_channel_offset_node (
76+ ComputeGraph& graph,
77+ const ValueRef in,
78+ int32_t channel_range,
79+ int32_t src_channel_offset,
80+ int32_t dst_channel_offset,
81+ const ValueRef out) {
82+ vTensorPtr t_in = graph.get_tensor (in);
83+ vTensorPtr t_out = graph.get_tensor (out);
84+
85+ // Likely need to prepad these numbers.
86+ std::vector<int64_t > in_sizes = t_in->sizes ();
87+ std::vector<int64_t > out_sizes = t_out->sizes ();
88+
89+ VK_CHECK_COND (check_memory_layout_is (*t_in, api::kChannelsPacked ));
90+ VK_CHECK_COND (check_memory_layout_is (*t_out, api::kChannelsPacked ));
91+
92+ // NOTE: This function should be able to support 1d and 2d tensors when
93+ // range=1, src_offset=dst_offset=1.
94+ VK_CHECK_COND (t_in->dim () >= 3 , " Src dim should be at least 3" );
95+ VK_CHECK_COND (t_out->dim () >= 3 , " Dst dim should be at least 3" );
96+
97+ VK_CHECK_COND (
98+ dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
99+ " Source channel plus range should be less than or equal to input tensor's channel size" );
100+ VK_CHECK_COND (
101+ dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
102+ " Source channel and range should be less than or equal to input tensor's channel size" );
103+
104+ VK_CHECK_COND (channel_range >= 0 , " Channel range must be non-negative" );
105+ VK_CHECK_COND (
106+ src_channel_offset >= 0 , " Src channel offset must be non-negative" );
107+ VK_CHECK_COND (
108+ dst_channel_offset >= 0 , " Dst channel offset must be non-negative" );
109+
110+ std::string kernel_name = " copy_channel_offset" ;
111+ kernel_name.reserve (kShaderNameReserve );
112+ add_dtype_suffix (kernel_name, *t_out);
113+
114+ int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);
115+
116+ // Copy one batch at a time.
117+ for (int batch_idx = 0 ; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
118+ batch_idx++) {
119+ // Mapping the tensor NCHW coordinates into texture XYZ coordinates
120+ int32_t dst_first_z = dst_channel_offset / 4 ;
121+ int32_t dst_last_z = (dst_channel_offset + channel_range - 1 ) / 4 ;
122+
123+ // We copy the entire width and height dimension. For the channel dimension,
124+ // we use the z-dimension of the global_size to specify the texture range.
125+ // The shader combines the global invocation id and the dst_offset to get
126+ // the actual coordinate.
127+
128+ ivec3 dst_offset{
129+ 0 , 0 , dst_first_z + batch_idx * api::utils::div_up (out_channels, 4 )};
130+
131+ uvec3 global_size{
132+ dim_at<Dim4D::Width>(in_sizes),
133+ dim_at<Dim4D::Height>(in_sizes),
134+ api::utils::safe_downcast<uint32_t >(dst_last_z - dst_first_z + 1 )};
135+
136+ uvec3 local_size = adaptive_work_group_size (global_size);
137+
138+ const struct Block final {
139+ api::utils::ivec4 out_sizes;
140+ api::utils::ivec4 in_sizes;
141+ int32_t channel_range;
142+ int32_t src_channel_offset;
143+ int32_t dst_channel_offset;
144+ int32_t unused;
145+ ivec3 range;
146+ int32_t unused1;
147+ ivec3 dst_offset;
148+ int32_t unused2;
149+
150+ } channel_offset_params{
151+ api::utils::make_whcn_ivec4 (out_sizes),
152+ api::utils::make_whcn_ivec4 (in_sizes),
153+ channel_range,
154+ src_channel_offset,
155+ dst_channel_offset,
156+ 0 ,
157+ api::utils::make_ivec3 (global_size),
158+ 0 ,
159+ dst_offset,
160+ 0 ,
161+ };
162+
163+ auto shader = VK_KERNEL_FROM_STR (kernel_name);
164+
165+ graph.execute_nodes ().emplace_back (new ExecuteNode (
166+ graph,
167+ VK_KERNEL_FROM_STR (kernel_name),
168+ global_size,
169+ local_size,
170+ // Inputs and Outputs
171+ {
172+ {out, api::MemoryAccessType::WRITE},
173+ {out, api::MemoryAccessType::READ},
174+ {in, api::MemoryAccessType::READ},
175+ },
176+ // Parameter buffers
177+ {graph.create_params_buffer (channel_offset_params)},
178+ // Specialization Constants
179+ {}));
180+ }
181+ }
182+
183+ void add_copy_offset_node (
184+ ComputeGraph& graph,
185+ ValueRef in,
186+ ValueRef range_ref,
187+ ValueRef src_offset_ref,
188+ ValueRef dst_offset_ref,
189+ ValueRef out) {
190+ ivec3 range = api::utils::make_ivec3 (*graph.get_int_list (range_ref));
191+ ivec3 src_offset =
192+ api::utils::make_ivec3 (*graph.get_int_list (src_offset_ref));
193+ ivec3 dst_offset =
194+ api::utils::make_ivec3 (*graph.get_int_list (dst_offset_ref));
195+
196+ add_copy_offset_node (graph, in, range, src_offset, dst_offset, out);
197+ }
198+
199+ void copy_offset (ComputeGraph& graph, const std::vector<ValueRef>& args) {
200+ add_copy_offset_node (graph, args[0 ], args[1 ], args[2 ], args[3 ], args[4 ]);
201+ }
202+
203+ void copy_channel_offset (
204+ ComputeGraph& graph,
205+ const std::vector<ValueRef>& args) {
206+ ValueRef in = args[0 ];
207+ ValueRef channel_range_ref = args[1 ];
208+ ValueRef src_channel_offset_ref = args[2 ];
209+ ValueRef dst_channel_offset_ref = args[3 ];
210+ ValueRef out = args[4 ];
211+
212+ auto channel_range = graph.extract_scalar <int64_t >(channel_range_ref);
213+ auto src_channel_offset =
214+ graph.extract_scalar <int64_t >(src_channel_offset_ref);
215+ auto dst_channel_offset =
216+ graph.extract_scalar <int64_t >(dst_channel_offset_ref);
217+
218+ add_copy_channel_offset_node (
219+ graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
220+ }
221+
// Register both copy variants with the Vulkan operator registry so graph
// construction can resolve them by name.
REGISTER_OPERATORS {
  VK_REGISTER_OP(etvk.copy_offset, copy_offset);
  VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
}
226+
70227} // namespace vkcompute