pytorch
diff --git a/backends/vulkan/runtime/api/Tensor.cpp b/backends/vulkan/runtime/api/Tensor.cpp
Lines changed: 48 additions & 3 deletions
diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
Lines changed: 26 additions & 6 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
Lines changed: 3 additions & 5 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
Lines changed: 3 additions & 5 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
Lines changed: 3 additions & 5 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
Lines changed: 4 additions & 6 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
Lines changed: 4 additions & 4 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul.glsl
Lines changed: 3 additions & 5 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl
Lines changed: 3 additions & 5 deletions
diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
Lines changed: 7 additions & 5 deletions
@@ -139,8 +139,10 @@ vTensor::vTensor(
       // Calculate sizes and strides
       sizes_(sizes.begin(), sizes.end()),
       gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
-      // Utility Uniform Buffer that can be passed to shaders as arguments
-      sizes_uniform_(context, api::utils::make_whcn_ivec4(sizes_)),
+      texture_limits_{{0, 0, 0}},
+      // Utility Uniform Buffers that can be passed to shaders as arguments
+      sizes_uniform_(),
+      texture_limits_uniform_(),
       // Construct Tensor storage
       storage_(
           context,
@@ -149,6 +151,13 @@ vTensor::vTensor(
           gpu_sizes_,
           dtype_,
           allocate_memory) {
+  if (storage_type != api::kBuffer) {
+    texture_limits_.limits = api::utils::ivec3{
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[0]),
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[1]),
+        api::utils::safe_downcast<int32_t>(storage_.extents_.data[2])};
+  }
+
   if (dtype == api::kHalf) {
     VK_CHECK_COND(
         api::context()->adapter_ptr()->has_16bit_storage(),
@@ -187,6 +196,22 @@ api::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }
 
+const api::BufferBindInfo vTensor::sizes_ubo() { // Returns bind info for the tensor-sizes uniform buffer, creating that buffer on first use.
+  if (!sizes_uniform_.buffer()) { // A falsy buffer() is treated as "not created yet"; the constructor only default-initializes sizes_uniform_.
+    sizes_uniform_ = api::UniformParamsBuffer(
+        storage_.context_, api::utils::make_whcn_ivec4(sizes_)); // Seeded from the current sizes_, so resizes made before the first call are reflected.
+  }
+  return api::BufferBindInfo(sizes_uniform_.buffer());
+}
+
+const api::BufferBindInfo vTensor::texture_limits_ubo() { // Returns bind info for the texture-limits uniform buffer, creating that buffer on first use.
+  if (!texture_limits_uniform_.buffer()) { // A falsy buffer() is treated as "not created yet"; the constructor only default-initializes this member.
+    texture_limits_uniform_ =
+        api::UniformParamsBuffer(storage_.context_, texture_limits_); // Seeded from the current texture_limits_ value.
+  }
+  return api::BufferBindInfo(texture_limits_uniform_.buffer());
+}
+
 VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
   switch (storage_type()) {
     case api::kBuffer:
@@ -224,7 +249,25 @@ void vTensor::bind_allocation(const api::MemoryAllocation& allocation) {
 void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) { // Refreshes sizes_, gpu_sizes_, texture_limits_ and any already-created uniform buffers for new_sizes.
   sizes_ = new_sizes;
   gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
-  sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
+
+  if (storage_type() != api::kBuffer) { // Texture limits are only recomputed for non-buffer storage.
+    // Calculate the extents of the image texture that would have been required
+    // for a tensor of the new sizes.
+    api::utils::uvec3 virtual_extents =
+        create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
+    // Update the texture limits to reflect the new virtual extents.
+    texture_limits_.limits = api::utils::ivec3{
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
+        api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
+  }
+
+  if (sizes_uniform_.buffer()) { // Only push to a buffer that already exists; otherwise sizes_ubo() builds it later from sizes_.
+    sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
+  }
+  if (texture_limits_uniform_.buffer()) { // Same rule: texture_limits_ubo() builds the buffer later from texture_limits_ if it is still absent.
+    texture_limits_uniform_.update(texture_limits_);
+  }
 }
 
 void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
@@ -236,6 +279,8 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
 }
 
 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
+  // For texture storage check that the current texture is large enough for the
+  // new sizes of the tensor.
   if (storage_type() != api::kBuffer) {
     api::utils::uvec3 virtual_extents =
         create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
 
@@ -94,6 +94,13 @@ class vTensorStorage final {
 };
 
 class vTensor final {
+  struct TextureLimits { // Value type of texture_limits_; it is the data handed to texture_limits_uniform_.
+    // Alignment is required to conform with Vulkan specification; a 3 or 4
+    // component vector with components of size N must have base alignment of
+    // 4N.
+    alignas(16) api::utils::ivec3 limits; // 16 == 4N assuming 4-byte integer components (values are written via safe_downcast<int32_t>).
+  };
+
  public:
   explicit vTensor(
       api::Context* context,
@@ -115,11 +122,18 @@ class vTensor final {
 
   std::vector<int64_t> sizes_;
   std::vector<int64_t> gpu_sizes_;
+  TextureLimits texture_limits_;
 
-  // A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
-  // passed into a shader.
+  // A Vulkan uniform buffer containing the (W, H, C, N) tensor sizes that can
+  // be passed into a shader.
   api::UniformParamsBuffer sizes_uniform_;
 
+  // A Vulkan uniform buffer containing the texture limits derived from the
+  // tensor's current size information that can be passed into a shader. Note
+  // that the texture limits may be different from the texture's extents if the
+  // tensor has been resized with `virtual_resize()`.
+  api::UniformParamsBuffer texture_limits_uniform_;
+
   vTensorStorage storage_;
 
  public:
@@ -194,11 +208,17 @@ class vTensor final {
 
   /*
    * Get the binding information for the uniform buffer object containing the
-   * tensor sizes to use in a compute shader.
+   * tensor sizes to use in a compute shader. Note that the GPU buffer will be
+   * allocated the first time this function is called.
    */
-  inline const api::BufferBindInfo sizes_ubo() {
-    return api::BufferBindInfo(sizes_uniform_.buffer());
-  }
+  const api::BufferBindInfo sizes_ubo();
+
+  /*
+   * Get the binding information for the uniform buffer object containing the
+   * texture limits to use in a compute shader. Note that the GPU buffer will be
+   * allocated the first time this function is called.
+   */
+  const api::BufferBindInfo texture_limits_ubo();
 
   inline size_t numel() const {
     return api::utils::multiply_integers(sizes());
 
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a 2D convolution. Each shader invocation calculates the output at
  * a single output location.
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
 
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
 
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
 
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() exits if pos[0] is outside it and only stores tile positions strictly inside it.
 };
 
 layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
  * output tile for pointwise convolution is more efficient because the kernel
@@ -71,7 +69,7 @@ void main() {
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (pos_out_of_bounds(pos[0], out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos[0], out_limits))) {
     return;
   }
 
@@ -146,7 +144,7 @@ void main() {
   }
 
   for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) {
-    if (!pos_out_of_bounds(pos[i], out_sizes, packed_dim)) {
+    if (all(lessThan(pos[i], out_limits))) {
       imageStore(image_out, pos[i], sum[i]);
     }
   }
 
@@ -21,11 +21,11 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
 };
 
-layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
+layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
   ivec4 in_sizes;
 };
 
@@ -54,7 +54,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
 
@@ -16,8 +16,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
 layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
 
-layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
 };
 
 layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -26,12 +26,10 @@ layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int out_packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, out_packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
 
@@ -19,8 +19,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx;
 layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
 
-layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
-  ivec4 out_sizes;
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
 };
 
 layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -36,12 +36,10 @@ layout(set = 0, binding = 5) uniform PRECISION restrict Params {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
 
 
@@ -25,22 +25,24 @@ layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
 layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;
 
-layout(set = 0, binding = 6) uniform PRECISION restrict Sizes {
+layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // Exclusive per-axis upper bound on output positions; main() returns early when any component of pos >= out_limits.
+};
+
+layout(set = 0, binding = 7) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-layout(set = 0, binding = 7) uniform PRECISION restrict Epsilon {
+layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon {
   float epsilon;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+  if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }