diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
index c1b75ea8d0d..3100565d08a 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
@@ -21,16 +21,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 layout(push_constant) uniform restrict Block {
   ivec4 range;
 
-  // if not repeating
   // xyz is source offset w is channel size
-  // if repeating
-  // xyzw is source tensor sizes in WHCB dims respectively
   ivec4 src_offset;
 
-  // if not repeating
   // xyz is destination offset w is channel size
-  // if repeating
-  // xyzw is destination tensor sizes in WHCB dims respectively
   ivec4 dst_offset;
 };
@@ -45,9 +39,13 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
 ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
 
-${layout_declare_spec_const(C, "int", "repeat", "0")}
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, range.xyz))) {
+    return;
+  }
 
-void no_repeat_copy(ivec3 pos) {
   // Position in input tensor
   ivec3 in_pos = pos + src_offset.xyz;
   in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
@@ -135,103 +133,3 @@ void no_repeat_copy(ivec3 pos) {
     out_value,
     out_axis_map);
 }
-
-void repeat_copy(ivec3 pos) {
-  // expand position in packed dim
-  pos[packed_dim] <<= 2;
-
-  // channel size aligned by 4 when tensors are channel packed raw value otherwise
-  const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);
-
-  // find input texel's WHCB index
-  const int width_index = pos.x % src_offset.x;
-  const int height_index = pos.y % src_offset.y;
-  int channel_index;
-  int batch_index;
-
-  // if tensors are channel packed
-  if (packed_dim == C_DIM) {
-    // the output channels in a batch will be channel size * channel repetitions aligned by 4
-    const int out_channel_size = alignup4(src_offset.z * dst_offset.z);
-
-    // batch index in the output
-    const int out_pos_batch_index = pos.z / out_channel_size;
-
-    // source batch index for based on current output pos
-    batch_index = out_pos_batch_index % src_offset.w;
-
-    // batch repetition count for current output pos
-    const int batch_repetition_index = out_pos_batch_index / src_offset.w;
-
-    // calculate input channel index based on current output pos and batch index
-    // its done this way because we want source channel to restart from zero when a batch index increments
-    // also batch_index will reset to zero after hitting batch repetition count
-    // so track the current repetition in batch_repetition_index so it can be used for determining current_index
-    channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
-  } else {
-    // the output channels in a batch will be channel size * channel repetitions
-    const int out_channel_size = src_offset.z * dst_offset.z;
-
-    // source batch index for based on current output pos
-    batch_index = (pos.z / out_channel_size) % src_offset.w;
-
-    // source channel index is current output pos wrapped based on channel count
-    channel_index = pos.z % src_offset.z;
-  }
-
-  // input texel's WCB position
-  const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
-
-  // squeeze position in packed dim
-  pos[packed_dim] >>= 2;
-
-  // packed dim index of texel last fetched
-  int fetched_in_pos_packed_dim = -1;
-
-  // fetched input texel
-  VEC4_T in_value;
-
-  // output texel value
-  VEC4_T out_value = VEC4_T(0);
-
-  int src_lane_offset = in_pos[packed_dim];
-
-  for (int i=0; i<4; i++) {
-    if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
-      fetched_in_pos_packed_dim = (src_lane_offset >> 2);
-
-      ivec3 curr_in_pos = in_pos;
-      curr_in_pos[packed_dim] = src_lane_offset;
-      curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
-      curr_in_pos[packed_dim] >>= 2;
-
-      in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
-    }
-
-    out_value[i] = in_value[src_lane_offset & 0x3];
-
-    src_lane_offset++;
-    // if packed index exceeded source packed dim round to zero
-    src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
-  }
-
-  write_texel_lpos(
-    t_out,
-    pos,
-    out_value,
-    out_axis_map);
-}
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, range.xyz))) {
-    return;
-  }
-
-  if (repeat == 1) {
-    repeat_copy(pos);
-  } else {
-    no_repeat_copy(pos);
-  }
-}
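The `>> 2` / `& 0x3` arithmetic above is the texel-packing scheme both this shader and the new `repeat` shader rely on: a texture-backed tensor stores four consecutive elements of the packed dimension in one texel, so an element index along that dimension splits into a texel coordinate and a lane. A minimal C++ sketch of that split; the helper name is illustrative, not taken from the codebase:

```cpp
#include <cstdint>
#include <utility>

// Illustrative helper (not part of the runtime): split an element index along
// the packed dimension into the texel coordinate that holds it and the lane
// (0-3) within that texel, mirroring the shader's `>> 2` and `& 0x3` usage.
inline std::pair<int32_t, int32_t> texel_and_lane(int32_t packed_index) {
  return {packed_index >> 2, packed_index & 0x3};
}
```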
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl
new file mode 100644
index 00000000000..441cd57c17d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 range;
+  // source tensor sizes in WHCB dims respectively
+  ivec4 src_dims;
+  // destination tensor repeats in WHCB dims respectively
+  ivec4 dst_repeats;
+};
+
+#include "indexing_utils.h"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
+const lowp int packed_dim = unhash_packed_dim(out_layout);
+
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
+
+void main() {
+  ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, range.xyz))) {
+    return;
+  }
+
+  // expand position in packed dim
+  pos[packed_dim] <<= 2;
+
+  // channel size aligned to 4 when tensors are channel packed, raw value otherwise
+  const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z);
+
+  // find the input texel's WHCB index
+  const int width_index = pos.x % src_dims.x;
+  const int height_index = pos.y % src_dims.y;
+  int channel_index;
+  int batch_index;
+
+  // if tensors are channel packed
+  if (packed_dim == C_DIM) {
+    // the output channels in a batch will be channel size * channel repetitions, aligned to 4
+    const int out_channel_size = alignup4(src_dims.z * dst_repeats.z);
+
+    // batch index in the output
+    const int out_pos_batch_index = pos.z / out_channel_size;
+
+    // source batch index based on the current output pos
+    batch_index = out_pos_batch_index % src_dims.w;
+
+    // batch repetition count for the current output pos
+    const int batch_repetition_index = out_pos_batch_index / src_dims.w;
+
+    // calculate the input channel index based on the current output pos and batch index
+    // it is done this way because the source channel must restart from zero when the batch index increments
+    // also, batch_index resets to zero after hitting the batch repetition count,
+    // so the current repetition is tracked in batch_repetition_index and used when determining channel_index
+    channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z;
+  } else {
+    // the output channels in a batch will be channel size * channel repetitions
+    const int out_channel_size = src_dims.z * dst_repeats.z;
+
+    // source batch index based on the current output pos
+    batch_index = (pos.z / out_channel_size) % src_dims.w;
+
+    // source channel index is the current output pos wrapped based on the channel count
+    channel_index = pos.z % src_dims.z;
+  }
+
+  // input texel's WHC position
+  const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
+
+  // squeeze position in packed dim
+  pos[packed_dim] >>= 2;
+
+  // packed dim index of the texel last fetched
+  int fetched_in_pos_packed_dim = -1;
+
+  // fetched input texel
+  VEC4_T in_value;
+
+  // output texel value
+  VEC4_T out_value = VEC4_T(0);
+
+  int src_lane_offset = in_pos[packed_dim];
+
+  for (int i=0; i<4; i++) {
+    if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
+      fetched_in_pos_packed_dim = (src_lane_offset >> 2);
+
+      ivec3 curr_in_pos = in_pos;
+      curr_in_pos[packed_dim] = src_lane_offset;
+      curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
+      curr_in_pos[packed_dim] >>= 2;
+
+      in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map));
+    }
+
+    out_value[i] = in_value[src_lane_offset & 0x3];
+
+    src_lane_offset++;
+    // if the packed index exceeds the source packed dim, wrap it back to zero
+    src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]);
+  }
+
+  write_texel_lpos(
+    t_out,
+    pos,
+    out_value,
+    out_axis_map);
+}
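Stripped of the packing details, what the new shader computes per element is plain modulo wrap-around in WHCB space. A scalar C++ reference of that mapping, useful for checking the shader's indexing on the CPU (function and parameter names are illustrative):

```cpp
#include <array>

// Scalar reference for repeat (illustrative, not part of the runtime): the
// output element at WHCB index `out_idx` reads the input element at the same
// index wrapped by the source sizes. The shader reproduces this per texel
// lane, with extra alignment handling when the channel dim is packed.
inline std::array<int, 4> repeat_src_index(
    const std::array<int, 4>& out_idx,    // output (w, h, c, b)
    const std::array<int, 4>& src_dims) { // source sizes in WHCB order
  return {
      out_idx[0] % src_dims[0],
      out_idx[1] % src_dims[1],
      out_idx[2] % src_dims[2],
      out_idx[3] % src_dims[3]};
}
```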
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml
new file mode 100644
index 00000000000..526980a0f41
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml
@@ -0,0 +1,14 @@
+repeat:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int
+      - VALUE: int8
+      - VALUE: uint8
+  shader_variants:
+    - NAME: repeat
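The `generate_variant_forall` block asks the shader codegen to emit one `repeat` variant per listed DTYPE; at dispatch time, `add_repeat_node` (below) rebuilds the variant name by appending a dtype suffix to the base kernel name. A sketch of that naming convention, assuming the suffix is simply the lowercase dtype string:

```cpp
#include <string>

// Assumed naming convention: base shader name plus a dtype suffix, e.g.
// "repeat_float" or "repeat_half". The authoritative spelling comes from
// add_dtype_suffix in the runtime; this only illustrates the idea.
inline std::string repeat_variant_name(const std::string& dtype) {
  return "repeat_" + dtype; // dtype in {"half", "float", "int", "int8", "uint8"}
}
```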
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
index ecc2faa392a..80379880b0f 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -71,21 +71,17 @@ void add_copy_packed_dim_offset_node(
     const ivec3& range,
     const ivec4& src_offset,
     const ivec4& dst_offset,
-    const ValueRef out,
-    bool repeat) {
+    const ValueRef out) {
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  // Check the packed dimension is same for both tensors
-  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
-  if (!repeat) {
-    // For non repeat copy also check if the packed dimension is Width or
-    // Height. Since the function does not support channel packing.
-    VK_CHECK_COND(
-        check_same_packed_dim(*t_in, *t_out) &&
-        (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
-         check_packed_dim_is(*t_in, WHCN::kHeightDim)));
-  }
+  // Check that the packed dimension is the same for both tensors, and that
+  // it is Width or Height, since this function does not support channel
+  // packing.
+  VK_CHECK_COND(
+      check_same_packed_dim(*t_in, *t_out) &&
+      (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
+       check_packed_dim_is(*t_in, WHCN::kHeightDim)));
 
   std::string kernel_name = "copy_packed_dim_offset";
   kernel_name.reserve(kShaderNameReserve);
@@ -96,43 +92,41 @@ void add_copy_packed_dim_offset_node(
       range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
 
   ivec3 global_wg_size = t_out->logical_limits();
-  if (!repeat) {
-    const auto packed_dim = t_in->packed_dim();
-    // The starting offset in a texel where this tensor will start copying from
-    const auto src_lane_offset = src_offset[packed_dim] & 0x3;
-    // The starting offset in a texel where this tensor will start copying to
-    const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
-
-    // The total packed texels this tensor will be copied from
-    // The first texel of tensor data in packed dimension will be copied from
-    // remaining lanes from current source Hence (4 - src_lane_offset) is added
-    // to tensor size in packed dimension
-    const auto src_packed_size = utils::div_up_4(
-        (4 - src_lane_offset) +
-        dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
-
-    // The total packed texels this tensor will be copied to
-    // The first texel of tensor data in packed dimension will be copied to
-    // remaining lanes from previous write Hence (4 - dst_lane_offset) is added
-    // to tensor size in packed dimension
-    const auto dst_packed_size = utils::div_up_4(
-        (4 - dst_lane_offset) +
-        dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
-
-    // If the starting src offset is not 0, and the total packed texels is
-    // greater than the source texel range
-    const bool has_additional_src_work =
-        src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
-    // If the starting dst offset is not 0, and the total packed texels is
-    // greater than the source texel range
-    const bool has_additional_dst_work =
-        dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
-
-    if (has_additional_src_work || has_additional_dst_work) {
-      global_wg_size[packed_dim]++; // Increase the global work group size in
-                                    // packed dimension
-      final_range[packed_dim]++; // Increase the range in packed dimension
-    }
+  const auto packed_dim = t_in->packed_dim();
+  // The starting offset in a texel where this tensor will start copying from
+  const auto src_lane_offset = src_offset[packed_dim] & 0x3;
+  // The starting offset in a texel where this tensor will start copying to
+  const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
+
+  // The total packed texels this tensor will be copied from
+  // The first texel of tensor data in the packed dimension will be copied from
+  // the remaining lanes of the current source; hence (4 - src_lane_offset) is
+  // added to the tensor size in the packed dimension
+  const auto src_packed_size = utils::div_up_4(
+      (4 - src_lane_offset) +
+      dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
+
+  // The total packed texels this tensor will be copied to
+  // The first texel of tensor data in the packed dimension will be copied to
+  // the remaining lanes of the previous write; hence (4 - dst_lane_offset) is
+  // added to the tensor size in the packed dimension
+  const auto dst_packed_size = utils::div_up_4(
+      (4 - dst_lane_offset) +
+      dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
+
+  // If the starting src offset is not 0, and the total packed texels is
+  // greater than the source texel range
+  const bool has_additional_src_work =
+      src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
+  // If the starting dst offset is not 0, and the total packed texels is
+  // greater than the source texel range
+  const bool has_additional_dst_work =
+      dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
+
+  if (has_additional_src_work || has_additional_dst_work) {
+    global_wg_size[packed_dim]++; // Increase the global work group size in
+                                  // packed dimension
+    final_range[packed_dim]++; // Increase the range in packed dimension
   }
 
   auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -151,7 +145,7 @@ void add_copy_packed_dim_offset_node(
       // Parameter buffers
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
       nullptr,
       {},
       {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h
index 9761d571caf..41956d482d9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Copy.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h
@@ -52,17 +52,13 @@ void add_copy_offset_node(
 // It copies the texture extents specified by the range, src_offset, and
 // dst_offset (all are in texture coordinate (x, y, z) from the input image to
 // the output image.
-//
-// repeat flag is used to indicate if copy should wrap around tensor dim.
-// only true for repeat op.
 void add_copy_packed_dim_offset_node(
     ComputeGraph& graph,
     const ValueRef in,
     const utils::ivec3& range,
     const utils::ivec4& src_offset,
     const utils::ivec4& dst_offset,
-    const ValueRef out,
-    bool repeat = false);
+    const ValueRef out);
 
 // add_copy_channel_offset_node behaves similar to add_copy_node, except that it
 // works on the channel dimensions of the tensor (up to 4 dimensions in NCHW).
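The `src_packed_size` / `dst_packed_size` computation in Copy.cpp is easiest to see with numbers. A self-contained sketch of the extra-texel check, with a local stand-in for `utils::div_up_4` (all names here are illustrative):

```cpp
#include <cstdint>

// Illustrative stand-ins (not the runtime helpers). If a copy starts at a
// nonzero lane of a texel, the first texel is only partially consumed, so
// (4 - lane_offset) extra element slots are counted before rounding up.
inline int32_t div_up_4_local(int32_t n) {
  return (n + 3) / 4;
}

inline bool needs_extra_texel(
    int32_t lane_offset,    // starting lane in the first texel, 0-3
    int32_t packed_elems,   // elements to copy along the packed dim
    int32_t range_texels) { // texel range the dispatch already covers
  const int32_t packed_size = div_up_4_local((4 - lane_offset) + packed_elems);
  return lane_offset != 0 && packed_size > range_texels;
}

// e.g. lane_offset 2, 10 elements, range 3 texels: ceil(12 / 4) = 3, no extra
// texel needed; with 11 elements: ceil(13 / 4) = 4 > 3, so the range and the
// global work group size each grow by one in the packed dimension.
```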
diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
index 38221e8a348..24e51e99c73 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
@@ -69,18 +69,49 @@ void add_repeat_node(
   vTensorPtr t_out = graph.get_tensor(out);
   check_args(*t_in, repeats, *t_out);
 
-  const utils::ivec4 src_offset{
+  const utils::ivec4 src_dims{
       dim_at<kWidth4D>(t_in->sizes()),
       dim_at<kHeight4D>(t_in->sizes()),
      dim_at<kChannel4D>(t_in->sizes()),
       dim_at<kBatch4D>(t_in->sizes())};
-  const utils::ivec4 dst_offset{
+  const utils::ivec4 dst_repeats{
       dim_at<kWidth4D>(repeats),
       dim_at<kHeight4D>(repeats),
       dim_at<kChannel4D>(repeats),
       dim_at<kBatch4D>(repeats)};
 
-  add_copy_packed_dim_offset_node(
-      graph, in, t_out->logical_limits(), src_offset, dst_offset, out, true);
+  std::string kernel_name = "repeat";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *t_out);
+
+  // Global work group size matches the output's logical texel extents
+  const utils::ivec3 wg_size = t_out->logical_limits();
+
+  const auto shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      wg_size,
+      graph.create_local_wg_size(wg_size),
+      // Inputs and Outputs
+      {
+          {out, vkapi::MemoryAccessType::WRITE},
+          {in, vkapi::MemoryAccessType::READ},
+      },
+      // Parameter buffers
+      {},
+      // Specialization Constants
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
+      nullptr,
+      {},
+      {
+          PushConstantDataInfo(&wg_size, sizeof(wg_size), sizeof(utils::ivec4)),
+          PushConstantDataInfo(
+              &src_dims, sizeof(src_dims), sizeof(utils::ivec4)),
+          PushConstantDataInfo(
+              &dst_repeats, sizeof(dst_repeats), sizeof(utils::ivec4)),
+      }));
 }
 
 void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
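One detail of the push-constant setup in `add_repeat_node`: each `PushConstantDataInfo` entry is padded out to `sizeof(utils::ivec4)`, so the shader's `Block` receives three 16-byte fields even though `wg_size` is an ivec3; its fourth component is padding the shader never reads. A C++ mirror of the layout the GLSL block expects, assuming 4-byte ints (the struct is illustrative, not part of the runtime):

```cpp
#include <cstdint>

// Assumed mirror of the repeat shader's push-constant Block, matching the
// three 16-byte-padded PushConstantDataInfo entries above.
struct RepeatPushConstants {
  int32_t range[4];       // xyz = dispatch extents (wg_size), w = padding
  int32_t src_dims[4];    // source tensor sizes in WHCB order
  int32_t dst_repeats[4]; // repeat counts in WHCB order
};
static_assert(sizeof(RepeatPushConstants) == 48, "three ivec4-sized fields");
```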