From d7bc10e808e91ef578a03a2f61602f3ed34a1573 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:48:08 -0700 Subject: [PATCH 1/2] [ET-VK] Adding boolean parameters to add_copy_offset_node to specify index calculation function in copy op's shader. Pull Request resolved: https://github.com/pytorch/executorch/pull/9343 This diff adds two new boolean flags, `calc_out_pos_using_src_chnl` and `calc_in_pos_using_dst_chnl` to add_copy_offset_node, which can be used to specify an indexing function in the shader. ghstack-source-id: 272554190 @exported-using-ghexport Differential Revision: [D71343588](https://our.internmc.facebook.com/intern/diff/D71343588/) --- .../runtime/graph/ops/glsl/copy_offset.glsl | 20 +++++++++++++------ .../vulkan/runtime/graph/ops/impl/Cat.cpp | 2 +- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 13 +++++++++--- backends/vulkan/runtime/graph/ops/impl/Copy.h | 17 +++++++++++++++- .../vulkan/runtime/graph/ops/impl/Repeat.cpp | 9 +++++---- .../vulkan/runtime/graph/ops/impl/Split.cpp | 9 ++++++--- 6 files changed, 52 insertions(+), 18 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index a23822765a3..178814a90c3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -35,6 +35,8 @@ const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); +${layout_declare_spec_const(C, "int", "batch_index_function", "0")} + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); @@ -42,14 +44,20 @@ void main() { return; } - const ivec3 in_pos = pos + src_offset.xyz; + ivec3 in_pos = pos + src_offset.xyz; ivec3 out_pos = pos + dst_offset.xyz; - - // If source channel size is specified compose output z 
based on channel and batch index if (src_offset.w > 0) { - const int channel_index = in_pos.z % src_offset.w; - const int batch_index = in_pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; + if (batch_index_function == 1) { + // batch index is calculated using source channel size + const int channel_index = pos.z % src_offset.w; + const int batch_index = pos.z / src_offset.w; + out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; + } else if (batch_index_function == 2) { + // batch index is calculated using destination channel size + const int channel_index = pos.z % dst_offset.w; + const int batch_index = pos.z / dst_offset.w; + in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; + } } write_texel_lpos( diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index 5f172454121..25a0ff9a7f5 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -80,7 +80,7 @@ void add_cat_default_node( // concatenating channels src_offset[3] = is_concat_channel ? in_channel_size : 0; add_copy_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); + graph, input_ref, range, src_offset, dst_offset, out, true, false); dst_offset[dim_xyz_index] += is_concat_channel ? 
in_channel_size : range[dim_xyz_index]; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 4b09fbe8619..2ecc7400d3e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -25,7 +25,9 @@ void add_copy_offset_node( const ivec3& range, const ivec4& src_offset, const ivec4& dst_offset, - const ValueRef out) { + const ValueRef out, + bool calc_out_pos_using_src_chnl, + bool calc_in_pos_using_dst_chnl) { vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); @@ -49,7 +51,11 @@ void add_copy_offset_node( // Parameter buffers {}, // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + (calc_out_pos_using_src_chnl ? 1 + : calc_in_pos_using_dst_chnl ? 2 + : 0)}, nullptr, {}, { @@ -256,7 +262,8 @@ void add_copy_offset_node( ivec4 src_offset = {src[0], src[1], src[2], 0}; ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out, false, false); } void copy_offset(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h index d4b4c0dcc03..e9388345afa 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h @@ -22,13 +22,28 @@ namespace vkcompute { // It is possible to have input and output to point to the same image // object. But when the source range and destination range overlap, the behavior // is undefined. 
+//
+// boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl
+// can be used to specify an indexing function in the shader
+// If calc_out_pos_using_src_chnl is set to true, channel and batch index will be
+// calculated based on source channel size and will be used to determine
+// destination texel position.
+//
+// If calc_in_pos_using_dst_chnl is set to true, channel and batch index will be
+// calculated based on destination channel size and will be used to determine
+// source texel position.
+//
+// If both are true, calc_out_pos_using_src_chnl is picked. If both are false no
+// index calculation happens.
 void add_copy_offset_node(
     ComputeGraph& graph,
     const ValueRef in,
     const utils::ivec3& range,
     const utils::ivec4& src_offset,
     const utils::ivec4& dst_offset,
-    const ValueRef out);
+    const ValueRef out,
+    bool calc_out_pos_using_src_chnl,
+    bool calc_in_pos_using_dst_chnl);
 
 // add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that
 // its used when copying packed dimension, if tensor is width or height packed.
diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 49daabdcb76..3f4ed4f1090 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -151,7 +151,8 @@ void add_repeat_node( utils::ivec4 src_offset{0, 0, 0, 0}; utils::ivec4 dst_offset{0, 0, 0, 0}; - add_copy_offset_node(graph, in, running_range, src_offset, dst_offset, out); + add_copy_offset_node( + graph, in, running_range, src_offset, dst_offset, out, false, false); } else { add_repeat_channel_node(graph, in, channel_repeat, out, running_range); @@ -166,7 +167,7 @@ void add_repeat_node( utils::ivec4 dst_offset{i * dim_at(in_sizes), 0, 0, 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[0] = running_range[0] * width_repeat; @@ -180,7 +181,7 @@ void add_repeat_node( utils::ivec4 dst_offset = {0, i * dim_at(in_sizes), 0, 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[1] = running_range[1] * height_repeat; @@ -194,7 +195,7 @@ void add_repeat_node( utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[2] = running_range[2] * batch_repeat; diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index ca585f1fb6d..b74317b078e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -51,7 +51,8 @@ void add_split_with_sizes_default_node( // output tensor's size matches with the split_size. 
vTensorPtr t_out = graph.get_tensor(out_ref); utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); src_offset[0] += range[0]; } @@ -62,7 +63,8 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); src_offset[1] += range[1]; } @@ -73,7 +75,8 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); src_offset[2] += range[2]; } From b684279950d3c724f21322a9f3423413a124185d Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 19 Mar 2025 22:58:50 -0400 Subject: [PATCH 2/2] [ET-VK] Adding source_offset processing to copy_packed_dim_offset function. (#9438) This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/9344 by @trivedivivek ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/trivedivivek/65/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/trivedivivek/65/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/trivedivivek/64/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/trivedivivek/65/orig @diff-train-skip-merge Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> --- .../ops/glsl/copy_packed_dim_offset.glsl | 48 ++++++++++++++++--- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 28 +++++++++-- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl index 02ea6405b4a..e0f09f0be43 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl @@ -44,15 +44,49 @@ void main() { return; } - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_lane_offset = out_lane_offset != 0; - // Position in input tensor - const ivec3 in_pos = pos + src_offset.xyz; + ivec3 in_pos = pos + src_offset.xyz; + in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); // Read input value mapping to this output texel - const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + + // Starting offset to read from a texel + const int src_lane_offset = src_offset[packed_dim] & 0x3; + const bool has_src_lane_offset = src_lane_offset != 0; + + // If input lane offset is non zero i.e packed texel is composed from multiple sources + if (has_src_lane_offset) { + // Boundary values 
will come from next input texel in the packed dim. + ivec3 next_in_pos = in_pos; + next_in_pos[packed_dim] = in_pos[packed_dim] + 1; + VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); + + // Keep input values from the end of current input pixel based on src_lane_offset + // offset 1 means the first lane of current input texel is not a part of the output texel + // offset 2 means first 2 lanes are not and so on + if (src_lane_offset == 1) { + in_value.xyz = in_value.yzw; + } else if (src_lane_offset == 2) { + in_value.xy = in_value.zw; + } else { + in_value.x = in_value.w; + } + // Copy next texel's values towards the end of input texel, based on lane offset + // offset 1 means the first lane from next texel is part of the input texel + // offset 2 means first 2 lanes from next texel is part of the input texel and so on + if (src_lane_offset == 1) { + in_value.w = next_value.x; + } else if (src_lane_offset == 2) { + in_value.zw = next_value.xy; + } else { + in_value.yzw = next_value.xyz; + } + } + + // Starting offset to write at within a texel + const int out_lane_offset = dst_offset[packed_dim] & 0x3; + const bool has_dst_lane_offset = out_lane_offset != 0; ivec3 out_pos = pos + dst_offset.xyz; out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); @@ -60,7 +94,7 @@ void main() { VEC4_T out_value; // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_lane_offset) { + if (has_dst_lane_offset) { // When position in packed dim is > 0 if (pos[packed_dim] > 0) { // Boundary values will come from previous input texel in the packed dim. 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 2ecc7400d3e..5756d3a9052 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -92,19 +92,37 @@ void add_copy_packed_dim_offset_node( ivec4 final_range = { range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)}; ivec3 global_wg_size = t_out->logical_limits(); + // The starting offset in a texel where this tensor will start copying from + const auto src_lane_offset = src_offset[packed_dim] & 0x3; // The starting offset in a texel where this tensor will start copying to const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; + + // The total packed texels this tensor will be copied from + // The first texel of tensor data in packed dimension will be copied from + // remaining lanes from current source Hence (4 - src_lane_offset) is added + // to tensor size in packed dimension + const auto src_packed_size = utils::div_up_4( + (4 - src_lane_offset) + + dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim))); + // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to remain - // lanes from previous write Hence (4 - dst_lane_offset) is added to tensor - // size in packed dimension + // The first texel of tensor data in packed dimension will be copied to + // remaining lanes from previous write Hence (4 - dst_lane_offset) is added to + // tensor size in packed dimension const auto dst_packed_size = utils::div_up_4( (4 - dst_lane_offset) + dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim))); - // If the starting offset is not 0, and the total packed texels is greater + // If the starting src offset is not 0, and the total packed texels is greater // than the source texel range - if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) { + const bool 
has_additional_src_work = + src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; + // If the starting dst offset is not 0, and the total packed texels is greater + // than the source texel range + const bool has_additional_dst_work = + dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; + + if (has_additional_src_work || has_additional_dst_work) { global_wg_size[packed_dim]++; // Increase the global work group size in // packed dimension final_range[packed_dim]++; // Increase the range in packed dimension