Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c02d086

Browse files
Committed Mar 17, 2025
[ET-VK] Adding all tensor packing support to cat op.
This diff updates the ExecuTorch Vulkan backend's cat operation to support width-, height- and channel-packed tensors. It also updates the op_registry.py file to indicate that the cat operation supports all packed dimensions, and adds new test cases to the cases.py file to exercise the operation. Differential Revision: [D71230768](https://our.internmc.facebook.com/intern/diff/D71230768/) [ghstack-poisoned]
1 parent 8c66fcd commit c02d086

File tree

10 files changed

+311
-78
lines changed

10 files changed

+311
-78
lines changed
 

‎backends/vulkan/op_registry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,6 @@ def register_view_op(features: OpFeatures):
528528
exir_ops.edge.aten.index_select.default,
529529
exir_ops.edge.aten.select_copy.int,
530530
# Tensor combination
531-
exir_ops.edge.aten.cat.default,
532531
exir_ops.edge.aten.split_with_sizes_copy.default,
533532
exir_ops.edge.aten.split.Tensor,
534533
exir_ops.edge.aten.repeat.default,
@@ -562,6 +561,8 @@ def register_ported_op(features: OpFeatures):
562561
exir_ops.edge.aten.squeeze_copy.dims,
563562
exir_ops.edge.aten.unsqueeze_copy.default,
564563
exir_ops.edge.aten.permute_copy.default,
564+
# Tensor combination
565+
exir_ops.edge.aten.cat.default,
565566
]
566567
)
567568
def register_ported_op_all_packed_dims(features: OpFeatures):

‎backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
1919

2020
layout(push_constant) uniform restrict Block {
2121
ivec3 range;
22-
ivec3 src_offset;
23-
ivec3 dst_offset;
22+
// xyz is source offset w is channel size
23+
ivec4 src_offset;
24+
// xyz is destination offset w is channel size
25+
ivec4 dst_offset;
2426
};
2527

2628
#include "indexing_utils.h"
@@ -36,13 +38,20 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
3638
void main() {
3739
const ivec3 pos = ivec3(gl_GlobalInvocationID);
3840

39-
const ivec3 out_pos = pos + dst_offset;
40-
const ivec3 in_pos = pos + src_offset;
41-
4241
if (any(greaterThanEqual(pos, range))) {
4342
return;
4443
}
4544

45+
const ivec3 in_pos = pos + src_offset.xyz;
46+
ivec3 out_pos = pos + dst_offset.xyz;
47+
48+
// If source channel size is specified compose output z based on channel and batch index
49+
if (src_offset.w > 0) {
50+
const int channel_index = in_pos.z % src_offset.w;
51+
const int batch_index = in_pos.z / src_offset.w;
52+
out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w;
53+
}
54+
4655
write_texel_lpos(
4756
t_out,
4857
out_pos,
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
15+
layout(std430) buffer;
16+
17+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
18+
${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)}
19+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
20+
21+
layout(push_constant) uniform restrict Block {
22+
ivec4 range;
23+
// xyz is source offset w is channel size
24+
ivec4 src_offset;
25+
// xyz is destination offset w is channel size
26+
ivec4 dst_offset;
27+
};
28+
29+
#include "indexing_utils.h"
30+
31+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
32+
33+
${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
34+
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
35+
const lowp int packed_dim = unhash_packed_dim(out_layout);
36+
37+
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
38+
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
39+
40+
void main() {
41+
const ivec3 pos = ivec3(gl_GlobalInvocationID);
42+
43+
if (any(greaterThanEqual(pos, range.xyz))) {
44+
return;
45+
}
46+
47+
// Starting offset to write at within a texel
48+
const int out_lane_offset = dst_offset[packed_dim] & 0x3;
49+
const bool has_lane_offset = out_lane_offset != 0;
50+
51+
// Position in input tensor
52+
const ivec3 in_pos = pos + src_offset.xyz;
53+
54+
// Read input value mapping to this output texel
55+
const VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map);
56+
57+
ivec3 out_pos = pos + dst_offset.xyz;
58+
out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2);
59+
60+
VEC4_T out_value;
61+
62+
// If the lane offset is non-zero, i.e. the packed texel is composed from multiple sources
63+
if (has_lane_offset) {
64+
// When position in packed dim is > 0
65+
if (pos[packed_dim] > 0) {
66+
// Boundary values will come from previous input texel in the packed dim.
67+
ivec3 prev_in_pos = in_pos;
68+
prev_in_pos[packed_dim] = in_pos[packed_dim] - 1;
69+
VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map);
70+
71+
// Shift values toward the beginning based on out_lane_offset
72+
// offset 1 means the last lane from the previous texel is a part of the output texel
73+
// offset 2 means last 2 lanes and so on
74+
if (out_lane_offset == 1) {
75+
out_value.x = prev_value.w;
76+
} else if (out_lane_offset == 2) {
77+
out_value.xy = prev_value.zw;
78+
} else {
79+
out_value.xyz = prev_value.yzw;
80+
}
81+
} else {
82+
// When position in packed dim is == 0
83+
// Boundary values will be the previous texel values.
84+
out_value = load_texel_lpos(existing_out, out_pos, out_axis_map);
85+
}
86+
87+
// Copy input values towards the end of the output texel, based on lane offset:
88+
// offset 1 means lanes yzw of the output take the first 3 lanes of the input,
89+
// offset 2 means lanes zw take the first 2 input lanes, and so on.
90+
if (out_lane_offset == 1) {
91+
out_value.yzw = in_value.xyz;
92+
} else if (out_lane_offset == 2) {
93+
out_value.zw = in_value.xy;
94+
} else {
95+
out_value.w = in_value.x;
96+
}
97+
} else {
98+
out_value = in_value;
99+
}
100+
101+
write_texel_lpos(
102+
t_out,
103+
out_pos,
104+
out_value,
105+
out_axis_map);
106+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
copy_packed_dim_offset:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
STORAGE: texture3d
6+
generate_variant_forall:
7+
DTYPE:
8+
- VALUE: half
9+
- VALUE: float
10+
- VALUE: int
11+
shader_variants:
12+
- NAME: copy_packed_dim_offset

‎backends/vulkan/runtime/graph/ops/impl/Cat.cpp

Lines changed: 50 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -22,65 +22,68 @@ void add_cat_default_node(
2222
ValueRef dim_ref,
2323
ValueRef out) {
2424
ValueListPtr input_list = graph.get_value_list(in_list_ref);
25-
26-
for (ValueRef input_ref : *input_list) {
27-
vTensorPtr t_in = graph.get_tensor(input_ref);
28-
VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
29-
}
30-
3125
int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
3226
vTensorPtr t_out = graph.get_tensor(out);
3327

28+
const auto packed_dim = t_out->packed_dim();
29+
const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);
30+
3431
DimIndex dim_index = normalize_to_dim_index(*t_out, dim);
32+
// Index of dimension to be concatenated in (w, h, c * b) coordinate system
33+
const auto dim_xyz_index = std::min(2, -dim_index - 1);
3534

36-
// TODO: Find ways to factor out the similar code for width, height, and batch
37-
if (dim_index == kWidth4D) {
38-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
39-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
35+
if (dim_index > kWidth4D || dim_index < kBatch4D) {
36+
VK_THROW("Unexpected value of dim_index=", dim_index);
37+
}
4038

41-
for (ValueRef input_ref : *input_list) {
42-
vTensorPtr t_in = graph.get_tensor(input_ref);
43-
utils::ivec3 range = t_in->logical_limits();
44-
add_copy_offset_node(
45-
graph, input_ref, range, src_offset, dst_offset, out);
46-
dst_offset[0] += range[0];
47-
}
39+
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
40+
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
4841

49-
} else if (dim_index == kHeight4D) {
50-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
51-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
42+
const bool is_concat_channel = (dim_index == kChannel4D);
5243

53-
for (ValueRef input_ref : *input_list) {
54-
vTensorPtr t_in = graph.get_tensor(input_ref);
55-
utils::ivec3 range = t_in->logical_limits();
56-
add_copy_offset_node(
57-
graph, input_ref, range, src_offset, dst_offset, out);
58-
dst_offset[1] += range[1];
59-
}
60-
} else if (dim_index == kBatch4D) {
61-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
62-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
44+
// if concatenating channels
45+
if (is_concat_channel) {
46+
// set destination offset w as channel size of the output tensor
47+
dst_offset[3] = dim_at(t_out->sizes(), kChannel4D);
48+
}
6349

64-
for (ValueRef input_ref : *input_list) {
65-
vTensorPtr t_in = graph.get_tensor(input_ref);
66-
utils::ivec3 range = t_in->logical_limits();
50+
for (ValueRef input_ref : *input_list) {
51+
const vTensorPtr t_in = graph.get_tensor(input_ref);
52+
const utils::ivec3 range = t_in->logical_limits();
53+
const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D);
54+
// if concatenating same dimension as the packed dimension
55+
if (dim_index == packed_dim_index) {
56+
// if concatenating channels, use add_copy_channel_offset_node function as
57+
// add_copy_packed_dim_offset_node does not support channel packing
58+
if (is_concat_channel) {
59+
add_copy_channel_offset_node(
60+
graph,
61+
input_ref,
62+
in_channel_size,
63+
src_offset[2],
64+
dst_offset[2],
65+
out);
66+
dst_offset[dim_xyz_index] += in_channel_size;
67+
} else {
68+
// src_offset[3] is not used now but will be used in the future when
69+
// add_copy_packed_dim_offset_node will support channel packing
70+
//
71+
// set source offset w as channel size of the output tensor if
72+
// concatenating channels
73+
src_offset[3] = is_concat_channel ? in_channel_size : 0;
74+
add_copy_packed_dim_offset_node(
75+
graph, input_ref, range, src_offset, dst_offset, out);
76+
dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index);
77+
}
78+
} else {
79+
// set source offset w as channel size of the output tensor if
80+
// concatenating channels
81+
src_offset[3] = is_concat_channel ? in_channel_size : 0;
6782
add_copy_offset_node(
6883
graph, input_ref, range, src_offset, dst_offset, out);
69-
dst_offset[2] += range[2];
84+
dst_offset[dim_xyz_index] +=
85+
is_concat_channel ? in_channel_size : range[dim_xyz_index];
7086
}
71-
} else if (dim_index == kChannel4D) {
72-
int32_t src_offset = 0;
73-
int32_t dst_offset = 0;
74-
75-
for (ValueRef input_ref : *input_list) {
76-
vTensorPtr t_in = graph.get_tensor(input_ref);
77-
int32_t range = dim_at(t_in->sizes(), kChannel4D);
78-
add_copy_channel_offset_node(
79-
graph, input_ref, range, src_offset, dst_offset, out);
80-
dst_offset += range;
81-
}
82-
} else {
83-
VK_THROW("Unexpected value of dim_index=", dim_index);
8487
}
8588
}
8689

‎backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 83 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,15 @@
1616
namespace vkcompute {
1717

1818
using utils::ivec3;
19+
using utils::ivec4;
1920
using utils::uvec3;
2021

2122
void add_copy_offset_node(
2223
ComputeGraph& graph,
2324
const ValueRef in,
2425
const ivec3& range,
25-
const ivec3& src_offset,
26-
const ivec3& dst_offset,
26+
const ivec4& src_offset,
27+
const ivec4& dst_offset,
2728
const ValueRef out) {
2829
vTensorPtr t_in = graph.get_tensor(in);
2930
vTensorPtr t_out = graph.get_tensor(out);
@@ -52,11 +53,81 @@ void add_copy_offset_node(
5253
nullptr,
5354
{},
5455
{
55-
PushConstantDataInfo(&range, sizeof(range), sizeof(utils::ivec4)),
56-
PushConstantDataInfo(
57-
&src_offset, sizeof(src_offset), sizeof(utils::ivec4)),
56+
PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)),
57+
PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)),
58+
PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)),
59+
}));
60+
}
61+
62+
void add_copy_packed_dim_offset_node(
63+
ComputeGraph& graph,
64+
const ValueRef in,
65+
const ivec3& range,
66+
const ivec4& src_offset,
67+
const ivec4& dst_offset,
68+
const ValueRef out) {
69+
vTensorPtr t_in = graph.get_tensor(in);
70+
vTensorPtr t_out = graph.get_tensor(out);
71+
72+
// Check the packed dimension is same for both tensors, and if the packed
73+
// dimension is Width or Height, since the function does not support channel
74+
// packing.
75+
VK_CHECK_COND(
76+
check_same_packed_dim(*t_in, *t_out) &&
77+
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
78+
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
79+
80+
std::string kernel_name = "copy_packed_dim_offset";
81+
kernel_name.reserve(kShaderNameReserve);
82+
add_dtype_suffix(kernel_name, *t_out);
83+
84+
const auto packed_dim = t_in->packed_dim();
85+
// A copy of range with the last element set to batch size of the input tensor
86+
ivec4 final_range = {
87+
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
88+
ivec3 global_wg_size = t_out->logical_limits();
89+
// The starting offset in a texel where this tensor will start copying to
90+
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
91+
// The total packed texels this tensor will be copied to
92+
// The first texel of tensor data in the packed dimension will be copied into
93+
// the remaining lanes from the previous write. Hence (4 - dst_lane_offset) is
94+
// added to the tensor size in the packed dimension.
95+
const auto dst_packed_size = utils::div_up_4(
96+
(4 - dst_lane_offset) +
97+
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
98+
99+
// If the starting offset is not 0, and the total packed texels is greater
100+
// than the source texel range
101+
if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) {
102+
global_wg_size[packed_dim]++; // Increase the global work group size in
103+
// packed dimension
104+
final_range[packed_dim]++; // Increase the range in packed dimension
105+
}
106+
107+
auto shader = VK_KERNEL_FROM_STR(kernel_name);
108+
109+
graph.execute_nodes().emplace_back(new DispatchNode(
110+
graph,
111+
VK_KERNEL_FROM_STR(kernel_name),
112+
global_wg_size,
113+
graph.create_local_wg_size(global_wg_size),
114+
// Inputs and Outputs
115+
{
116+
{out, vkapi::MemoryAccessType::WRITE},
117+
{out, vkapi::MemoryAccessType::READ},
118+
{in, vkapi::MemoryAccessType::READ},
119+
},
120+
// Parameter buffers
121+
{},
122+
// Specialization Constants
123+
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
124+
nullptr,
125+
{},
126+
{
58127
PushConstantDataInfo(
59-
&dst_offset, sizeof(dst_offset), sizeof(utils::ivec4)),
128+
&final_range, sizeof(final_range), sizeof(ivec4)),
129+
PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)),
130+
PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)),
60131
}));
61132
}
62133

@@ -140,7 +211,7 @@ void add_copy_channel_offset_node(
140211
static_cast<int>(global_size[2]),
141212
channel_range};
142213

143-
const utils::ivec4 offset_params = {
214+
const ivec4 offset_params = {
144215
dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset};
145216

146217
auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -179,8 +250,11 @@ void add_copy_offset_node(
179250
ValueRef dst_offset_ref,
180251
ValueRef out) {
181252
ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref));
182-
ivec3 src_offset = utils::make_ivec3(*graph.get_int_list(src_offset_ref));
183-
ivec3 dst_offset = utils::make_ivec3(*graph.get_int_list(dst_offset_ref));
253+
ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref));
254+
ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref));
255+
256+
ivec4 src_offset = {src[0], src[1], src[2], 0};
257+
ivec4 dst_offset = {dst[0], dst[1], dst[2], 0};
184258

185259
add_copy_offset_node(graph, in, range, src_offset, dst_offset, out);
186260
}

‎backends/vulkan/runtime/graph/ops/impl/Copy.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ namespace vkcompute {
1717
// add_copy_offset_node resembles the vkCmdCopyImage command. It copies the
1818
// texture extents specified by the range, src_offset, and dst_offset (all are
1919
// in texture coordinate (x, y, z) from the input image to the output image.
20+
// src_offset.w and dst_offset.w may contain channel size information.
2021
//
2122
// It is possible to have input and output to point to the same image
2223
// object. But when the source range and destination range overlap, the behavior
@@ -25,8 +26,24 @@ void add_copy_offset_node(
2526
ComputeGraph& graph,
2627
const ValueRef in,
2728
const utils::ivec3& range,
28-
const utils::ivec3& src_offset,
29-
const utils::ivec3& dst_offset,
29+
const utils::ivec4& src_offset,
30+
const utils::ivec4& dst_offset,
31+
const ValueRef out);
32+
33+
// add_copy_packed_dim_offset_node behaves similar to add_copy_node, except that
34+
// it is used when copying along the packed dimension of a width- or height-packed tensor.
35+
// src_offset.w and dst_offset.w may contain channel size information.
36+
//
37+
// It copies the texture extents specified by the range, src_offset, and
38+
// dst_offset (all are in texture coordinate (x, y, z) from the input image to
39+
// the output image.
40+
//
41+
void add_copy_packed_dim_offset_node(
42+
ComputeGraph& graph,
43+
const ValueRef in,
44+
const utils::ivec3& range,
45+
const utils::ivec4& src_offset,
46+
const utils::ivec4& dst_offset,
3047
const ValueRef out);
3148

3249
// add_copy_channel_offset_node behaves similar to add_copy_node, except that it

‎backends/vulkan/runtime/graph/ops/impl/Repeat.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,8 @@ void add_repeat_node(
148148
if (int64_t channel_repeat = dim_at<kChannel4D>(repeats);
149149
channel_repeat == 1) {
150150
// If no repeat, short-cut to a direct copy
151-
utils::ivec3 src_offset{0, 0, 0};
152-
utils::ivec3 dst_offset{0, 0, 0};
151+
utils::ivec4 src_offset{0, 0, 0, 0};
152+
utils::ivec4 dst_offset{0, 0, 0, 0};
153153

154154
add_copy_offset_node(graph, in, running_range, src_offset, dst_offset, out);
155155

@@ -160,10 +160,10 @@ void add_repeat_node(
160160
// TODO: refactor width, height, and batch into a common helper function.
161161
// Width
162162
if (int64_t width_repeat = dim_at<kWidth4D>(repeats); width_repeat > 1) {
163-
utils::ivec3 src_offset{0, 0, 0};
163+
utils::ivec4 src_offset{0, 0, 0, 0};
164164

165165
for (int i = 1; i < width_repeat; ++i) {
166-
utils::ivec3 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0};
166+
utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0};
167167

168168
add_copy_offset_node(
169169
graph, out, running_range, src_offset, dst_offset, out);
@@ -174,10 +174,10 @@ void add_repeat_node(
174174

175175
// Height
176176
if (int64_t height_repeat = dim_at<kHeight4D>(repeats); height_repeat > 1) {
177-
utils::ivec3 src_offset{0, 0, 0};
177+
utils::ivec4 src_offset{0, 0, 0, 0};
178178

179179
for (int i = 1; i < height_repeat; ++i) {
180-
utils::ivec3 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0};
180+
utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0};
181181

182182
add_copy_offset_node(
183183
graph, out, running_range, src_offset, dst_offset, out);
@@ -188,10 +188,10 @@ void add_repeat_node(
188188

189189
// Batch
190190
if (int64_t batch_repeat = dim_at<kBatch4D>(repeats); batch_repeat > 1) {
191-
utils::ivec3 src_offset{0, 0, 0};
191+
utils::ivec4 src_offset{0, 0, 0, 0};
192192

193193
for (int i = 1; i < batch_repeat; ++i) {
194-
utils::ivec3 dst_offset = {0, 0, i * running_range[2]};
194+
utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0};
195195

196196
add_copy_offset_node(
197197
graph, out, running_range, src_offset, dst_offset, out);

‎backends/vulkan/runtime/graph/ops/impl/Split.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ void add_split_with_sizes_default_node(
4343
}
4444

4545
if (dim_index == kWidth4D) {
46-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
47-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
46+
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
47+
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
4848

4949
for (ValueRef out_ref : *out_list) {
5050
// Doesn't need to use split_size since we have already verified that the
@@ -56,8 +56,8 @@ void add_split_with_sizes_default_node(
5656
src_offset[0] += range[0];
5757
}
5858
} else if (dim_index == kHeight4D) {
59-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
60-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
59+
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
60+
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
6161

6262
for (ValueRef out_ref : *out_list) {
6363
vTensorPtr t_out = graph.get_tensor(out_ref);
@@ -67,8 +67,8 @@ void add_split_with_sizes_default_node(
6767
src_offset[1] += range[1];
6868
}
6969
} else if (dim_index == kBatch4D) {
70-
utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false);
71-
utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false);
70+
utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
71+
utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
7272

7373
for (ValueRef out_ref : *out_list) {
7474
vTensorPtr t_out = graph.get_tensor(out_ref);

‎backends/vulkan/test/op_tests/cases.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,8 +850,11 @@ def get_cat_inputs():
850850
test_suite = VkTestSuite(
851851
[
852852
# Cat on Height
853+
([(M, M, 3, 5), (M, M, 0, 5)], 2),
853854
([(S1, S1, 3, 5), (S1, S1, 0, 5)], 2),
855+
([(M, M, 3, 5), (M, M, 4, 5)], 2),
854856
([(S1, S1, 3, 5), (S1, S1, 4, 5)], 2),
857+
([(M2, 3, 5), (M2, 4, 5)], 1),
855858
([(S1, 3, 5), (S1, 4, 5)], 1),
856859
([(3, 5), (4, 5)], 0),
857860
([(3, 5), (4, 5), (1, 5)], 0),
@@ -860,7 +863,9 @@ def get_cat_inputs():
860863
0,
861864
),
862865
# Cat on Width
866+
([(M, M, 5, 3), (M, M, 5, 4)], 3),
863867
([(S1, S1, 5, 3), (S1, S1, 5, 4)], 3),
868+
([(M, 5, 3), (M, 5, 4)], 2),
864869
([(S1, 5, 3), (S1, 5, 4)], 2),
865870
([(5, 0), (5, 4)], 1),
866871
([(5, 3), (5, 4)], 1),
@@ -871,7 +876,9 @@ def get_cat_inputs():
871876
),
872877
([(5,), (6,)], 0),
873878
# Cat on Batch
879+
([(M, S1, 5, 4), (M1, S1, 5, 4)], 0),
874880
([(S, S1, 5, 4), (S1, S1, 5, 4)], 0),
881+
([(S, M, 5, 4), (S1, M, 5, 4)], 0),
875882
([(S, XS, 5, 4), (S1, XS, 5, 4)], 0),
876883
([(S, S2, 5, 4), (S1, S2, 5, 4)], 0),
877884
(
@@ -883,7 +890,9 @@ def get_cat_inputs():
883890
0,
884891
),
885892
# Cat on Channel
893+
([(M, 5, 4), (0, 5, 4), (M1, 5, 4)], 0),
886894
([(S, 5, 4), (0, 5, 4), (S2, 5, 4)], 0),
895+
([(M, 5, 4), (M1, 5, 4), (M2, 5, 4)], 0),
887896
([(S, 5, 4), (S1, 5, 4), (S2, 5, 4)], 0),
888897
([(XS, 5, 4), (XS, 5, 4), (S2, 5, 4)], 0),
889898
([(XS, S, 5, 4), (XS, S1, 5, 4), (XS, S2, 5, 4)], 1),
@@ -899,6 +908,8 @@ def get_cat_inputs():
899908
]
900909
)
901910
test_suite.layouts = [
911+
"utils::kWidthPacked",
912+
"utils::kHeightPacked",
902913
"utils::kChannelsPacked",
903914
]
904915
test_suite.data_gen = "make_seq_tensor"

0 commit comments

Comments
 (0)