@@ -25,7 +25,9 @@ void add_copy_offset_node(
25
25
const ivec3& range,
26
26
const ivec4& src_offset,
27
27
const ivec4& dst_offset,
28
- const ValueRef out) {
28
+ const ValueRef out,
29
+ bool calc_out_pos_using_src_chnl,
30
+ bool calc_in_pos_using_dst_chnl) {
29
31
vTensorPtr t_in = graph.get_tensor (in);
30
32
vTensorPtr t_out = graph.get_tensor (out);
31
33
@@ -49,7 +51,11 @@ void add_copy_offset_node(
49
51
// Parameter buffers
50
52
{},
51
53
// Specialization Constants
52
- {graph.hashed_layout_of (out), graph.hashed_layout_of (in)},
54
+ {graph.hashed_layout_of (out),
55
+ graph.hashed_layout_of (in),
56
+ (calc_out_pos_using_src_chnl ? 1
57
+ : calc_in_pos_using_dst_chnl ? 2
58
+ : 0 )},
53
59
nullptr ,
54
60
{},
55
61
{
@@ -86,19 +92,37 @@ void add_copy_packed_dim_offset_node(
86
92
ivec4 final_range = {
87
93
range[0 ], range[1 ], range[2 ], dim_at (t_in->sizes (), kBatch4D )};
88
94
ivec3 global_wg_size = t_out->logical_limits ();
95
+ // The starting offset in a texel where this tensor will start copying from
96
+ const auto src_lane_offset = src_offset[packed_dim] & 0x3 ;
89
97
// The starting offset in a texel where this tensor will start copying to
90
98
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3 ;
99
+
100
+ // The total packed texels this tensor will be copied from.
101
+ // The first texel of tensor data in packed dimension will be copied from
102
+ // remaining lanes from current source. Hence (4 - src_lane_offset) is added
103
+ // to tensor size in packed dimension
104
+ const auto src_packed_size = utils::div_up_4 (
105
+ (4 - src_lane_offset) +
106
+ dim_at (t_out->sizes (), normalize_to_dim_index (*t_out, packed_dim)));
107
+
91
108
// The total packed texels this tensor will be copied to
92
- // The first texel of tensor data in packed dimension will be copied to remain
93
- // lanes from previous write Hence (4 - dst_lane_offset) is added to tensor
94
- // size in packed dimension
109
+ // The first texel of tensor data in packed dimension will be copied to
110
+ // remaining lanes from previous write. Hence (4 - dst_lane_offset) is added to
111
+ // tensor size in packed dimension
95
112
const auto dst_packed_size = utils::div_up_4 (
96
113
(4 - dst_lane_offset) +
97
114
dim_at (t_in->sizes (), normalize_to_dim_index (*t_in, packed_dim)));
98
115
99
- // If the starting offset is not 0, and the total packed texels is greater
116
+ // If the starting src offset is not 0, and the total packed texels is greater
117
+ // than the source texel range
118
+ const bool has_additional_src_work =
119
+ src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
120
+ // If the starting dst offset is not 0, and the total packed texels is greater
100
121
// than the source texel range
101
- if (dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]) {
122
+ const bool has_additional_dst_work =
123
+ dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
124
+
125
+ if (has_additional_src_work || has_additional_dst_work) {
102
126
global_wg_size[packed_dim]++; // Increase the global work group size in
103
127
// packed dimension
104
128
final_range[packed_dim]++; // Increase the range in packed dimension
@@ -256,7 +280,8 @@ void add_copy_offset_node(
256
280
ivec4 src_offset = {src[0 ], src[1 ], src[2 ], 0 };
257
281
ivec4 dst_offset = {dst[0 ], dst[1 ], dst[2 ], 0 };
258
282
259
- add_copy_offset_node (graph, in, range, src_offset, dst_offset, out);
283
+ add_copy_offset_node (
284
+ graph, in, range, src_offset, dst_offset, out, false , false );
260
285
}
261
286
262
287
void copy_offset (ComputeGraph& graph, const std::vector<ValueRef>& args) {
0 commit comments