@@ -34,13 +34,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
34
34
35
35
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
36
36
37
+ // Shared memory that holds each invocation's calculated tile positions; storing them here reduces register usage, which should improve performance.
38
+ shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
39
+
37
40
/*
38
41
* Computes a 2D pointwise convolution of an NxN output tile. Calculating an
39
42
* output tile for pointwise convolution is more efficient because the kernel
40
43
* size is only 1x1, making it easier to re-use loaded texels from t_kernel.
41
44
*/
42
45
void main() {
43
46
const uint16_t out_limits_y_scaled = uint16_t((out_limits.y + TILE_SIZE - 1 ) / TILE_SIZE);
47
+ const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
44
48
45
49
const u16vec3 gpos = u16vec3(
46
50
gl_GlobalInvocationID.x / (out_limits_y_scaled * out_limits.z),
@@ -58,6 +62,7 @@ void main() {
58
62
for (int x = 0 ; x < TILE_SIZE; ++ x) {
59
63
pos[i] = u16vec2(
60
64
gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
65
+ pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
61
66
i++ ;
62
67
}
63
68
}
@@ -73,7 +78,7 @@ void main() {
73
78
// the top-left element is in a region added by padding.
74
79
u16vec2 ipos[TILE_SIZE * TILE_SIZE];
75
80
for (int i = 0 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
76
- ipos[i] = pos[i].xy * u16vec2(stride) - u16vec2(padding);
81
+ ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);
77
82
}
78
83
79
84
vec4 sum[TILE_SIZE * TILE_SIZE];
@@ -138,8 +143,9 @@ void main() {
138
143
}
139
144
140
145
for (int i = 0 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
141
- if (all (lessThan (u16vec3(pos[i], gpos.z), out_limits))) {
142
- imageStore(t_out, u16vec3(pos[i], gpos.z), op(sum[i], out_min, out_max));
146
+ const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
147
+ if (all (lessThan (u16vec3(pos, gpos.z), out_limits))) {
148
+ imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));
143
149
}
144
150
}
145
151
}
0 commit comments