[ET-VK] Use a shared-memory array to store the calculated output positions, freeing up registers and improving performance.

trivedivivek · web-flow · commit eef8cb2e79dd · 2025-01-03T14:46:34.000-08:00
Differential Revision: D67742567
Pull Request resolved: #7475
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -34,13 +34,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
 
+// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
+shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
  * output tile for pointwise convolution is more efficient because the kernel
  * size is only 1x1, making it easier to re-use loaded texels from t_kernel.
  */
 void main() {
   const uint16_t out_limits_y_scaled = uint16_t((out_limits.y + TILE_SIZE - 1) / TILE_SIZE);
+  const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
   const u16vec3 gpos = u16vec3(
     gl_GlobalInvocationID.x / (out_limits_y_scaled * out_limits.z),
@@ -58,6 +62,7 @@ void main() {
     for (int x = 0; x < TILE_SIZE; ++x) {
       pos[i] = u16vec2(
           gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
+      pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
       i++;
     }
   }
@@ -73,7 +78,7 @@ void main() {
   // the top-left element is in a region added by padding.
   u16vec2 ipos[TILE_SIZE * TILE_SIZE];
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    ipos[i] = pos[i].xy * u16vec2(stride) - u16vec2(padding);
+    ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);
   }
 
   vec4 sum[TILE_SIZE * TILE_SIZE];
@@ -138,8 +143,9 @@ void main() {
   }
 
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    if (all(lessThan(u16vec3(pos[i], gpos.z), out_limits))) {
-      imageStore(t_out, u16vec3(pos[i], gpos.z), op(sum[i], out_min, out_max));
+    const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
+    if (all(lessThan(u16vec3(pos, gpos.z), out_limits))) {
+      imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));
     }
   }
 }

Original file line number	Diff line number	Diff line change
`@@ -34,13 +34,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;`
`34`	`34`
`35`	`35`	`#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require`
`36`	`36`
	`37`	`+// shared memory to hold calculated positions, this would reduce register usage thus improving performance.`
	`38`	`+shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];`
	`39`	`+`
`37`	`40`	`/*`
`38`	`41`	`* Computes a 2D pointwise convolution of an NxN output tile. Calculating an`
`39`	`42`	`* output tile for pointwise convolution is more efficient because the kernel`
`40`	`43`	`* size is only 1x1, making it easier to re-use loaded texels from t_kernel.`
`41`	`44`	`*/`
`42`	`45`	`void main() {`
`43`	`46`	`const uint16_t out_limits_y_scaled = uint16_t((out_limits.y + TILE_SIZE - 1) / TILE_SIZE);`
	`47`	`+ const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;`
`44`	`48`
`45`	`49`	`const u16vec3 gpos = u16vec3(`
`46`	`50`	`gl_GlobalInvocationID.x / (out_limits_y_scaled * out_limits.z),`
`@@ -58,6 +62,7 @@ void main() {`
`58`	`62`	`for (int x = 0; x < TILE_SIZE; ++x) {`
`59`	`63`	`pos[i] = u16vec2(`
`60`	`64`	`gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);`
	`65`	`+ pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];`
`61`	`66`	`i++;`
`62`	`67`	`}`
`63`	`68`	`}`
`@@ -73,7 +78,7 @@ void main() {`
`73`	`78`	`// the top-left element is in a region added by padding.`
`74`	`79`	`u16vec2 ipos[TILE_SIZE * TILE_SIZE];`
`75`	`80`	`for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {`
`76`		`- ipos[i] = pos[i].xy * u16vec2(stride) - u16vec2(padding);`
	`81`	`+ ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);`
`77`	`82`	`}`
`78`	`83`
`79`	`84`	`vec4 sum[TILE_SIZE * TILE_SIZE];`
`@@ -138,8 +143,9 @@ void main() {`
`138`	`143`	`}`
`139`	`144`
`140`	`145`	`for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {`
`141`		`- if (all(lessThan(u16vec3(pos[i], gpos.z), out_limits))) {`
`142`		`- imageStore(t_out, u16vec3(pos[i], gpos.z), op(sum[i], out_min, out_max));`
	`146`	`+ const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];`
	`147`	`+ if (all(lessThan(u16vec3(pos, gpos.z), out_limits))) {`
	`148`	`+ imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));`
`143`	`149`	`}`
`144`	`150`	`}`
`145`	`151`	`}`