Fix unannounced modification of input operand 8 (lda4) in Haswell GEMVN microkernel #2019

Merged (2 commits) on Feb 15, 2019
125 changes: 62 additions & 63 deletions kernel/x86_64/sgemv_n_microk_haswell-4.c
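The bug being fixed: the inline assembly advances lda4 ("addq $4, %8", "addq $8, %8", "addq $16, %8") even though lda4 was declared only as a plain input operand ("r"), and GCC is allowed to assume that input operands leave the asm unchanged. The patch promotes lda4 to a read-write operand ("+r") at position %2 and renumbers x, y and the four ap pointers to %3 through %8. Below is a minimal stand-alone sketch of the same constraint pattern; the function name and values are hypothetical and only illustrate the "+r" rule, they are not OpenBLAS code.

#include <stdio.h>

/* Hypothetical example, not OpenBLAS code: every value the asm body
 * modifies must appear in the output list as read-write ("+r").
 * Listing it only as an input ("r") lets the compiler assume the
 * register still holds its original value afterwards. */
static long walk(long base, long count)
{
    long i = 0;
    __asm__ __volatile__
    (
        "1:              \n\t"
        "addq $4, %0     \n\t"   /* i    += 4, declared "+r" below      */
        "addq $4, %2     \n\t"   /* base += 4, so base must be "+r" too */
        "subq $4, %1     \n\t"
        "jnz  1b         \n\t"
        : "+r" (i), "+r" (count), "+r" (base)   /* all modified operands */
        :                                        /* no read-only inputs  */
        : "cc"
    );
    return base + i;
}

int main(void)
{
    printf("%ld\n", walk(100, 16));   /* four iterations: prints 132 */
    return 0;
}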
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

@@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7

"vbroadcastss (%9), %%ymm6 \n\t" // alpha

"testq $0x04, %1 \n\t"
"jz 2f \n\t"

"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"

"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"

"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"

"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"

"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y

"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"

@@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t"
"jz 3f \n\t"

"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"

"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"

"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"

"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"


"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y

"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"

@@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y

"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"

"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y

"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"

"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"

"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"

"addq $16, %8 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
"addq $16, %2 \n\t"
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y

"jnz 1b \n\t"

@@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
@@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
}



#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

@@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

"vbroadcastss (%8), %%ymm6 \n\t" // alpha


"testq $0x04, %1 \n\t"
"jz 2f \n\t"

Expand Down