|
25 | 25 |
|
26 | 26 | #include "openvml_kernel.h"
|
27 | 27 |
|
| 28 | +#include <immintrin.h> |
| 29 | + |
| 30 | +void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) { |
| 31 | + VMLLONG loop_count=(COMPSIZE*n) >> 5; |
| 32 | + VMLLONG remain_count=(COMPSIZE*n) & 0x1f; |
| 33 | + |
| 34 | + int i=0; |
| 35 | + |
| 36 | + while(loop_count>0){ |
| 37 | + |
| 38 | + __m256d av0=_mm256_loadu_pd(a); |
| 39 | + __m256d av1=_mm256_loadu_pd(a+4); |
| 40 | + __m256d av2=_mm256_loadu_pd(a+8); |
| 41 | + __m256d av3=_mm256_loadu_pd(a+12); |
| 42 | + |
| 43 | + __m256d av4=_mm256_loadu_pd(a+16); |
| 44 | + __m256d av5=_mm256_loadu_pd(a+20); |
| 45 | + __m256d av6=_mm256_loadu_pd(a+24); |
| 46 | + __m256d av7=_mm256_loadu_pd(a+28); |
| 47 | + |
| 48 | + |
| 49 | + __m256d bv0=_mm256_loadu_pd(b); |
| 50 | + __m256d bv1=_mm256_loadu_pd(b+4); |
| 51 | + __m256d bv2=_mm256_loadu_pd(b+8); |
| 52 | + __m256d bv3=_mm256_loadu_pd(b+12); |
| 53 | + |
| 54 | + __m256d bv4=_mm256_loadu_pd(b+16); |
| 55 | + __m256d bv5=_mm256_loadu_pd(b+20); |
| 56 | + __m256d bv6=_mm256_loadu_pd(b+24); |
| 57 | + __m256d bv7=_mm256_loadu_pd(b+28); |
| 58 | + |
| 59 | + |
| 60 | + |
| 61 | + |
| 62 | + __m256d yv0=_mm256_add_pd(av0, bv0); |
| 63 | + __m256d yv1=_mm256_add_pd(av1, bv1); |
| 64 | + __m256d yv2=_mm256_add_pd(av2, bv2); |
| 65 | + __m256d yv3=_mm256_add_pd(av3, bv3); |
| 66 | + |
| 67 | + __m256d yv4=_mm256_add_pd(av4, bv4); |
| 68 | + __m256d yv5=_mm256_add_pd(av5, bv5); |
| 69 | + __m256d yv6=_mm256_add_pd(av6, bv6); |
| 70 | + __m256d yv7=_mm256_add_pd(av7, bv7); |
| 71 | + |
| 72 | + _mm256_storeu_pd(y, yv0); |
| 73 | + _mm256_storeu_pd(y+4, yv1); |
| 74 | + _mm256_storeu_pd(y+8, yv2); |
| 75 | + _mm256_storeu_pd(y+12, yv3); |
| 76 | + |
| 77 | + _mm256_storeu_pd(y+16, yv4); |
| 78 | + _mm256_storeu_pd(y+20, yv5); |
| 79 | + _mm256_storeu_pd(y+24, yv6); |
| 80 | + _mm256_storeu_pd(y+28, yv7); |
| 81 | + |
| 82 | + a+=32; |
| 83 | + b+=32; |
| 84 | + y+=32; |
| 85 | + loop_count--; |
| 86 | + } |
| 87 | + |
| 88 | + for(i=0; i<remain_count; i++){ |
| 89 | + y[i]=a[i]+b[i]; |
| 90 | + } |
| 91 | +} |
| 92 | + |
| 93 | +#if 0 |
28 | 94 | void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
|
29 | 95 |
|
30 | 96 | //unroll 32
|
@@ -114,3 +180,4 @@ void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLO
|
114 | 180 | "memory"
|
115 | 181 | );
|
116 | 182 | }
|
| 183 | +#endif |
0 commit comments