Skip to content

Commit 2ed622c

Browse files
committed
Use intrinsic for x86_64 kernels. Enable detecting the CPU core on Windows.
1 parent 66df69e commit 2ed622c

File tree

8 files changed

+237
-3
lines changed

8 files changed

+237
-3
lines changed

cmake/auto_detect_cpu.cmake

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ try_run(cpu_detect_result cpu_detect_compile_result
2626
RUN_OUTPUT_VARIABLE cpu_detect_output
2727
COMPILE_OUTPUT_VARIABLE cpu_detect_compile_output
2828
)
29-
if(MSVC)
30-
set(cpu_detect_output "generic")
31-
endif()
29+
#if(MSVC)
30+
# set(cpu_detect_output "generic")
31+
#endif()
3232
endif()
3333

3434
if(cpu_detect_compile_result)

kernel/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,4 @@ endforeach(KERNEL)
106106

107107
add_library(openvml_kernel_${OpenVML_ARCH}_${OpenVML_CPU_CORENAME}_core OBJECT ${OpenVML_LIBSRC_S} ${OpenVML_LIBSRC_D} ${OpenVML_LIBSRC_C} ${OpenVML_LIBSRC_Z})
108108

109+
target_compile_options(openvml_kernel_${OpenVML_ARCH}_${OpenVML_CPU_CORENAME}_core PRIVATE ${OpenVML_KERNEL_COMPILE_FLAGS})

kernel/x86_64/Kernel_haswell.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
include(${OpenVML_ARCH}/Kernel_generic.txt)
22

3+
if(NOT MSVC)
4+
set(OpenVML_KERNEL_COMPILE_FLAGS -mavx2)
5+
endif()
6+
37
set(add_S_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
48
set(add_D_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
59
set(add_C_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)

kernel/x86_64/Kernel_sandybridge.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
include(${OpenVML_ARCH}/Kernel_generic.txt)
22

3+
if(NOT MSVC)
4+
set(OpenVML_KERNEL_COMPILE_FLAGS -mavx)
5+
endif()
6+
37
set(add_S_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
48
set(add_D_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
59
set(add_C_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)

kernel/x86_64/dadd_kernel_avx.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,72 @@
2525

2626
#include "openvml_kernel.h"
2727

28+
#include <immintrin.h>
29+
30+
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
31+
VMLLONG loop_count=(COMPSIZE*n) >> 5;
32+
VMLLONG remain_count=(COMPSIZE*n) & 0x1f;
33+
34+
int i=0;
35+
36+
while(loop_count>0){
37+
38+
__m256d av0=_mm256_loadu_pd(a);
39+
__m256d av1=_mm256_loadu_pd(a+4);
40+
__m256d av2=_mm256_loadu_pd(a+8);
41+
__m256d av3=_mm256_loadu_pd(a+12);
42+
43+
__m256d av4=_mm256_loadu_pd(a+16);
44+
__m256d av5=_mm256_loadu_pd(a+20);
45+
__m256d av6=_mm256_loadu_pd(a+24);
46+
__m256d av7=_mm256_loadu_pd(a+28);
47+
48+
49+
__m256d bv0=_mm256_loadu_pd(b);
50+
__m256d bv1=_mm256_loadu_pd(b+4);
51+
__m256d bv2=_mm256_loadu_pd(b+8);
52+
__m256d bv3=_mm256_loadu_pd(b+12);
53+
54+
__m256d bv4=_mm256_loadu_pd(b+16);
55+
__m256d bv5=_mm256_loadu_pd(b+20);
56+
__m256d bv6=_mm256_loadu_pd(b+24);
57+
__m256d bv7=_mm256_loadu_pd(b+28);
58+
59+
60+
61+
62+
__m256d yv0=_mm256_add_pd(av0, bv0);
63+
__m256d yv1=_mm256_add_pd(av1, bv1);
64+
__m256d yv2=_mm256_add_pd(av2, bv2);
65+
__m256d yv3=_mm256_add_pd(av3, bv3);
66+
67+
__m256d yv4=_mm256_add_pd(av4, bv4);
68+
__m256d yv5=_mm256_add_pd(av5, bv5);
69+
__m256d yv6=_mm256_add_pd(av6, bv6);
70+
__m256d yv7=_mm256_add_pd(av7, bv7);
71+
72+
_mm256_storeu_pd(y, yv0);
73+
_mm256_storeu_pd(y+4, yv1);
74+
_mm256_storeu_pd(y+8, yv2);
75+
_mm256_storeu_pd(y+12, yv3);
76+
77+
_mm256_storeu_pd(y+16, yv4);
78+
_mm256_storeu_pd(y+20, yv5);
79+
_mm256_storeu_pd(y+24, yv6);
80+
_mm256_storeu_pd(y+28, yv7);
81+
82+
a+=32;
83+
b+=32;
84+
y+=32;
85+
loop_count--;
86+
}
87+
88+
for(i=0; i<remain_count; i++){
89+
y[i]=a[i]+b[i];
90+
}
91+
}
92+
93+
#if 0
2894
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
2995

3096
//unroll 32
@@ -114,3 +180,4 @@ void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLO
114180
"memory"
115181
);
116182
}
183+
#endif

kernel/x86_64/dsub_kernel_avx.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,72 @@
2525

2626
#include "openvml_kernel.h"
2727

28+
#include <immintrin.h>
29+
30+
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
31+
VMLLONG loop_count=(COMPSIZE*n) >> 5;
32+
VMLLONG remain_count=(COMPSIZE*n) & 0x1f;
33+
34+
int i=0;
35+
36+
while(loop_count>0){
37+
38+
__m256d av0=_mm256_loadu_pd(a);
39+
__m256d av1=_mm256_loadu_pd(a+4);
40+
__m256d av2=_mm256_loadu_pd(a+8);
41+
__m256d av3=_mm256_loadu_pd(a+12);
42+
43+
__m256d av4=_mm256_loadu_pd(a+16);
44+
__m256d av5=_mm256_loadu_pd(a+20);
45+
__m256d av6=_mm256_loadu_pd(a+24);
46+
__m256d av7=_mm256_loadu_pd(a+28);
47+
48+
49+
__m256d bv0=_mm256_loadu_pd(b);
50+
__m256d bv1=_mm256_loadu_pd(b+4);
51+
__m256d bv2=_mm256_loadu_pd(b+8);
52+
__m256d bv3=_mm256_loadu_pd(b+12);
53+
54+
__m256d bv4=_mm256_loadu_pd(b+16);
55+
__m256d bv5=_mm256_loadu_pd(b+20);
56+
__m256d bv6=_mm256_loadu_pd(b+24);
57+
__m256d bv7=_mm256_loadu_pd(b+28);
58+
59+
60+
61+
62+
__m256d yv0=_mm256_sub_pd(av0, bv0);
63+
__m256d yv1=_mm256_sub_pd(av1, bv1);
64+
__m256d yv2=_mm256_sub_pd(av2, bv2);
65+
__m256d yv3=_mm256_sub_pd(av3, bv3);
66+
67+
__m256d yv4=_mm256_sub_pd(av4, bv4);
68+
__m256d yv5=_mm256_sub_pd(av5, bv5);
69+
__m256d yv6=_mm256_sub_pd(av6, bv6);
70+
__m256d yv7=_mm256_sub_pd(av7, bv7);
71+
72+
_mm256_storeu_pd(y, yv0);
73+
_mm256_storeu_pd(y+4, yv1);
74+
_mm256_storeu_pd(y+8, yv2);
75+
_mm256_storeu_pd(y+12, yv3);
76+
77+
_mm256_storeu_pd(y+16, yv4);
78+
_mm256_storeu_pd(y+20, yv5);
79+
_mm256_storeu_pd(y+24, yv6);
80+
_mm256_storeu_pd(y+28, yv7);
81+
82+
a+=32;
83+
b+=32;
84+
y+=32;
85+
loop_count--;
86+
}
87+
88+
for(i=0; i<remain_count; i++){
89+
y[i]=a[i]-b[i];
90+
}
91+
}
92+
93+
#if 0
2894
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
2995

3096
//unroll 32
@@ -115,3 +181,4 @@ void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLO
115181
"memory"
116182
);
117183
}
184+
#endif

kernel/x86_64/sadd_kernel_avx.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,50 @@
2525

2626
#include "openvml_kernel.h"
2727

28+
#include <immintrin.h>
29+
30+
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
31+
VMLLONG loop_count=(COMPSIZE*n) >> 5;
32+
VMLLONG remain_count=(COMPSIZE*n) & 0x1f;
33+
34+
int i=0;
35+
36+
while(loop_count>0){
37+
38+
__m256 av0=_mm256_loadu_ps(a);
39+
__m256 av1=_mm256_loadu_ps(a+8);
40+
__m256 av2=_mm256_loadu_ps(a+16);
41+
__m256 av3=_mm256_loadu_ps(a+24);
42+
43+
__m256 bv0=_mm256_loadu_ps(b);
44+
__m256 bv1=_mm256_loadu_ps(b+8);
45+
__m256 bv2=_mm256_loadu_ps(b+16);
46+
__m256 bv3=_mm256_loadu_ps(b+24);
47+
48+
49+
__m256 yv0=_mm256_add_ps(av0, bv0);
50+
__m256 yv1=_mm256_add_ps(av1, bv1);
51+
__m256 yv2=_mm256_add_ps(av2, bv2);
52+
__m256 yv3=_mm256_add_ps(av3, bv3);
53+
54+
55+
_mm256_storeu_ps(y, yv0);
56+
_mm256_storeu_ps(y+8, yv1);
57+
_mm256_storeu_ps(y+16, yv2);
58+
_mm256_storeu_ps(y+24, yv3);
59+
60+
a+=32;
61+
b+=32;
62+
y+=32;
63+
loop_count--;
64+
}
65+
66+
for(i=0; i<remain_count; i++){
67+
y[i]=a[i]+b[i];
68+
}
69+
}
70+
71+
#if 0
2872
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
2973

3074
//unroll 32
@@ -93,3 +137,4 @@ void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLO
93137
"memory"
94138
);
95139
}
140+
#endif

kernel/x86_64/ssub_kernel_avx.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,51 @@
2525

2626
#include "openvml_kernel.h"
2727

28+
#include <immintrin.h>
29+
30+
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
31+
VMLLONG loop_count=(COMPSIZE*n) >> 5;
32+
VMLLONG remain_count=(COMPSIZE*n) & 0x1f;
33+
34+
int i=0;
35+
36+
while(loop_count>0){
37+
38+
__m256 av0=_mm256_loadu_ps(a);
39+
__m256 av1=_mm256_loadu_ps(a+8);
40+
__m256 av2=_mm256_loadu_ps(a+16);
41+
__m256 av3=_mm256_loadu_ps(a+24);
42+
43+
__m256 bv0=_mm256_loadu_ps(b);
44+
__m256 bv1=_mm256_loadu_ps(b+8);
45+
__m256 bv2=_mm256_loadu_ps(b+16);
46+
__m256 bv3=_mm256_loadu_ps(b+24);
47+
48+
49+
__m256 yv0=_mm256_sub_ps(av0, bv0);
50+
__m256 yv1=_mm256_sub_ps(av1, bv1);
51+
__m256 yv2=_mm256_sub_ps(av2, bv2);
52+
__m256 yv3=_mm256_sub_ps(av3, bv3);
53+
54+
55+
_mm256_storeu_ps(y, yv0);
56+
_mm256_storeu_ps(y+8, yv1);
57+
_mm256_storeu_ps(y+16, yv2);
58+
_mm256_storeu_ps(y+24, yv3);
59+
60+
a+=32;
61+
b+=32;
62+
y+=32;
63+
loop_count--;
64+
}
65+
66+
for(i=0; i<remain_count; i++){
67+
y[i]=a[i]-b[i];
68+
}
69+
}
70+
71+
72+
#if 0
2873
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
2974

3075
//unroll 32
@@ -93,3 +138,4 @@ void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLO
93138
"memory"
94139
);
95140
}
141+
#endif

0 commit comments

Comments
 (0)