Skip to content

Commit b1215f2

Browse files
authored
Merge pull request #16 from xianyi/develop
rebase
2 parents 93843c5 + 0b73041 commit b1215f2

File tree

17 files changed

+790
-94
lines changed

17 files changed

+790
-94
lines changed

Makefile.system

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -904,8 +904,8 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
904904
FCOMMON_OPT += -Mrecursive -Kieee
905905
ifeq ($(OSNAME), Linux)
906906
ifeq ($(ARCH), x86_64)
907-
FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
908-
ifeq ($(FLANG_VENDOR),AOCC)
907+
FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
908+
ifeq ($(FLANG_VENDOR), AMD)
909909
FCOMMON_OPT += -fno-unroll-loops
910910
endif
911911
endif

benchmark/bench.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){
7474

7575
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
7676
struct timeval start, stop;
77+
#elif defined(__APPLE__)
78+
mach_timebase_info_data_t info;
79+
uint64_t start = 0, stop = 0;
7780
#else
7881
struct timespec start = { 0, 0 }, stop = { 0, 0 };
7982
#endif
@@ -82,6 +85,9 @@ double getsec()
8285
{
8386
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
8487
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
88+
#elif defined(__APPLE__)
89+
mach_timebase_info(&info);
90+
return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9;
8591
#else
8692
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
8793
#endif
@@ -90,6 +96,8 @@ double getsec()
9096
void begin() {
9197
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
9298
gettimeofday( &start, (struct timezone *)0);
99+
#elif defined(__APPLE__)
100+
start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
93101
#else
94102
clock_gettime(CLOCK_REALTIME, &start);
95103
#endif
@@ -98,7 +106,9 @@ void begin() {
98106
/*
 * Record the stop timestamp of a benchmark interval into the file-scope
 * variable `stop`, using whichever timer the platform branch selects:
 *   - gettimeofday()          when built for Windows or when _POSIX_TIMERS
 *                             is not defined,
 *   - clock_gettime_nsec_np() on __APPLE__,
 *   - clock_gettime()         otherwise.
 */
void end() {
99107
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
100108
gettimeofday( &stop, (struct timezone *)0);
109+
#elif defined(__APPLE__)
110+
/* NOTE(review): per the macOS man page, clock_gettime_nsec_np() appears to
 * return a value already expressed in nanoseconds -- confirm that getsec()
 * should still scale the (stop - start) difference by the mach timebase. */
stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
101111
#else
102112
clock_gettime(CLOCK_REALTIME, &stop);
103113
#endif
104-
}
114+
}

f_check

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ if ($link ne "") {
330330
$flags =~ s/\@/\,/g;
331331
$linker_L .= "-Wl,". $flags . " " ;
332332
}
333-
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
333+
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
334334
$flags = "-lomp";
335335
}
336336

kernel/power/dasum.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4646

4747
#endif
4848

49-
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
5049
#if defined(__VEC__) || defined(__ALTIVEC__)
50+
#if defined(POWER8) || defined(POWER9)
5151
#include "dasum_microk_power8.c"
52+
#elif defined(POWER10)
53+
#include "dasum_microk_power10.c"
5254
#endif
5355
#endif
5456

@@ -110,13 +112,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
110112
if ( inc_x == 1 )
111113
{
112114

115+
#if defined(POWER10)
116+
if ( n >= 16 )
117+
{
118+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
119+
for (i = 0; i < align; i++) {
120+
sumf += ABS(x[i]);
121+
}
122+
}
123+
n1 = (n-i) & -16;
124+
if ( n1 > 0 )
125+
{
126+
sumf += dasum_kernel_16(n1, &x[i]);
127+
i+=n1;
128+
}
129+
#else
113130
n1 = n & -16;
114131
if ( n1 > 0 )
115132
{
116133

117134
sumf = dasum_kernel_16(n1, x);
118135
i=n1;
119136
}
137+
#endif
120138

121139
while(i < n)
122140
{

kernel/power/dasum_microk_power10.c

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/***************************************************************************
2+
Copyright (c) 2021, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_16 1
29+
30+
/*
 * Sum of absolute values of n doubles starting at x, implemented as a single
 * inline-assembly block.
 *
 * Parameters:
 *   n  element count. The counter is decremented by 16 per block and the
 *      first 16 elements are loaded before n is tested, so n is expected to
 *      be a positive multiple of 16 (the dasum.c caller passes
 *      n1 = (n-i) & -16 and only calls when n1 > 0).
 *   x  pointer to the first element; advanced by 128 bytes per block inside
 *      the asm.
 *
 * Returns the accumulated sum as a double.
 *
 * Register usage inside the asm: vs32..vs39 are the eight accumulators,
 * vs40..vs47 hold the currently loaded block, vs48..vs51 and t0..t3 hold the
 * absolute values before they are added to the accumulators.
 */
static double dasum_kernel_16 (long n, double *x)
31+
{
32+
double sum;
33+
__vector double t0;
34+
__vector double t1;
35+
__vector double t2;
36+
__vector double t3;
37+
38+
__asm__
39+
(
40+
"dcbt 0, %2 \n\t"
41+
42+
/* Clear the eight accumulators vs32..vs39 (x XOR x). */
"xxlxor 32, 32, 32 \n\t"
43+
"xxlxor 33, 33, 33 \n\t"
44+
"xxlxor 34, 34, 34 \n\t"
45+
"xxlxor 35, 35, 35 \n\t"
46+
"xxlxor 36, 36, 36 \n\t"
47+
"xxlxor 37, 37, 37 \n\t"
48+
"xxlxor 38, 38, 38 \n\t"
49+
"xxlxor 39, 39, 39 \n\t"
50+
51+
/* Preload the first block: four paired loads at offsets 0/32/64/96 fill
 * vs40..vs47, then x is advanced past the 128 bytes just read. */
"lxvp 40, 0(%2) \n\t"
52+
"lxvp 42, 32(%2) \n\t"
53+
"lxvp 44, 64(%2) \n\t"
54+
"lxvp 46, 96(%2) \n\t"
55+
56+
"addi %2, %2, 128 \n\t"
57+
58+
/* n -= 16; if nothing remains beyond the preloaded block, skip the loop and
 * go straight to the tail at label two. */
"addic. %1, %1, -16 \n\t"
59+
"ble two%= \n\t"
60+
61+
".align 5 \n"
62+
/* Main loop: take absolute values of the block already in vs40..vs47 while
 * the loads for the next block are interleaved, then accumulate. */
"one%=: \n\t"
63+
64+
"xvabsdp 48, 40 \n\t"
65+
"xvabsdp 49, 41 \n\t"
66+
"xvabsdp 50, 42 \n\t"
67+
"xvabsdp 51, 43 \n\t"
68+
"lxvp 40, 0(%2) \n\t"
69+
70+
71+
"xvabsdp %x3, 44 \n\t"
72+
"xvabsdp %x4, 45 \n\t"
73+
"lxvp 42, 32(%2) \n\t"
74+
75+
76+
"xvabsdp %x5, 46 \n\t"
77+
"xvabsdp %x6, 47 \n\t"
78+
"lxvp 44, 64(%2) \n\t"
79+
80+
81+
"xvadddp 32, 32, 48 \n\t"
82+
"xvadddp 33, 33, 49 \n\t"
83+
84+
"lxvp 46, 96(%2) \n\t"
85+
86+
"xvadddp 34, 34, 50 \n\t"
87+
"xvadddp 35, 35, 51 \n\t"
88+
"addi %2, %2, 128 \n\t"
89+
"xvadddp 36, 36, %x3 \n\t"
90+
"xvadddp 37, 37, %x4 \n\t"
91+
"addic. %1, %1, -16 \n\t"
92+
"xvadddp 38, 38, %x5 \n\t"
93+
"xvadddp 39, 39, %x6 \n\t"
94+
95+
"bgt one%= \n"
96+
97+
/* Tail: process the last loaded block; no further loads are issued here. */
"two%=: \n\t"
98+
99+
"xvabsdp 48, 40 \n\t"
100+
"xvabsdp 49, 41 \n\t"
101+
"xvabsdp 50, 42 \n\t"
102+
"xvabsdp 51, 43 \n\t"
103+
"xvabsdp %x3, 44 \n\t"
104+
"xvabsdp %x4, 45 \n\t"
105+
"xvabsdp %x5, 46 \n\t"
106+
"xvabsdp %x6, 47 \n\t"
107+
108+
"xvadddp 32, 32, 48 \n\t"
109+
"xvadddp 33, 33, 49 \n\t"
110+
"xvadddp 34, 34, 50 \n\t"
111+
"xvadddp 35, 35, 51 \n\t"
112+
"xvadddp 36, 36, %x3 \n\t"
113+
"xvadddp 37, 37, %x4 \n\t"
114+
"xvadddp 38, 38, %x5 \n\t"
115+
"xvadddp 39, 39, %x6 \n\t"
116+
117+
/* Pairwise reduction of the eight accumulators down to vs32. */
"xvadddp 32, 32, 33 \n\t"
118+
"xvadddp 34, 34, 35 \n\t"
119+
"xvadddp 36, 36, 37 \n\t"
120+
"xvadddp 38, 38, 39 \n\t"
121+
122+
"xvadddp 32, 32, 34 \n\t"
123+
"xvadddp 36, 36, 38 \n\t"
124+
125+
"xvadddp 32, 32, 36 \n\t"
126+
127+
/* XXSWAPD_S is defined outside this view; presumably it places the
 * doubleword-swapped copy of vs32 in vs33 so the scalar add below combines
 * both lanes -- confirm against its definition. */
XXSWAPD_S(33,32)
128+
"xsadddp %x0, 32, 33 \n"
129+
130+
/* The two strings below are assembler comments only (debug aid).
 * NOTE(review): "x=%3=%2" prints operand 3, which is t0 in the operand list
 * below; the memory operand for *x is operand 7 -- confirm whether %7 was
 * intended. */
"#n=%1 x=%3=%2 sum=%0\n"
131+
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
132+
:
133+
"=d" (sum), // 0
134+
"+r" (n), // 1
135+
"+b" (x), // 2
136+
"=wa" (t0), // 3
137+
"=wa" (t1), // 4
138+
"=wa" (t2), // 5
139+
"=wa" (t3) // 6
140+
:
141+
/* NOTE(review): this memory input names only the first element (*x), and no
 * "memory" clobber is listed -- confirm this is enough to tell the compiler
 * that the whole array is read. */
"m" (*x) // 7
142+
:
143+
"cr0",
144+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
145+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
146+
"vs48","vs49","vs50","vs51"
147+
);
148+
149+
return sum;
150+
}
151+
152+

0 commit comments

Comments
 (0)