Skip to content

Commit 634f2bd

Browse files
authored
Merge pull request #2414 from marxin/fix-iamax_sse-implementation
Fix iamax sse implementation and add utests
2 parents 79e201f + aeea14e commit 634f2bd

File tree

6 files changed: 112 additions (+112) and 64 deletions (-64).

kernel/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,15 +47,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
4747
GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type})
4848
endif ()
4949
if (DEFINED ${float_char}MINKERNEL)
50-
GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type})
50+
GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "USE_MIN" "min_k" false "" "" false ${float_type})
5151
endif ()
5252
GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type})
5353
GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type})
5454
if (DEFINED I${float_char}MAXKERNEL)
5555
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type})
5656
endif ()
5757
if (DEFINED I${float_char}MINKERNEL)
58-
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type})
58+
GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "USE_MIN" "i*min_k" false "" "" false ${float_type})
5959
endif ()
6060
GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type})
6161
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type})

kernel/x86_64/KERNEL

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S
171171
endif
172172

173173
ifndef ISAMINKERNEL
174-
ISAMINKERNEL = iamax.S
174+
ISAMINKERNEL = iamax_sse.S
175175
endif
176176

177177
ifndef IDAMINKERNEL
@@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S
207207
endif
208208

209209
ifndef ISMINKERNEL
210-
ISMINKERNEL = iamax.S
210+
ISMINKERNEL = iamax_sse.S
211211
endif
212212

213213
ifndef IDMINKERNEL

kernel/x86_64/iamax_sse.S

Lines changed: 17 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,6 @@
3636
/* or implied, of The University of Texas at Austin. */
3737
/*********************************************************************/
3838

39-
/* This kernel was found to give wrong results when used for ISMIN/ISAMIN
40-
with increment != 1, although it appears to be correct for corresponding
41-
MAX operations. See issue 2116 */
42-
4339
#define ASSEMBLER
4440
#include "common.h"
4541

@@ -59,6 +55,15 @@
5955
#define MAXSS minss
6056
#endif
6157

58+
.macro LOAD_AND_COMPARE_TO_MXX REG
59+
movss 0 * SIZE(X), \REG
60+
addq INCX, X
61+
#ifdef USE_ABS
62+
andps %xmm15, \REG
63+
#endif
64+
cmpeqss %xmm0, \REG
65+
.endm
66+
6267
#include "l1param.h"
6368

6469
PROLOGUE
@@ -830,61 +835,14 @@
830835
ALIGN_4
831836

832837
.L93:
833-
movss 0 * SIZE(X), %xmm1
834-
addq INCX, X
835-
#ifdef USE_ABS
836-
andps %xmm15, %xmm1
837-
#endif
838-
cmpeqss %xmm0, %xmm1
839-
840-
movss 0 * SIZE(X), %xmm2
841-
addq INCX, X
842-
#ifdef USE_ABS
843-
andps %xmm15, %xmm2
844-
#endif
845-
cmpeqss %xmm0, %xmm2
846-
847-
movss 0 * SIZE(X), %xmm3
848-
addq INCX, X
849-
#ifdef USE_ABS
850-
andps %xmm15, %xmm3
851-
#endif
852-
cmpeqss %xmm0, %xmm3
853-
854-
movss 0 * SIZE(X), %xmm4
855-
addq INCX, X
856-
#ifdef USE_ABS
857-
andps %xmm15, %xmm4
858-
#endif
859-
cmpeqss %xmm0, %xmm4
860-
861-
movss 0 * SIZE(X), %xmm5
862-
addq INCX, X
863-
#ifdef USE_ABS
864-
andps %xmm15, %xmm5
865-
#endif
866-
cmpeqps %xmm0, %xmm5
867-
868-
movss 0 * SIZE(X), %xmm6
869-
addq INCX, X
870-
#ifdef USE_ABS
871-
andps %xmm15, %xmm6
872-
#endif
873-
cmpeqss %xmm0, %xmm6
874-
875-
movss 0 * SIZE(X), %xmm7
876-
addq INCX, X
877-
#ifdef USE_ABS
878-
andps %xmm15, %xmm7
879-
#endif
880-
cmpeqss %xmm0, %xmm7
881-
882-
movss 0 * SIZE(X), %xmm8
883-
addq INCX, X
884-
#ifdef USE_ABS
885-
andps %xmm15, %xmm8
886-
#endif
887-
cmpeqss %xmm0, %xmm8
838+
LOAD_AND_COMPARE_TO_MXX %xmm1
839+
LOAD_AND_COMPARE_TO_MXX %xmm2
840+
LOAD_AND_COMPARE_TO_MXX %xmm3
841+
LOAD_AND_COMPARE_TO_MXX %xmm4
842+
LOAD_AND_COMPARE_TO_MXX %xmm5
843+
LOAD_AND_COMPARE_TO_MXX %xmm6
844+
LOAD_AND_COMPARE_TO_MXX %xmm7
845+
LOAD_AND_COMPARE_TO_MXX %xmm8
888846

889847
orps %xmm2, %xmm1
890848
orps %xmm4, %xmm3

utest/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ else ()
77
set(OpenBLAS_utest_src
88
utest_main.c
99
test_amax.c
10+
test_ismin.c
1011
test_rotmg.c
1112
test_rot.c
1213
test_axpy.c

utest/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ UTESTBIN=openblas_utest
1111

1212
include $(TOPDIR)/Makefile.system
1313

14-
OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o
14+
OBJS=utest_main.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o
1515
#test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
1616

1717
ifneq ($(NO_LAPACK), 1)

utest/test_ismin.c

Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,89 @@
1+
/*****************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions are
7+
met:
8+
9+
1. Redistributions of source code must retain the above copyright
10+
notice, this list of conditions and the following disclaimer.
11+
12+
2. Redistributions in binary form must reproduce the above copyright
13+
notice, this list of conditions and the following disclaimer in
14+
the documentation and/or other materials provided with the
15+
distribution.
16+
3. Neither the name of the OpenBLAS project nor the names of
17+
its contributors may be used to endorse or promote products
18+
derived from this software without specific prior written
19+
permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
32+
**********************************************************************************/
33+
34+
#include "openblas_utest.h"
35+
36+
#define ELEMENTS 50
37+
#define INCREMENT 2
38+
39+
CTEST(ismin, positive_step_2){
40+
blasint i;
41+
blasint N = ELEMENTS, inc = INCREMENT;
42+
float x[ELEMENTS * INCREMENT];
43+
for (i = 0; i < N * inc; i ++) {
44+
x[i] = i + 1000;
45+
}
46+
47+
x[8 * inc] = 0;
48+
blasint index = BLASFUNC(ismin)(&N, x, &inc);
49+
ASSERT_EQUAL(9, index);
50+
}
51+
52+
CTEST(ismin, negative_step_2){
53+
blasint i;
54+
blasint N = ELEMENTS, inc = INCREMENT;
55+
float x[ELEMENTS * INCREMENT];
56+
for (i = 0; i < N * inc; i ++) {
57+
x[i] = - i - 1000;
58+
}
59+
60+
x[8 * inc] = -123456.0f;
61+
blasint index = BLASFUNC(ismin)(&N, x, &inc);
62+
ASSERT_EQUAL(9, index);
63+
}
64+
65+
CTEST(ismax, positive_step_2){
66+
blasint i;
67+
blasint N = ELEMENTS, inc = INCREMENT;
68+
float x[ELEMENTS * INCREMENT];
69+
for (i = 0; i < N * inc; i ++) {
70+
x[i] = i + 1000;
71+
}
72+
73+
x[8 * inc] = 123456.0f;
74+
blasint index = BLASFUNC(ismax)(&N, x, &inc);
75+
ASSERT_EQUAL(9, index);
76+
}
77+
78+
CTEST(ismax, negative_step_2){
79+
blasint i;
80+
blasint N = ELEMENTS, inc = INCREMENT;
81+
float x[ELEMENTS * INCREMENT];
82+
for (i = 0; i < N * inc; i ++) {
83+
x[i] = - i - 1000;
84+
}
85+
86+
x[8 * inc] = 0;
87+
blasint index = BLASFUNC(ismax)(&N, x, &inc);
88+
ASSERT_EQUAL(9, index);
89+
}

0 commit comments

Comments
 (0)