
Commit df08a9e

Hamlin Li and Xiaohong Gong committed

8312425: [vectorapi] AArch64: Optimize vector math operations with SLEEF

Co-authored-by: Xiaohong Gong <[email protected]>
Reviewed-by: ihse, fgao, aph
1 parent e7f0bf1 commit df08a9e

8 files changed: +342, -5 lines changed


make/autoconf/flags-cflags.m4

Lines changed: 30 additions & 0 deletions

@@ -912,6 +912,36 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
       IF_FALSE: [$2FDLIBM_CFLAGS=""])
   fi
   AC_SUBST($2FDLIBM_CFLAGS)
+
+  # Check whether the compiler supports the Arm C Language Extensions (ACLE)
+  # for SVE. Set SVE_CFLAGS to -march=armv8-a+sve if it does.
+  # ACLE and this flag are required to build the aarch64 SVE related functions in
+  # libvectormath.
+  if test "x$OPENJDK_TARGET_CPU" = "xaarch64"; then
+    if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
+      AC_LANG_PUSH(C)
+      OLD_CFLAGS="$CFLAGS"
+      CFLAGS="$CFLAGS -march=armv8-a+sve"
+      AC_MSG_CHECKING([if Arm SVE ACLE is supported])
+      AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>],
+          [
+            svint32_t r = svdup_n_s32(1);
+            return 0;
+          ])],
+          [
+            AC_MSG_RESULT([yes])
+            $2SVE_CFLAGS="-march=armv8-a+sve"
+          ],
+          [
+            AC_MSG_RESULT([no])
+            $2SVE_CFLAGS=""
+          ]
+      )
+      CFLAGS="$OLD_CFLAGS"
+      AC_LANG_POP(C)
+    fi
+  fi
+  AC_SUBST($2SVE_CFLAGS)
 ])

 # FLAGS_SETUP_GCC6_COMPILER_FLAGS([PREFIX])
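
For checking a toolchain by hand, the configure probe above amounts to compiling a one-line ACLE program with the SVE flag. A standalone equivalent, as a sketch (the file name sve_probe.c is hypothetical; this file is not part of the commit):

/* sve_probe.c - standalone version of the configure probe above.
 * Build with: gcc -march=armv8-a+sve -c sve_probe.c
 * Successful compilation means <arm_sve.h> and the SVE ACLE are usable. */
#include <arm_sve.h>

int main(void) {
  svint32_t r = svdup_n_s32(1);  /* broadcast the scalar 1 into an SVE vector */
  (void)r;                       /* silence the unused-variable warning */
  return 0;
}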

make/autoconf/spec.gmk.template

Lines changed: 3 additions & 0 deletions

@@ -827,6 +827,9 @@ OS_VERSION_MAJOR := @OS_VERSION_MAJOR@
 OS_VERSION_MINOR := @OS_VERSION_MINOR@
 OS_VERSION_MICRO := @OS_VERSION_MICRO@

+# Arm SVE
+SVE_CFLAGS := @SVE_CFLAGS@
+
 # Images directory definitions
 JDK_IMAGE_SUBDIR := jdk
 JRE_IMAGE_SUBDIR := jre

make/modules/jdk.incubator.vector/Lib.gmk

Lines changed: 14 additions & 0 deletions

@@ -55,3 +55,17 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, riscv64)+$(INCLUDE_COMPILER2

   TARGETS += $(BUILD_LIBSLEEF)
 endif
+
+ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2), true+true+true)
+  $(eval $(call SetupJdkLibrary, BUILD_LIBSLEEF, \
+      NAME := sleef, \
+      OPTIMIZATION := HIGH, \
+      SRC := libsleef/lib, \
+      EXTRA_SRC := libsleef/generated, \
+      DISABLED_WARNINGS_gcc := unused-function sign-compare tautological-compare ignored-qualifiers, \
+      DISABLED_WARNINGS_clang := unused-function sign-compare tautological-compare ignored-qualifiers, \
+      CFLAGS := $(SVE_CFLAGS), \
+  ))
+
+  TARGETS += $(BUILD_LIBSLEEF)
+endif

src/hotspot/cpu/aarch64/aarch64.ad

Lines changed: 24 additions & 4 deletions

@@ -2307,14 +2307,18 @@ const RegMask* Matcher::predicate_reg_mask(void) {
   return &_PR_REG_mask;
 }

-// Vector calling convention not yet implemented.
 bool Matcher::supports_vector_calling_convention(void) {
-  return false;
+  return EnableVectorSupport && UseVectorStubs;
 }

 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
-  Unimplemented();
-  return OptoRegPair(0, 0);
+  assert(EnableVectorSupport && UseVectorStubs, "sanity");
+  int lo = V0_num;
+  int hi = V0_H_num;
+  if (ideal_reg == Op_VecX || ideal_reg == Op_VecA) {
+    hi = V0_K_num;
+  }
+  return OptoRegPair(hi, lo);
 }

 // Is this branch offset short enough that a short branch can be used?
@@ -16109,6 +16113,22 @@ instruct CallLeafDirect(method meth)
   ins_pipe(pipe_class_call);
 %}

+// Call Runtime Instruction without safepoint and with vector arguments
+instruct CallLeafDirectVector(method meth)
+%{
+  match(CallLeafVector);
+
+  effect(USE meth);
+
+  ins_cost(CALL_COST);
+
+  format %{ "CALL, runtime leaf vector $meth" %}
+
+  ins_encode(aarch64_enc_java_to_runtime(meth));
+
+  ins_pipe(pipe_class_call);
+%}
+
 // Call Runtime Instruction

 instruct CallLeafNoFPDirect(method meth)
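
A note on the OptoRegPair returned by vector_return_value above: the AD file splits each 128-bit vector register into four 32-bit OptoReg slots (V0, V0_H, V0_J, V0_K), so (V0_H_num, V0_num) describes a 64-bit return value in v0 and (V0_K_num, V0_num) a 128-bit one; Op_VecA (SVE) also reports the low 128 bits. A minimal C sketch of that selection, with hypothetical slot indices standing in for the generated register names:

#include <stdio.h>

/* Hypothetical 32-bit slot indices within v0, standing in for the
 * V0_num / V0_H_num / V0_K_num OptoReg names generated from the AD file. */
enum { V0_num = 0, V0_H_num = 1, V0_K_num = 3 };
enum ideal_reg_t { Op_VecD, Op_VecX, Op_VecA };

/* Mirror of vector_return_value: pick the (hi, lo) slot pair within v0. */
static void vector_return_slots(enum ideal_reg_t ideal_reg, int* hi, int* lo) {
  *lo = V0_num;
  *hi = V0_H_num;                     /* 64-bit vector: slots 0..1 of v0 */
  if (ideal_reg == Op_VecX || ideal_reg == Op_VecA) {
    *hi = V0_K_num;                   /* 128-bit or SVE vector: slots 0..3 */
  }
}

int main(void) {
  int hi, lo;
  vector_return_slots(Op_VecA, &hi, &lo);
  printf("SVE return value: v0 slots [%d..%d]\n", lo, hi);
  return 0;
}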

src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp

Lines changed: 14 additions & 1 deletion

@@ -871,7 +871,20 @@ static int c_calling_convention_priv(const BasicType *sig_bt,
 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                              uint num_bits,
                                              uint total_args_passed) {
-  Unimplemented();
+  // More than 8 vector arguments are not supported for now.
+  assert(total_args_passed <= Argument::n_float_register_parameters_c, "unsupported");
+  assert(num_bits >= 64 && num_bits <= 2048 && is_power_of_2(num_bits), "unsupported");
+
+  static const FloatRegister VEC_ArgReg[Argument::n_float_register_parameters_c] = {
+    v0, v1, v2, v3, v4, v5, v6, v7
+  };
+
+  // On SVE, vector arguments use the same registers as 128-bit NEON vectors.
+  int next_reg_val = num_bits == 64 ? 1 : 3;
+  for (uint i = 0; i < total_args_passed; i++) {
+    VMReg vmreg = VEC_ArgReg[i]->as_VMReg();
+    regs[i].set_pair(vmreg->next(next_reg_val), vmreg);
+  }
   return 0;
 }
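
The register pairs above follow from each VMReg naming a 32-bit slice of a v register: next(1) spans two slices (a 64-bit vector), next(3) spans four (a 128-bit vector, which is also what SVE arguments occupy). A small sketch of the resulting assignment, using a hypothetical flat slot numbering with four slots per v register:

#include <stdio.h>

/* Sketch of the argument-to-register mapping above: argument i lands in
 * v<i>, spanning 2 slots for 64-bit vectors or 4 slots otherwise. */
int main(void) {
  unsigned num_bits = 128;                   /* vector width in bits */
  unsigned total_args_passed = 3;            /* e.g. three vector arguments */
  unsigned next_reg_val = (num_bits == 64) ? 1 : 3;

  for (unsigned i = 0; i < total_args_passed; i++) {
    unsigned lo = i * 4;                     /* first 32-bit slot of v<i> */
    unsigned hi = lo + next_reg_val;         /* last slot of the pair */
    printf("arg %u -> v%u, slots [%u..%u]\n", i, i, lo, hi);
  }
  return 0;
}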

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 77 additions & 0 deletions

@@ -42,6 +42,7 @@
 #include "oops/oop.inline.hpp"
 #include "prims/methodHandles.hpp"
 #include "prims/upcallLinker.hpp"
+#include "runtime/arguments.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/continuation.hpp"
 #include "runtime/continuationEntry.inline.hpp"

@@ -8176,6 +8177,78 @@ class StubGenerator: public StubCodeGenerator {
   // }
   };

+  void generate_vector_math_stubs() {
+    // Get native vector math stub routine addresses
+    void* libsleef = nullptr;
+    char ebuf[1024];
+    char dll_name[JVM_MAXPATHLEN];
+    if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
+      libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
+    }
+    if (libsleef == nullptr) {
+      log_info(library)("Failed to load native vector math library, %s!", ebuf);
+      return;
+    }
+    // Method naming convention
+    //   All the methods are named as <OP><T><N>_<U><suffix>
+    //   Where:
+    //     <OP>     is the operation name, e.g. sin
+    //     <T>      is optional to indicate float/double
+    //              "f/d" for vector float/double operation
+    //     <N>      is the number of elements in the vector
+    //              "2/4" for neon, and "x" for sve
+    //     <U>      is the precision level
+    //              "u10/u05" represents 1.0/0.5 ULP error bounds
+    //              We use "u10" for all operations by default,
+    //              but for those functions that do not have u10 support, we use "u05" instead
+    //     <suffix> indicates neon/sve
+    //              "sve/advsimd" for sve/neon implementations
+    //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
+    //          cosd2_u10advsimd is the method for computing 2-element vector double cos using NEON instructions
+    //
+    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
+
+    // Math vector stubs implemented with SVE for scalable vector size.
+    if (UseSVE > 0) {
+      for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
+        int vop = VectorSupport::VECTOR_OP_MATH_START + op;
+        // Skip "tanh" because there is a performance regression
+        if (vop == VectorSupport::VECTOR_OP_TANH) {
+          continue;
+        }
+
+        // The native library does not support the u10 level of "hypot".
+        const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
+
+        snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
+        StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
+
+        snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
+        StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
+      }
+    }
+
+    // Math vector stubs implemented with NEON for 64/128 bits vector size.
+    for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
+      int vop = VectorSupport::VECTOR_OP_MATH_START + op;
+      // Skip "tanh" because there is a performance regression
+      if (vop == VectorSupport::VECTOR_OP_TANH) {
+        continue;
+      }

+      // The native library does not support the u10 level of "hypot".
+      const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
+
+      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
+      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
+
+      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
+      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
+
+      snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
+      StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
+    }
+  }

 // Initialization
 void generate_initial_stubs() {

@@ -8329,6 +8402,9 @@ class StubGenerator: public StubCodeGenerator {
       // because it's faster for the sizes of modulus we care about.
       StubRoutines::_montgomerySquare = g.generate_multiply();
     }
+
+    generate_vector_math_stubs();
+
 #endif // COMPILER2

     if (UseChaCha20Intrinsics) {

@@ -8384,6 +8460,7 @@ class StubGenerator: public StubCodeGenerator {
     if (UseAdler32Intrinsics) {
       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
     }
+
 #endif // COMPILER2_OR_JVMCI
   }
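
Putting the naming convention into practice: the lookup symbol is assembled by plain snprintf concatenation, so for op "sin" the code asks the library for "sinfx_u10sve", "sinf4_u10advsimd" and "sind2_u10advsimd". A self-contained sketch of the same pattern (the mathname table here is an abbreviated stand-in for VectorSupport::mathname, illustration only):

#include <stdio.h>
#include <string.h>

/* Abbreviated stand-in for VectorSupport::mathname[]; illustration only. */
static const char* const mathname[] = { "sin", "cos", "pow", "hypot" };

int main(void) {
  char ebuf[1024];
  for (int op = 0; op < 4; op++) {
    /* SLEEF provides "hypot" only at the u05 precision level. */
    const char* ulf = (strcmp(mathname[op], "hypot") == 0) ? "u05" : "u10";

    snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", mathname[op], ulf);
    printf("SVE float:   %s\n", ebuf);   /* e.g. sinfx_u10sve */

    snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", mathname[op], ulf);
    printf("NEON float:  %s\n", ebuf);   /* e.g. sinf4_u10advsimd */

    snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", mathname[op], ulf);
    printf("NEON double: %s\n", ebuf);   /* e.g. sind2_u10advsimd */
  }
  return 0;
}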

src/jdk.incubator.vector/linux/native/libsleef/lib/vector_math_neon.c

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include <stdint.h>
+#include <arm_neon.h>
+
+#include "../generated/misc.h"
+#include "../generated/sleefinline_advsimd.h"
+
+
+#include <jni.h>
+
+#define DEFINE_VECTOR_MATH_UNARY(op, type) \
+JNIEXPORT                                  \
+type op##advsimd(type input) {             \
+  return Sleef_##op##advsimd(input);       \
+}
+
+#define DEFINE_VECTOR_MATH_BINARY(op, type)   \
+JNIEXPORT                                     \
+type op##advsimd(type input1, type input2) {  \
+  return Sleef_##op##advsimd(input1, input2); \
+}
+
+DEFINE_VECTOR_MATH_UNARY(tanf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(tanhf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(sinf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(sinhf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(cosf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(coshf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(asinf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(acosf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(atanf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(cbrtf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(logf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(log10f4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(log1pf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(expf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(expm1f4_u10, float32x4_t)
+
+DEFINE_VECTOR_MATH_UNARY(tand2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(tanhd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(sind2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(sinhd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(cosd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(coshd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(asind2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(acosd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(atand2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(cbrtd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(logd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(log10d2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(log1pd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(expd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(expm1d2_u10, float64x2_t)
+
+DEFINE_VECTOR_MATH_BINARY(atan2f4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_BINARY(powf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_BINARY(hypotf4_u05, float32x4_t)
+
+DEFINE_VECTOR_MATH_BINARY(atan2d2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_BINARY(powd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_BINARY(hypotd2_u05, float64x2_t)
+
+#undef DEFINE_VECTOR_MATH_UNARY
+
+#undef DEFINE_VECTOR_MATH_BINARY
+
+#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
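
For reference, each DEFINE_VECTOR_MATH_UNARY line expands to a thin exported wrapper whose name matches the lookup convention used in stubGenerator_aarch64.cpp: the ## token pasting appends "advsimd" to both the wrapper and the SLEEF inline kernel it forwards to. Approximate expansion of the first float entry:

/* What DEFINE_VECTOR_MATH_UNARY(tanf4_u10, float32x4_t) expands to
 * (approximately): a JNIEXPORT'ed symbol "tanf4_u10advsimd" that simply
 * forwards to the SLEEF inline implementation of the same name. */
JNIEXPORT
float32x4_t tanf4_u10advsimd(float32x4_t input) {
  return Sleef_tanf4_u10advsimd(input);
}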
