
Commit df08a9e

Hamlin Li and Xiaohong Gong committed

8312425: [vectorapi] AArch64: Optimize vector math operations with SLEEF

Co-authored-by: Xiaohong Gong <[email protected]>
Reviewed-by: ihse, fgao, aph
1 parent e7f0bf1 commit df08a9e

8 files changed: +342, -5 lines changed


make/autoconf/flags-cflags.m4

Lines changed: 30 additions & 0 deletions

@@ -912,6 +912,36 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
       IF_FALSE: [$2FDLIBM_CFLAGS=""])
   fi
   AC_SUBST($2FDLIBM_CFLAGS)
+
+  # Check whether the compiler supports the Arm C Language Extensions (ACLE)
+  # for SVE. Set SVE_CFLAGS to -march=armv8-a+sve if it does.
+  # ACLE and this flag are required to build the aarch64 SVE related functions in
+  # libvectormath.
+  if test "x$OPENJDK_TARGET_CPU" = "xaarch64"; then
+    if test "x$TOOLCHAIN_TYPE" = xgcc || test "x$TOOLCHAIN_TYPE" = xclang; then
+      AC_LANG_PUSH(C)
+      OLD_CFLAGS="$CFLAGS"
+      CFLAGS="$CFLAGS -march=armv8-a+sve"
+      AC_MSG_CHECKING([if Arm SVE ACLE is supported])
+      AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>],
+          [
+            svint32_t r = svdup_n_s32(1);
+            return 0;
+          ])],
+          [
+            AC_MSG_RESULT([yes])
+            $2SVE_CFLAGS="-march=armv8-a+sve"
+          ],
+          [
+            AC_MSG_RESULT([no])
+            $2SVE_CFLAGS=""
+          ]
+      )
+      CFLAGS="$OLD_CFLAGS"
+      AC_LANG_POP(C)
+    fi
+  fi
+  AC_SUBST($2SVE_CFLAGS)
 ])

 # FLAGS_SETUP_GCC6_COMPILER_FLAGS([PREFIX])
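
For checking a toolchain by hand, the configure probe above amounts to compiling a one-line ACLE program with the SVE flag. A standalone equivalent, as a sketch (the file name sve_probe.c is hypothetical; this file is not part of the commit):

/* sve_probe.c - standalone version of the configure probe above.
 * Build with: gcc -march=armv8-a+sve -c sve_probe.c
 * Successful compilation means <arm_sve.h> and the SVE ACLE are usable. */
#include <arm_sve.h>

int main(void) {
  svint32_t r = svdup_n_s32(1);  /* broadcast the scalar 1 into an SVE vector */
  (void)r;                       /* silence the unused-variable warning */
  return 0;
}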

make/autoconf/spec.gmk.template

Lines changed: 3 additions & 0 deletions

@@ -827,6 +827,9 @@ OS_VERSION_MAJOR := @OS_VERSION_MAJOR@
 OS_VERSION_MINOR := @OS_VERSION_MINOR@
 OS_VERSION_MICRO := @OS_VERSION_MICRO@

+# Arm SVE
+SVE_CFLAGS := @SVE_CFLAGS@
+
 # Images directory definitions
 JDK_IMAGE_SUBDIR := jdk
 JRE_IMAGE_SUBDIR := jre

make/modules/jdk.incubator.vector/Lib.gmk

Lines changed: 14 additions & 0 deletions

@@ -55,3 +55,17 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, riscv64)+$(INCLUDE_COMPILER2

   TARGETS += $(BUILD_LIBSLEEF)
 endif
+
+ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2), true+true+true)
+  $(eval $(call SetupJdkLibrary, BUILD_LIBSLEEF, \
+      NAME := sleef, \
+      OPTIMIZATION := HIGH, \
+      SRC := libsleef/lib, \
+      EXTRA_SRC := libsleef/generated, \
+      DISABLED_WARNINGS_gcc := unused-function sign-compare tautological-compare ignored-qualifiers, \
+      DISABLED_WARNINGS_clang := unused-function sign-compare tautological-compare ignored-qualifiers, \
+      CFLAGS := $(SVE_CFLAGS), \
+  ))
+
+  TARGETS += $(BUILD_LIBSLEEF)
+endif

src/hotspot/cpu/aarch64/aarch64.ad

Lines changed: 24 additions & 4 deletions

@@ -2307,14 +2307,18 @@ const RegMask* Matcher::predicate_reg_mask(void) {
   return &_PR_REG_mask;
 }

-// Vector calling convention not yet implemented.
 bool Matcher::supports_vector_calling_convention(void) {
-  return false;
+  return EnableVectorSupport && UseVectorStubs;
 }

 OptoRegPair Matcher::vector_return_value(uint ideal_reg) {
-  Unimplemented();
-  return OptoRegPair(0, 0);
+  assert(EnableVectorSupport && UseVectorStubs, "sanity");
+  int lo = V0_num;
+  int hi = V0_H_num;
+  if (ideal_reg == Op_VecX || ideal_reg == Op_VecA) {
+    hi = V0_K_num;
+  }
+  return OptoRegPair(hi, lo);
 }

 // Is this branch offset short enough that a short branch can be used?
@@ -16109,6 +16113,22 @@ instruct CallLeafDirect(method meth)
   ins_pipe(pipe_class_call);
 %}

+// Call Runtime Instruction without safepoint and with vector arguments
+instruct CallLeafDirectVector(method meth)
+%{
+  match(CallLeafVector);
+
+  effect(USE meth);
+
+  ins_cost(CALL_COST);
+
+  format %{ "CALL, runtime leaf vector $meth" %}
+
+  ins_encode(aarch64_enc_java_to_runtime(meth));
+
+  ins_pipe(pipe_class_call);
+%}
+
 // Call Runtime Instruction

 instruct CallLeafNoFPDirect(method meth)
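
A note on the OptoRegPair returned by vector_return_value above: the AD file splits each 128-bit vector register into four 32-bit OptoReg slots (V0, V0_H, V0_J, V0_K), so (V0_H_num, V0_num) describes a 64-bit return value in v0 and (V0_K_num, V0_num) a 128-bit one; Op_VecA (SVE) also reports the low 128 bits. A minimal C sketch of that selection, with hypothetical slot indices standing in for the generated register names:

#include <stdio.h>

/* Hypothetical 32-bit slot indices within v0, standing in for the
 * V0_num / V0_H_num / V0_K_num OptoReg names generated from the AD file. */
enum { V0_num = 0, V0_H_num = 1, V0_K_num = 3 };
enum ideal_reg_t { Op_VecD, Op_VecX, Op_VecA };

/* Mirror of vector_return_value: pick the (hi, lo) slot pair within v0. */
static void vector_return_slots(enum ideal_reg_t ideal_reg, int* hi, int* lo) {
  *lo = V0_num;
  *hi = V0_H_num;                     /* 64-bit vector: slots 0..1 of v0 */
  if (ideal_reg == Op_VecX || ideal_reg == Op_VecA) {
    *hi = V0_K_num;                   /* 128-bit or SVE vector: slots 0..3 */
  }
}

int main(void) {
  int hi, lo;
  vector_return_slots(Op_VecA, &hi, &lo);
  printf("SVE return value: v0 slots [%d..%d]\n", lo, hi);
  return 0;
}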

src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp

Lines changed: 14 additions & 1 deletion

@@ -871,7 +871,20 @@ static int c_calling_convention_priv(const BasicType *sig_bt,
 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
                                              uint num_bits,
                                              uint total_args_passed) {
-  Unimplemented();
+  // More than 8 vector arguments are not supported for now.
+  assert(total_args_passed <= Argument::n_float_register_parameters_c, "unsupported");
+  assert(num_bits >= 64 && num_bits <= 2048 && is_power_of_2(num_bits), "unsupported");
+
+  static const FloatRegister VEC_ArgReg[Argument::n_float_register_parameters_c] = {
+    v0, v1, v2, v3, v4, v5, v6, v7
+  };
+
+  // On SVE, vector arguments use the same registers as 128-bit NEON vectors.
+  int next_reg_val = num_bits == 64 ? 1 : 3;
+  for (uint i = 0; i < total_args_passed; i++) {
+    VMReg vmreg = VEC_ArgReg[i]->as_VMReg();
+    regs[i].set_pair(vmreg->next(next_reg_val), vmreg);
+  }
   return 0;
 }
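
The register pairs above follow from each VMReg naming a 32-bit slice of a v register: next(1) spans two slices (a 64-bit vector), next(3) spans four (a 128-bit vector, which is also what SVE arguments occupy). A small sketch of the resulting assignment, using a hypothetical flat slot numbering with four slots per v register:

#include <stdio.h>

/* Sketch of the argument-to-register mapping above: argument i lands in
 * v<i>, spanning 2 slots for 64-bit vectors or 4 slots otherwise. */
int main(void) {
  unsigned num_bits = 128;                   /* vector width in bits */
  unsigned total_args_passed = 3;            /* e.g. three vector arguments */
  unsigned next_reg_val = (num_bits == 64) ? 1 : 3;

  for (unsigned i = 0; i < total_args_passed; i++) {
    unsigned lo = i * 4;                     /* first 32-bit slot of v<i> */
    unsigned hi = lo + next_reg_val;         /* last slot of the pair */
    printf("arg %u -> v%u, slots [%u..%u]\n", i, i, lo, hi);
  }
  return 0;
}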

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 77 additions & 0 deletions

@@ -42,6 +42,7 @@
 #include "oops/oop.inline.hpp"
 #include "prims/methodHandles.hpp"
 #include "prims/upcallLinker.hpp"
+#include "runtime/arguments.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/continuation.hpp"
 #include "runtime/continuationEntry.inline.hpp"

@@ -8176,6 +8177,78 @@ class StubGenerator: public StubCodeGenerator {
   // }
   };

+  void generate_vector_math_stubs() {
+    // Get native vector math stub routine addresses
+    void* libsleef = nullptr;
+    char ebuf[1024];
+    char dll_name[JVM_MAXPATHLEN];
+    if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
+      libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
+    }
+    if (libsleef == nullptr) {
+      log_info(library)("Failed to load native vector math library, %s!", ebuf);
+      return;
+    }
+    // Method naming convention
+    //   All the methods are named as <OP><T><N>_<U><suffix>
+    //   Where:
+    //     <OP>     is the operation name, e.g. sin
+    //     <T>      is optional to indicate float/double
+    //              "f/d" for vector float/double operation
+    //     <N>      is the number of elements in the vector
+    //              "2/4" for neon, and "x" for sve
+    //     <U>      is the precision level
+    //              "u10/u05" represents 1.0/0.5 ULP error bounds
+    //              We use "u10" for all operations by default,
+    //              but for those functions that do not have u10 support, we use "u05" instead
+    //     <suffix> indicates neon/sve
+    //              "sve/advsimd" for sve/neon implementations
+    //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
+    //          cosd2_u10advsimd is the method for computing 2-element vector double cos using NEON instructions
+    //
+    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
+
+    // Math vector stubs implemented with SVE for scalable vector size.
+    if (UseSVE > 0) {
+      for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
+        int vop = VectorSupport::VECTOR_OP_MATH_START + op;
+        // Skip "tanh" because there is a performance regression
+        if (vop == VectorSupport::VECTOR_OP_TANH) {
+          continue;
+        }
+
+        // The native library does not support the u10 level of "hypot".
+        const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
+
+        snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
+        StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
+
+        snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
+        StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
+      }
+    }
+
+    // Math vector stubs implemented with NEON for 64/128 bits vector size.
+    for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
+      int vop = VectorSupport::VECTOR_OP_MATH_START + op;
+      // Skip "tanh" because there is a performance regression
+      if (vop == VectorSupport::VECTOR_OP_TANH) {
+        continue;
+      }

+      // The native library does not support the u10 level of "hypot".
+      const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
+
+      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
+      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
+
+      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
+      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
+
+      snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
+      StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
+    }
+  }

 // Initialization
 void generate_initial_stubs() {

@@ -8329,6 +8402,9 @@ class StubGenerator: public StubCodeGenerator {
       // because it's faster for the sizes of modulus we care about.
       StubRoutines::_montgomerySquare = g.generate_multiply();
     }
+
+    generate_vector_math_stubs();
+
 #endif // COMPILER2

     if (UseChaCha20Intrinsics) {

@@ -8384,6 +8460,7 @@ class StubGenerator: public StubCodeGenerator {
     if (UseAdler32Intrinsics) {
       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
     }
+
 #endif // COMPILER2_OR_JVMCI
   }
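
Putting the naming convention into practice: the lookup symbol is assembled by plain snprintf concatenation, so for op "sin" the code asks the library for "sinfx_u10sve", "sinf4_u10advsimd" and "sind2_u10advsimd". A self-contained sketch of the same pattern (the mathname table here is an abbreviated stand-in for VectorSupport::mathname, illustration only):

#include <stdio.h>
#include <string.h>

/* Abbreviated stand-in for VectorSupport::mathname[]; illustration only. */
static const char* const mathname[] = { "sin", "cos", "pow", "hypot" };

int main(void) {
  char ebuf[1024];
  for (int op = 0; op < 4; op++) {
    /* SLEEF provides "hypot" only at the u05 precision level. */
    const char* ulf = (strcmp(mathname[op], "hypot") == 0) ? "u05" : "u10";

    snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", mathname[op], ulf);
    printf("SVE float:   %s\n", ebuf);   /* e.g. sinfx_u10sve */

    snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", mathname[op], ulf);
    printf("NEON float:  %s\n", ebuf);   /* e.g. sinf4_u10advsimd */

    snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", mathname[op], ulf);
    printf("NEON double: %s\n", ebuf);   /* e.g. sind2_u10advsimd */
  }
  return 0;
}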

src/jdk.incubator.vector/linux/native/libsleef/lib/vector_math_neon.c

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include <stdint.h>
+#include <arm_neon.h>
+
+#include "../generated/misc.h"
+#include "../generated/sleefinline_advsimd.h"
+
+
+#include <jni.h>
+
+#define DEFINE_VECTOR_MATH_UNARY(op, type) \
+JNIEXPORT                                  \
+type op##advsimd(type input) {             \
+  return Sleef_##op##advsimd(input);       \
+}
+
+#define DEFINE_VECTOR_MATH_BINARY(op, type)   \
+JNIEXPORT                                     \
+type op##advsimd(type input1, type input2) {  \
+  return Sleef_##op##advsimd(input1, input2); \
+}
+
+DEFINE_VECTOR_MATH_UNARY(tanf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(tanhf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(sinf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(sinhf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(cosf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(coshf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(asinf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(acosf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(atanf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(cbrtf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(logf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(log10f4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(log1pf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(expf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_UNARY(expm1f4_u10, float32x4_t)
+
+DEFINE_VECTOR_MATH_UNARY(tand2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(tanhd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(sind2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(sinhd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(cosd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(coshd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(asind2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(acosd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(atand2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(cbrtd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(logd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(log10d2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(log1pd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(expd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_UNARY(expm1d2_u10, float64x2_t)
+
+DEFINE_VECTOR_MATH_BINARY(atan2f4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_BINARY(powf4_u10, float32x4_t)
+DEFINE_VECTOR_MATH_BINARY(hypotf4_u05, float32x4_t)
+
+DEFINE_VECTOR_MATH_BINARY(atan2d2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_BINARY(powd2_u10, float64x2_t)
+DEFINE_VECTOR_MATH_BINARY(hypotd2_u05, float64x2_t)
+
+#undef DEFINE_VECTOR_MATH_UNARY
+
+#undef DEFINE_VECTOR_MATH_BINARY
+
+#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
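
For reference, each DEFINE_VECTOR_MATH_UNARY line expands to a thin exported wrapper whose name matches the lookup convention used in stubGenerator_aarch64.cpp: the ## token pasting appends "advsimd" to both the wrapper and the SLEEF inline kernel it forwards to. Approximate expansion of the first float entry:

/* What DEFINE_VECTOR_MATH_UNARY(tanf4_u10, float32x4_t) expands to
 * (approximately): a JNIEXPORT'ed symbol "tanf4_u10advsimd" that simply
 * forwards to the SLEEF inline implementation of the same name. */
JNIEXPORT
float32x4_t tanf4_u10advsimd(float32x4_t input) {
  return Sleef_tanf4_u10advsimd(input);
}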
