From e98e5ef446ef919c5b1a4b146a1e6d4e32381e11 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 30 May 2023 11:51:00 -0700 Subject: [PATCH 01/40] 8309130: x86_64 AVX512 intrinsics for Arrays.sort methods (int, long, float and double arrays) --- make/modules/java.base/Lib.gmk | 21 + src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 26 + src/hotspot/share/classfile/vmIntrinsics.hpp | 10 + src/hotspot/share/opto/c2compiler.cpp | 4 + src/hotspot/share/opto/library_call.cpp | 59 ++ src/hotspot/share/opto/library_call.hpp | 2 +- src/hotspot/share/opto/runtime.cpp | 19 + src/hotspot/share/opto/runtime.hpp | 1 + src/hotspot/share/runtime/stubRoutines.cpp | 17 + src/hotspot/share/runtime/stubRoutines.hpp | 5 + src/hotspot/share/runtime/vmStructs.cpp | 4 + .../libavx512_x86_64/avx512-32bit-qsort.hpp | 601 +++++++++++++ .../libavx512_x86_64/avx512-64bit-common.h | 588 ++++++++++++ .../libavx512_x86_64/avx512-64bit-qsort.hpp | 834 ++++++++++++++++++ .../libavx512_x86_64/avx512-common-qsort.h | 521 +++++++++++ .../libavx512_x86_64/avxsort_linux_x86.cpp | 54 ++ .../share/classes/java/util/Arrays.java | 36 +- .../openjdk/bench/java/util/ArraysSort.java | 114 +++ 18 files changed, 2907 insertions(+), 9 deletions(-) create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h create mode 100644 src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp create mode 100644 test/micro/org/openjdk/bench/java/util/ArraysSort.java diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index d6ca293291470..5ec5d03d59c07 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -230,3 +230,24 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true) TARGETS += 
$(BUILD_LIBFALLBACKLINKER) endif + +################################################################################ + +ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) + $(eval $(call SetupJdkLibrary, BUILD_LIBAVX512_X86_64, \ + NAME := avx512_x86_64, \ + OPTIMIZATION := HIGH, \ + CFLAGS := $(CFLAGS_JDKLIB) -mavx512f -mavx512dq, \ + CXXFLAGS := $(CXXFLAGS_JDKLIB) -mavx512f -mavx512dq, \ + LDFLAGS := $(LDFLAGS_JDKLIB) \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ + LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \ + LDFLAGS_windows := -defaultlib:msvcrt, \ + LIBS := $(LIBCXX), \ + LIBS_linux := -lc -lm -ldl -lstdc++, \ + )) + + TARGETS += $(BUILD_LIBAVX512_X86_64) +endif + +################################################################################ diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 6cd1765151492..f1a8d4928488b 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4126,6 +4126,32 @@ void StubGenerator::generate_compiler_stubs() { = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); } + // Get avx512 sort stub routine addresses. The library is only located and loaded + // when UseAVX > 2 and the CPU reports AVX512DQ; its entry points are unused otherwise. + if (UseAVX > 2 && VM_Version::supports_avx512dq()) { + void *libavx512_x86_64 = nullptr; + char ebuf_avx512[1024]; + char dll_name_avx512[JVM_MAXPATHLEN]; + if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "avx512_x86_64")) { + libavx512_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512); + } + if (libavx512_x86_64 != nullptr) { + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "avx512_x86_64" JNI_LIB_SUFFIX, p2i(libavx512_x86_64)); + + snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_int"); + StubRoutines::_arraysort_int = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + + snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_long"); + 
StubRoutines::_arraysort_long = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + + snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_float"); + StubRoutines::_arraysort_float = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + + snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_double"); + StubRoutines::_arraysort_double = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + } + } + // Get svml stub routine addresses void *libjsvml = nullptr; char ebuf[1024]; diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 86d5cc9ce5f8c..de02d4bad0092 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -341,6 +341,16 @@ class methodHandle; do_name( copyOf_name, "copyOf") \ do_signature(copyOf_signature, "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;") \ \ + do_intrinsic(_arraySortI, java_util_Arrays, arraySort_name, arraySortI_signature, F_S) \ + do_name( arraySort_name, "arraySort") \ + do_signature(arraySortI_signature, "([III)V") \ + do_intrinsic(_arraySortL, java_util_Arrays, arraySort_name, arraySortL_signature, F_S) \ + do_signature(arraySortL_signature, "([JII)V") \ + do_intrinsic(_arraySortF, java_util_Arrays, arraySort_name, arraySortF_signature, F_S) \ + do_signature(arraySortF_signature, "([FII)V") \ + do_intrinsic(_arraySortD, java_util_Arrays, arraySort_name, arraySortD_signature, F_S) \ + do_signature(arraySortD_signature, "([DII)V") \ + \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ do_name( copyOfRange_name, "copyOfRange") \ do_signature(copyOfRange_signature, "([Ljava/lang/Object;IILjava/lang/Class;)[Ljava/lang/Object;") \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index e26c992d55827..c904c49d22046 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -575,6 
+575,10 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) { case vmIntrinsics::_min_strict: case vmIntrinsics::_max_strict: case vmIntrinsics::_arraycopy: + case vmIntrinsics::_arraySortI: + case vmIntrinsics::_arraySortL: + case vmIntrinsics::_arraySortF: + case vmIntrinsics::_arraySortD: case vmIntrinsics::_indexOfL: case vmIntrinsics::_indexOfU: case vmIntrinsics::_indexOfUL: diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index f2e095a4d1740..21eb6b4483064 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -292,6 +292,11 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_arraycopy: return inline_arraycopy(); + case vmIntrinsics::_arraySortI: + case vmIntrinsics::_arraySortL: + case vmIntrinsics::_arraySortF: + case vmIntrinsics::_arraySortD: return inline_arraysort(intrinsic_id()); + case vmIntrinsics::_compareToL: return inline_string_compareTo(StrIntrinsicNode::LL); case vmIntrinsics::_compareToU: return inline_string_compareTo(StrIntrinsicNode::UU); case vmIntrinsics::_compareToLU: return inline_string_compareTo(StrIntrinsicNode::LU); @@ -5192,6 +5197,60 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_ uncommon_trap_call->set_req(0, top()); // not used anymore, kill it } +//------------------------------inline_arraysort----------------------- +bool LibraryCallKit::inline_arraysort(vmIntrinsics::ID id) { + + address stubAddr = nullptr; + const char *stubName; + stubName = "arraysort_stub"; + BasicType bt; + + switch(id) { + case vmIntrinsics::_arraySortI: + bt = T_INT; + break; + case vmIntrinsics::_arraySortL: + bt = T_LONG; + break; + case vmIntrinsics::_arraySortF: + bt = T_FLOAT; + break; + case vmIntrinsics::_arraySortD: + bt = T_DOUBLE; + break; + default: + return false; // unexpected id: bail out instead of reading bt uninitialized + } + + stubAddr = StubRoutines::select_arraysort_function(bt); + if (stubAddr == nullptr) return false; + + Node* 
array = argument(0); + Node* fromIndex = argument(1); + Node* toIndex = argument(2); + + array = must_be_not_null(array, true); + + const TypeAryPtr* array_type = array->Value(&_gvn)->isa_aryptr(); + assert(array_type != nullptr && array_type->elem() != Type::BOTTOM, "args are strange"); + + // NOTE(review): no range checks on fromIndex/toIndex are emitted here; + // presumably the Java caller validates them before the intrinsic - confirm. + Node* array_fromIndex = array; + if (fromIndex != nullptr || toIndex != nullptr) { + assert(fromIndex != nullptr && toIndex != nullptr, ""); + array_fromIndex = array_element_address(array, fromIndex, bt); + } + + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + array_fromIndex, fromIndex, toIndex); + + return true; +} + + //------------------------------inline_arraycopy----------------------- // public static native void java.lang.System.arraycopy(Object src, int srcPos, // Object dest, int destPos, diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index 46dd51bf654a9..52725e87080f1 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -279,7 +279,7 @@ class LibraryCallKit : public GraphKit { JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp); void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp, uint new_idx); - + bool inline_arraysort(vmIntrinsics::ID id); typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind; bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind); bool inline_unsafe_fence(vmIntrinsics::ID id); diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 6cc044962c2f8..cd556c2d85cd9 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ 
b/src/hotspot/share/opto/runtime.cpp @@ -857,6 +857,25 @@ const TypeFunc* OptoRuntime::array_fill_Type() { return TypeFunc::make(domain, range); } +const TypeFunc* OptoRuntime::array_sort_Type() { + // create input type (domain) + int num_args = 3; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // array(fromIndex) + fields[argp++] = TypeInt::INT; // fromIndex + fields[argp++] = TypeInt::INT; // toIndex + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = nullptr; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + // for aescrypt encrypt/decrypt operations, just three pointers returning void (length is constant) const TypeFunc* OptoRuntime::aescrypt_block_Type() { // create input type (domain) diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp index cd13c14148d71..e4d5f749d3efa 100644 --- a/src/hotspot/share/opto/runtime.hpp +++ b/src/hotspot/share/opto/runtime.hpp @@ -268,6 +268,7 @@ class OptoRuntime : public AllStatic { static const TypeFunc* array_fill_Type(); + static const TypeFunc* array_sort_Type(); static const TypeFunc* aescrypt_block_Type(); static const TypeFunc* cipherBlockChaining_aescrypt_Type(); static const TypeFunc* electronicCodeBook_aescrypt_Type(); diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 7a6974088ba43..e5b39646f52f0 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -175,6 +175,11 @@ address StubRoutines::_hf2f = nullptr; address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; address 
StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; +address StubRoutines::_arraysort_int = nullptr; +address StubRoutines::_arraysort_long = nullptr; +address StubRoutines::_arraysort_float = nullptr; +address StubRoutines::_arraysort_double = nullptr; + address StubRoutines::_cont_thaw = nullptr; address StubRoutines::_cont_returnBarrier = nullptr; address StubRoutines::_cont_returnBarrierExc = nullptr; @@ -647,3 +652,15 @@ UnsafeCopyMemoryMark::~UnsafeCopyMemoryMark() { } } } + +address StubRoutines::select_arraysort_function(BasicType t) { + switch(t) { + case T_INT: return _arraysort_int; + case T_LONG: return _arraysort_long; + case T_FLOAT: return _arraysort_float; + case T_DOUBLE: return _arraysort_double; + default: + ShouldNotReachHere(); + return nullptr; + } +} diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 5ce9176f08a2f..0e54f43e93646 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -153,6 +153,10 @@ class StubRoutines: AllStatic { static BufferBlob* _compiler_stubs_code; // code buffer for C2 intrinsics static BufferBlob* _final_stubs_code; // code buffer for all other routines + static address _arraysort_int; + static address _arraysort_long; + static address _arraysort_float; + static address _arraysort_double; // Leaf routines which implement arraycopy and their addresses // arraycopy operands aligned on element type boundary static address _jbyte_arraycopy; @@ -372,6 +376,7 @@ class StubRoutines: AllStatic { static UnsafeArrayCopyStub UnsafeArrayCopy_stub() { return CAST_TO_FN_PTR(UnsafeArrayCopyStub, _unsafe_arraycopy); } static address generic_arraycopy() { return _generic_arraycopy; } + static address select_arraysort_function(BasicType t); static address jbyte_fill() { return _jbyte_fill; } static address jshort_fill() { return _jshort_fill; } diff --git 
a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 37241534b2b7e..0b252d1c53760 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -588,6 +588,10 @@ static_field(StubRoutines, _checkcast_arraycopy_uninit, address) \ static_field(StubRoutines, _unsafe_arraycopy, address) \ static_field(StubRoutines, _generic_arraycopy, address) \ + static_field(StubRoutines, _arraysort_int, address) \ + static_field(StubRoutines, _arraysort_long, address) \ + static_field(StubRoutines, _arraysort_float, address) \ + static_field(StubRoutines, _arraysort_double, address) \ \ /*****************/ \ /* SharedRuntime */ \ diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp new file mode 100644 index 0000000000000..05efac20cbdb2 --- /dev/null +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Intel x86-simd-sort source code. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +#ifndef AVX512_QSORT_32BIT +#define AVX512_QSORT_32BIT + +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +#define NETWORK_32BIT_1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +#define NETWORK_32BIT_2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_32BIT_3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_32BIT_4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_32BIT_5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 +#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + +template <> +struct zmm_vector { + using type_t = int32_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; } + static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; } + static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } + + static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); + } + template + static ymm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) { + zmm_t z1 = _mm512_castsi256_si512(y1); + return _mm512_inserti32x8(z1, y2, 1); + } + static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return 
_mm512_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm512_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm512_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); } + static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) { + return _mm512_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi32(v); } + static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi32(v); } + static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } + template + static zmm_t shuffle(zmm_t zmm) { + return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) { + return _mm512_storeu_si512(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); } + static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); } +}; +template <> +struct zmm_vector { + using type_t = uint32_t; + using zmm_t = __m512i; + using ymm_t = __m256i; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; } + static type_t type_min() { return 0; } + static zmm_t zmm_max() { + return _mm512_set1_epi32(type_max()); + } // TODO: this should broadcast bits as is? 
+ + template + static ymm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) { + zmm_t z1 = _mm512_castsi256_si512(y1); + return _mm512_inserti32x8(z1, y2, 1); + } + static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); + } + static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } + static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm512_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm512_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) { + return _mm512_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu32(v); } + static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu32(v); } + static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } + template + static zmm_t shuffle(zmm_t zmm) { + return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) { + return _mm512_storeu_si512(mem, x); + } + + static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); } + static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); } +}; +template <> +struct zmm_vector { + using type_t = float; + using zmm_t = __m512; + using ymm_t = __m256; + using opmask_t = __mmask16; + static const uint8_t numlanes = 16; + + static type_t type_max() { return 
X86_SIMD_SORT_INFINITYF; } + static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; } + static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); } + + static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + } + template + static ymm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_ps(index, base, scale); + } + static zmm_t merge(ymm_t y1, ymm_t y2) { + zmm_t z1 = _mm512_castsi512_ps( + _mm512_castsi256_si512(_mm256_castps_si256(y1))); + return _mm512_insertf32x8(z1, y2, 1); + } + static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); } + static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_compressstoreu_ps(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm512_mask_loadu_ps(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm512_mask_mov_ps(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_storeu_ps(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) { + return _mm512_permutexvar_ps(idx, zmm); + } + static type_t reducemax(zmm_t v) { return _mm512_reduce_max_ps(v); } + static type_t reducemin(zmm_t v) { return _mm512_reduce_min_ps(v); } + static zmm_t set1(type_t v) { return _mm512_set1_ps(v); } + template + static zmm_t shuffle(zmm_t zmm) { + return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); } + + static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); } + static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); } +}; + +/* + * Assumes zmm is random and performs a full sorting network defined 
in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm) { + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAAAA); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xCCCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAAAA); + zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_3), zmm), + 0xF0F0); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xCCCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAAAA); + zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm), + 0xFF00); + zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), + 0xF0F0); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xCCCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAAAA); + return zmm; +} + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) { + // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. + zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm), + 0xFF00); + // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
+ zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm), + 0xF0F0); + // 3) half_cleaner[4] + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xCCCC); + // 3) half_cleaner[1] + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAAAA); + return zmm; +} + +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, + zmm_t *zmm2) { + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2); + zmm_t zmm3 = vtype::min(*zmm1, *zmm2); + zmm_t zmm4 = vtype::max(*zmm1, *zmm2); + // 2) Recursive half cleaner for each + *zmm1 = bitonic_merge_zmm_32bit(zmm3); + *zmm2 = bitonic_merge_zmm_32bit(zmm4); +} + +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template +X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) { + zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]); + zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_32bit(zmm0); + zmm[1] = bitonic_merge_zmm_32bit(zmm1); + zmm[2] = bitonic_merge_zmm_32bit(zmm2); + zmm[3] = bitonic_merge_zmm_32bit(zmm3); +} + +template +X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) { + zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]); + zmm_t zmm5r = 
vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]); + zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]); + zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); + zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[3], zmm4r)); + zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[2], zmm5r)); + zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[1], zmm6r)); + zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), + vtype::max(zmm[0], zmm7r)); + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_32bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_32bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_32bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_32bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_32bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_32bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_32bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_32bit(zmm_t8); +} + +template +X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N) { + typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001; + typename vtype::zmm_t zmm = + vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, load_mask, sort_zmm_32bit(zmm)); +} + +template +X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N) { + if (N <= 16) { + sort_16_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + zmm_t zmm1 = vtype::loadu(arr); + typename vtype::opmask_t load_mask = (0x0001 << (N - 16)) - 0x0001; + zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); + zmm1 = 
sort_zmm_32bit(zmm1); + zmm2 = sort_zmm_32bit(zmm2); + bitonic_merge_two_zmm_32bit(&zmm1, &zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 16, load_mask, zmm2); +} + +template +X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N) { + if (N <= 32) { + sort_32_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 16); + opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; + load_mask1 &= combined_mask & 0xFFFF; + load_mask2 &= (combined_mask >> 16) & 0xFFFF; + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); + zmm[0] = sort_zmm_32bit(zmm[0]); + zmm[1] = sort_zmm_32bit(zmm[1]); + zmm[2] = sort_zmm_32bit(zmm[2]); + zmm[3] = sort_zmm_32bit(zmm[3]); + bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); + bitonic_merge_four_zmm_32bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 16, zmm[1]); + vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); +} + +template +X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N) { + if (N <= 64) { + sort_64_32bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[8]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 16); + zmm[2] = vtype::loadu(arr + 32); + zmm[3] = vtype::loadu(arr + 48); + zmm[0] = sort_zmm_32bit(zmm[0]); + zmm[1] = sort_zmm_32bit(zmm[1]); + zmm[2] = sort_zmm_32bit(zmm[2]); + zmm[3] = sort_zmm_32bit(zmm[3]); + opmask_t load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; + opmask_t load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 &= combined_mask & 0xFFFF; + 
load_mask2 &= (combined_mask >> 16) & 0xFFFF; + load_mask3 &= (combined_mask >> 32) & 0xFFFF; + load_mask4 &= (combined_mask >> 48) & 0xFFFF; + } + zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); + zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); + zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); + zmm[4] = sort_zmm_32bit(zmm[4]); + zmm[5] = sort_zmm_32bit(zmm[5]); + zmm[6] = sort_zmm_32bit(zmm[6]); + zmm[7] = sort_zmm_32bit(zmm[7]); + bitonic_merge_two_zmm_32bit(&zmm[0], &zmm[1]); + bitonic_merge_two_zmm_32bit(&zmm[2], &zmm[3]); + bitonic_merge_two_zmm_32bit(&zmm[4], &zmm[5]); + bitonic_merge_two_zmm_32bit(&zmm[6], &zmm[7]); + bitonic_merge_four_zmm_32bit(zmm); + bitonic_merge_four_zmm_32bit(zmm + 4); + bitonic_merge_eight_zmm_32bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 16, zmm[1]); + vtype::storeu(arr + 32, zmm[2]); + vtype::storeu(arr + 48, zmm[3]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); + vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); + vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); + vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); +} + +template +X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, const int64_t left, + const int64_t right) { + // median of 16 + int64_t size = (right - left) / 16; + using zmm_t = typename vtype::zmm_t; + using ymm_t = typename vtype::ymm_t; + __m512i rand_index1 = _mm512_set_epi64( + left + size, left + 2 * size, left + 3 * size, left + 4 * size, + left + 5 * size, left + 6 * size, left + 7 * size, left + 8 * size); + __m512i rand_index2 = _mm512_set_epi64( + left + 9 * size, left + 10 * size, left + 11 * size, left + 12 * size, + left + 13 * size, left + 14 * size, left + 15 * size, left + 16 * size); + ymm_t rand_vec1 = + vtype::template i64gather(rand_index1, arr); + ymm_t rand_vec2 = + vtype::template i64gather(rand_index2, arr); + zmm_t rand_vec = 
vtype::merge(rand_vec1, rand_vec2); + zmm_t sort = sort_zmm_32bit(rand_vec); + // pivot is never a NaN: the float entry points run replace_nan_with_inf first + return ((type_t *)&sort)[8]; +} + +template +static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, + int64_t max_iters) { + /* + * Fall back to std::sort once max_iters is used up (quicksort isn't making progress) + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort ranges of <= 128 elements + */ + if (right + 1 - left <= 128) { + sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_32bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512_unrolled( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_32bit_(arr, pivot_index, right, max_iters - 1); +} + +template +static void qselect_32bit_(type_t *arr, int64_t pos, int64_t left, + int64_t right, int64_t max_iters) { + /* + * Fall back to std::sort of the whole range once max_iters is used up (quickselect isn't making progress) + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: fully sort ranges of <= 128 elements with bitonic networks + */ + if (right + 1 - left <= 128) { + sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_32bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512_unrolled( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_32bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_32bit_(arr, pos, pivot_index, right, max_iters - 1); +} + +X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) {
+ int64_t nan_count = 0; + __mmask16 loadmask = 0xFFFF; + while (arrsize > 0) { + if (arrsize < 16) { + loadmask = (0x0001 << arrsize) - 0x0001; /* tail: enable only the low arrsize lanes */ + } + __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); + __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); /* x != x holds only for NaN lanes */ + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); /* overwrite only the NaN lanes; ZMM_MAX_FLOAT is defined outside this view, presumably +inf */ + arr += 16; + arrsize -= 16; + } + return nan_count; +} + +X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize, + int64_t nan_count) { + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = std::nanf("1"); /* writes a fresh NaN into the last nan_count slots; the input NaN bit patterns are not kept */ + nan_count -= 1; + } +} + +template <> +void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize) { + if (arrsize > 1) { + qselect_32bit_, int32_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize) { + if (arrsize > 1) { + qselect_32bit_, uint32_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(float *arr, int64_t k, int64_t arrsize) { + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_32bit_, float>(arr, k, 0, arrsize - 1, + 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); /* NOTE(review): this overwrites the last nan_count slots unconditionally, but qselect_32bit_ only orders the side containing k, so those slots are not guaranteed to hold the substituted values - confirm */ + } +} + +template <> +void avx512_qsort(int32_t *arr, int64_t arrsize) { + if (arrsize > 1) { + qsort_32bit_, int32_t>(arr, 0, arrsize - 1, + 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint32_t *arr, int64_t arrsize) { + if (arrsize > 1) { + qsort_32bit_, uint32_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(float *arr, int64_t arrsize) { + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_32bit_, float>(arr, 0, arrsize - 1, + 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); /* after the sort the substituted values occupy the tail, which is rewritten as NaN */ + } +} + +#endif // AVX512_QSORT_32BIT diff --git
a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h new file mode 100644 index 0000000000000..88fee99c0d79e --- /dev/null +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Intel x86-simd-sort source code. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AVX512_64BIT_COMMON +#define AVX512_64BIT_COMMON +#include "avx512-common-qsort.h" + +/* + * Constants used in sorting 8 elements in a ZMM registers. 
Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM 7, 6, 5, 4, 3, 2, 1, 0 +#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 + +template <> +struct ymm_vector { + using type_t = float; + using zmm_t = __m256; + using zmmi_t = __m256i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() { return X86_SIMD_SORT_INFINITYF; } + static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; } + static zmm_t zmm_max() { return _mm256_set1_ps(type_max()); } + + static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + } + static opmask_t kxor_opmask(opmask_t x, opmask_t y) { + return _kxor_mask8(x, y); + } + static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } + static opmask_t le(zmm_t x, zmm_t y) { + return _mm256_cmp_ps_mask(x, y, _CMP_LE_OQ); + } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm256_cmp_ps_mask(x, y, _CMP_GE_OQ); + } + static opmask_t eq(zmm_t x, zmm_t y) { + return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ); + } + template + static opmask_t fpclass(zmm_t x) { + return _mm256_fpclass_ps_mask(x, type); + } + template + static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, + void const *base) { + return _mm512_mask_i64gather_ps(src, mask, index, base, scale); + } + template + static zmm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_ps(index, base, scale); + } + static zmm_t loadu(void const *mem) { + return _mm256_loadu_ps((float *)mem); + } + static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_ps(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm256_mask_compressstoreu_ps(mem, mask, x); + } + static zmm_t 
maskz_loadu(opmask_t mask, void const *mem) { + return _mm256_maskz_loadu_ps(mask, mem); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm256_mask_loadu_ps(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm256_mask_mov_ps(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm256_mask_storeu_ps(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_ps(x, y); } + static zmm_t permutexvar(__m256i idx, zmm_t zmm) { + return _mm256_permutexvar_ps(idx, zmm); + } + static type_t reducemax(zmm_t v) { + __m128 v128 = + _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1)); + __m128 v64 = _mm_max_ps( + v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2))); + __m128 v32 = + _mm_max_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); + return _mm_cvtss_f32(v32); + } + static type_t reducemin(zmm_t v) { + __m128 v128 = + _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1)); + __m128 v64 = _mm_min_ps( + v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2))); + __m128 v32 = + _mm_min_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); + return _mm_cvtss_f32(v32); + } + static zmm_t set1(type_t v) { return _mm256_set1_ps(v); } + template + static zmm_t shuffle(zmm_t zmm) { + /* Hack!: have to make shuffles within 128-bit lanes work for both + * 32-bit and 64-bit */ + return _mm256_shuffle_ps(zmm, zmm, 0b10110001); + // if constexpr (mask == 0b01010101) { + // } + // else { + // /* Not used, so far */ + // return _mm256_shuffle_ps(zmm, zmm, mask); + // } + } + static void storeu(void *mem, zmm_t x) { + _mm256_storeu_ps((float *)mem, x); + } +}; +template <> +struct ymm_vector { + using type_t = uint32_t; + using zmm_t = __m256i; + using zmmi_t = __m256i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; } + static 
type_t type_min() { return 0; } + static zmm_t zmm_max() { return _mm256_set1_epi32(type_max()); } + + static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + } + static opmask_t kxor_opmask(opmask_t x, opmask_t y) { + return _kxor_mask8(x, y); + } + static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } + static opmask_t le(zmm_t x, zmm_t y) { + return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_LE); + } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); + } + static opmask_t eq(zmm_t x, zmm_t y) { + return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ); + } + template + static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, + void const *base) { + return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); + } + template + static zmm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t loadu(void const *mem) { + return _mm256_loadu_si256((__m256i *)mem); + } + static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epu32(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm256_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t maskz_loadu(opmask_t mask, void const *mem) { + return _mm256_maskz_loadu_epi32(mask, mem); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm256_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm256_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm256_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epu32(x, y); } + static zmm_t permutexvar(__m256i idx, zmm_t zmm) { + return _mm256_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) { + __m128i v128 = 
_mm_max_epu32(_mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); + __m128i v64 = _mm_max_epu32( + v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); + __m128i v32 = + _mm_max_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); + return (type_t)_mm_cvtsi128_si32(v32); + } + static type_t reducemin(zmm_t v) { + __m128i v128 = _mm_min_epu32(_mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); + __m128i v64 = _mm_min_epu32( + v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); + __m128i v32 = + _mm_min_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); + return (type_t)_mm_cvtsi128_si32(v32); + } + static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); } + template + static zmm_t shuffle(zmm_t zmm) { + /* Hack!: have to make shuffles within 128-bit lanes work for both + * 32-bit and 64-bit */ + return _mm256_shuffle_epi32(zmm, 0b10110001); + } + static void storeu(void *mem, zmm_t x) { + _mm256_storeu_si256((__m256i *)mem, x); + } +}; +template <> +struct ymm_vector { + using type_t = int32_t; + using zmm_t = __m256i; + using zmmi_t = __m256i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; } + static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; } + static zmm_t zmm_max() { + return _mm256_set1_epi32(type_max()); + } // TODO: this should broadcast bits as is? 
+ + static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + } + static opmask_t kxor_opmask(opmask_t x, opmask_t y) { + return _kxor_mask8(x, y); + } + static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } + static opmask_t le(zmm_t x, zmm_t y) { + return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_LE); + } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); + } + static opmask_t eq(zmm_t x, zmm_t y) { + return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ); + } + template + static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, + void const *base) { + return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); + } + template + static zmm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_epi32(index, base, scale); + } + static zmm_t loadu(void const *mem) { + return _mm256_loadu_si256((__m256i *)mem); + } + static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epi32(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm256_mask_compressstoreu_epi32(mem, mask, x); + } + static zmm_t maskz_loadu(opmask_t mask, void const *mem) { + return _mm256_maskz_loadu_epi32(mask, mem); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm256_mask_loadu_epi32(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm256_mask_mov_epi32(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm256_mask_storeu_epi32(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epi32(x, y); } + static zmm_t permutexvar(__m256i idx, zmm_t zmm) { + return _mm256_permutexvar_epi32(idx, zmm); + } + static type_t reducemax(zmm_t v) { + __m128i v128 = _mm_max_epi32(_mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); + __m128i v64 = _mm_max_epi32( + 
v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); + __m128i v32 = + _mm_max_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); + return (type_t)_mm_cvtsi128_si32(v32); + } + static type_t reducemin(zmm_t v) { + __m128i v128 = _mm_min_epi32(_mm256_castsi256_si128(v), + _mm256_extracti128_si256(v, 1)); + __m128i v64 = _mm_min_epi32( + v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); + __m128i v32 = + _mm_min_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); + return (type_t)_mm_cvtsi128_si32(v32); + } + static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); } + template + static zmm_t shuffle(zmm_t zmm) { + /* Hack!: have to make shuffles within 128-bit lanes work for both + * 32-bit and 64-bit */ + return _mm256_shuffle_epi32(zmm, 0b10110001); + } + static void storeu(void *mem, zmm_t x) { + _mm256_storeu_si256((__m256i *)mem, x); + } +}; +template <> +struct zmm_vector { + using type_t = int64_t; + using zmm_t = __m512i; + using zmmi_t = __m512i; + using ymm_t = __m512i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() { return X86_SIMD_SORT_MAX_INT64; } + static type_t type_min() { return X86_SIMD_SORT_MIN_INT64; } + static zmm_t zmm_max() { + return _mm512_set1_epi64(type_max()); + } // TODO: this should broadcast bits as is? 
+ + static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + static opmask_t kxor_opmask(opmask_t x, opmask_t y) { + return _kxor_mask8(x, y); + } + static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } + static opmask_t le(zmm_t x, zmm_t y) { + return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_LE); + } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); + } + static opmask_t eq(zmm_t x, zmm_t y) { + return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); + } + template + static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, + void const *base) { + return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); + } + template + static zmm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_epi64(index, base, scale); + } + static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } + static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi64(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_compressstoreu_epi64(mem, mask, x); + } + static zmm_t maskz_loadu(opmask_t mask, void const *mem) { + return _mm512_maskz_loadu_epi64(mask, mem); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm512_mask_loadu_epi64(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm512_mask_mov_epi64(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_storeu_epi64(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi64(x, y); } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) { + return _mm512_permutexvar_epi64(idx, zmm); + } + static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi64(v); } + static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi64(v); } + static zmm_t set1(type_t v) { 
return _mm512_set1_epi64(v); } + template + static zmm_t shuffle(zmm_t zmm) { + __m512d temp = _mm512_castsi512_pd(zmm); + return _mm512_castpd_si512( + _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); + } + static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); } +}; +template <> +struct zmm_vector { + using type_t = uint64_t; + using zmm_t = __m512i; + using zmmi_t = __m512i; + using ymm_t = __m512i; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() { return X86_SIMD_SORT_MAX_UINT64; } + static type_t type_min() { return 0; } + static zmm_t zmm_max() { return _mm512_set1_epi64(type_max()); } + + static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + template + static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, + void const *base) { + return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); + } + template + static zmm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_epi64(index, base, scale); + } + static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); + } + static opmask_t eq(zmm_t x, zmm_t y) { + return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ); + } + static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } + static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu64(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_compressstoreu_epi64(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm512_mask_loadu_epi64(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm512_mask_mov_epi64(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_storeu_epi64(mem, 
mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu64(x, y); } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) { + return _mm512_permutexvar_epi64(idx, zmm); + } + static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu64(v); } + static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu64(v); } + static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); } + template + static zmm_t shuffle(zmm_t zmm) { + __m512d temp = _mm512_castsi512_pd(zmm); + return _mm512_castpd_si512( + _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); + } + static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); } +}; +template <> +struct zmm_vector { + using type_t = double; + using zmm_t = __m512d; + using zmmi_t = __m512i; + using ymm_t = __m512d; + using opmask_t = __mmask8; + static const uint8_t numlanes = 8; + + static type_t type_max() { return X86_SIMD_SORT_INFINITY; } + static type_t type_min() { return -X86_SIMD_SORT_INFINITY; } + static zmm_t zmm_max() { return _mm512_set1_pd(type_max()); } + + static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); + } + + static zmm_t maskz_loadu(opmask_t mask, void const *mem) { + return _mm512_maskz_loadu_pd(mask, mem); + } + static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } + static opmask_t ge(zmm_t x, zmm_t y) { + return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); + } + static opmask_t eq(zmm_t x, zmm_t y) { + return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); + } + template + static opmask_t fpclass(zmm_t x) { + return _mm512_fpclass_pd_mask(x, type); + } + template + static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, + void const *base) { + return _mm512_mask_i64gather_pd(src, mask, index, base, scale); + } + template + static zmm_t i64gather(__m512i index, void const *base) { + return _mm512_i64gather_pd(index, base, scale); + } + static zmm_t loadu(void const *mem) { 
return _mm512_loadu_pd(mem); } + static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_pd(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_compressstoreu_pd(mem, mask, x); + } + static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { + return _mm512_mask_loadu_pd(x, mask, mem); + } + static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { + return _mm512_mask_mov_pd(x, mask, y); + } + static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { + return _mm512_mask_storeu_pd(mem, mask, x); + } + static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_pd(x, y); } + static zmm_t permutexvar(__m512i idx, zmm_t zmm) { + return _mm512_permutexvar_pd(idx, zmm); + } + static type_t reducemax(zmm_t v) { return _mm512_reduce_max_pd(v); } + static type_t reducemin(zmm_t v) { return _mm512_reduce_min_pd(v); } + static zmm_t set1(type_t v) { return _mm512_set1_pd(v); } + template + static zmm_t shuffle(zmm_t zmm) { + return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask); + } + static void storeu(void *mem, zmm_t x) { _mm512_storeu_pd(mem, x); } +}; +X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, + int64_t arrsize) { + int64_t nan_count = 0; + __mmask8 loadmask = 0xFF; + while (arrsize > 0) { + if (arrsize < 8) { + loadmask = (0x01 << arrsize) - 0x01; + } + __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr); + __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); + nan_count += _mm_popcnt_u32((int32_t)nanmask); + _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE); + arr += 8; + arrsize -= 8; + } + return nan_count; +} + +X86_SIMD_SORT_INLINE void replace_inf_with_nan(double *arr, int64_t arrsize, + int64_t nan_count) { + for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + arr[ii] = std::nan("1"); + nan_count -= 1; + } +} +/* + * Assumes zmm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + 
*/ +template +X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm) { + const typename vtype::zmmi_t rev_index = vtype::seti(NETWORK_64BIT_2); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + zmm = cmp_merge( + zmm, vtype::permutexvar(vtype::seti(NETWORK_64BIT_1), zmm), 0xCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + zmm = cmp_merge(zmm, vtype::permutexvar(rev_index, zmm), 0xF0); + zmm = cmp_merge( + zmm, vtype::permutexvar(vtype::seti(NETWORK_64BIT_3), zmm), 0xCC); + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + return zmm; +} + +template +X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, const int64_t left, + const int64_t right) { + // median of 8 + int64_t size = (right - left) / 8; + using zmm_t = typename vtype::zmm_t; + __m512i rand_index = _mm512_set_epi64( + left + size, left + 2 * size, left + 3 * size, left + 4 * size, + left + 5 * size, left + 6 * size, left + 7 * size, left + 8 * size); + zmm_t rand_vec = vtype::template i64gather(rand_index, arr); + // pivot will never be a nan, since there are no nan's! + zmm_t sort = sort_zmm_64bit(rand_vec); + return ((type_t *)&sort)[4]; +} + +#endif diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp new file mode 100644 index 0000000000000..893f2ce8363c8 --- /dev/null +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Intel x86-simd-sort source code. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AVX512_QSORT_64BIT +#define AVX512_QSORT_64BIT + +#include "avx512-64bit-common.h" + +// Assumes zmm is bitonic and performs a recursive half cleaner +template +X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) { + // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7 + zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_4), zmm), 0xF0); + // 2) half_cleaner[4] + zmm = cmp_merge( + zmm, vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm), 0xCC); + // 3) half_cleaner[1] + zmm = cmp_merge( + zmm, vtype::template shuffle(zmm), 0xAA); + return zmm; +} +// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner +template +X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, + zmm_t &zmm2) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + // 1) First step of a merging network: coex of zmm1 and zmm2 reversed + zmm2 = vtype::permutexvar(rev_index, zmm2); + zmm_t zmm3 = vtype::min(zmm1, zmm2); + zmm_t zmm4 = vtype::max(zmm1, zmm2); + // 2) Recursive half cleaner for each + zmm1 = bitonic_merge_zmm_64bit(zmm3); + zmm2 = bitonic_merge_zmm_64bit(zmm4); +} +// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive +// half cleaner +template 
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + // 1) First step of a merging network: min/max against the reversed upper registers + zmm_t zmm2r = vtype::permutexvar(rev_index, zmm[2]); + zmm_t zmm3r = vtype::permutexvar(rev_index, zmm[3]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); + // 2) Recursive half cleaner: 16 + zmm_t zmm_t3 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm2r)); + zmm_t zmm_t4 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm3r)); + zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); + zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); + zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); + zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); + zmm[0] = bitonic_merge_zmm_64bit(zmm0); + zmm[1] = bitonic_merge_zmm_64bit(zmm1); + zmm[2] = bitonic_merge_zmm_64bit(zmm2); + zmm[3] = bitonic_merge_zmm_64bit(zmm3); +} +template +X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]); + zmm_t zmm5r = vtype::permutexvar(rev_index, zmm[5]); + zmm_t zmm6r = vtype::permutexvar(rev_index, zmm[6]); + zmm_t zmm7r = vtype::permutexvar(rev_index, zmm[7]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); + zmm_t zmm_t5 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm4r)); + zmm_t zmm_t6 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm5r)); + zmm_t zmm_t7 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm6r)); + zmm_t zmm_t8 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm7r)); + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] =
bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); +} +template +X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]); + zmm_t zmm9r = vtype::permutexvar(rev_index, zmm[9]); + zmm_t zmm10r = vtype::permutexvar(rev_index, zmm[10]); + zmm_t zmm11r = vtype::permutexvar(rev_index, zmm[11]); + zmm_t zmm12r = vtype::permutexvar(rev_index, zmm[12]); + zmm_t zmm13r = vtype::permutexvar(rev_index, zmm[13]); + zmm_t zmm14r = vtype::permutexvar(rev_index, zmm[14]); + zmm_t zmm15r = vtype::permutexvar(rev_index, zmm[15]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm15r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm14r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm13r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm12r); + zmm_t zmm_t5 = vtype::min(zmm[4], zmm11r); + zmm_t zmm_t6 = vtype::min(zmm[5], zmm10r); + zmm_t zmm_t7 = vtype::min(zmm[6], zmm9r); + zmm_t zmm_t8 = vtype::min(zmm[7], zmm8r); + zmm_t zmm_t9 = vtype::permutexvar(rev_index, vtype::max(zmm[7], zmm8r)); + zmm_t zmm_t10 = vtype::permutexvar(rev_index, vtype::max(zmm[6], zmm9r)); + zmm_t zmm_t11 = vtype::permutexvar(rev_index, vtype::max(zmm[5], zmm10r)); + zmm_t zmm_t12 = vtype::permutexvar(rev_index, vtype::max(zmm[4], zmm11r)); + zmm_t zmm_t13 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm12r)); + zmm_t zmm_t14 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm13r)); + zmm_t zmm_t15 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm14r)); + zmm_t zmm_t16 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm15r)); + // Recursive half cleaner over 16 zmm regs + COEX(zmm_t1, zmm_t5); + COEX(zmm_t2, zmm_t6); + COEX(zmm_t3, zmm_t7); + COEX(zmm_t4,
zmm_t8); + COEX(zmm_t9, zmm_t13); + COEX(zmm_t10, zmm_t14); + COEX(zmm_t11, zmm_t15); + COEX(zmm_t12, zmm_t16); + // + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t9, zmm_t11); + COEX(zmm_t10, zmm_t12); + COEX(zmm_t13, zmm_t15); + COEX(zmm_t14, zmm_t16); + // + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + COEX(zmm_t9, zmm_t10); + COEX(zmm_t11, zmm_t12); + COEX(zmm_t13, zmm_t14); + COEX(zmm_t15, zmm_t16); + // + zmm[0] = bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); + zmm[8] = bitonic_merge_zmm_64bit(zmm_t9); + zmm[9] = bitonic_merge_zmm_64bit(zmm_t10); + zmm[10] = bitonic_merge_zmm_64bit(zmm_t11); + zmm[11] = bitonic_merge_zmm_64bit(zmm_t12); + zmm[12] = bitonic_merge_zmm_64bit(zmm_t13); + zmm[13] = bitonic_merge_zmm_64bit(zmm_t14); + zmm[14] = bitonic_merge_zmm_64bit(zmm_t15); + zmm[15] = bitonic_merge_zmm_64bit(zmm_t16); +} + +template +X86_SIMD_SORT_INLINE void bitonic_merge_32_zmm_64bit(zmm_t *zmm) { + const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2); + zmm_t zmm16r = vtype::permutexvar(rev_index, zmm[16]); + zmm_t zmm17r = vtype::permutexvar(rev_index, zmm[17]); + zmm_t zmm18r = vtype::permutexvar(rev_index, zmm[18]); + zmm_t zmm19r = vtype::permutexvar(rev_index, zmm[19]); + zmm_t zmm20r = vtype::permutexvar(rev_index, zmm[20]); + zmm_t zmm21r = vtype::permutexvar(rev_index, zmm[21]); + zmm_t zmm22r = vtype::permutexvar(rev_index, zmm[22]); + zmm_t zmm23r = vtype::permutexvar(rev_index, zmm[23]); + zmm_t zmm24r = vtype::permutexvar(rev_index, zmm[24]); + zmm_t zmm25r = vtype::permutexvar(rev_index, zmm[25]); + zmm_t zmm26r = 
vtype::permutexvar(rev_index, zmm[26]); + zmm_t zmm27r = vtype::permutexvar(rev_index, zmm[27]); + zmm_t zmm28r = vtype::permutexvar(rev_index, zmm[28]); + zmm_t zmm29r = vtype::permutexvar(rev_index, zmm[29]); + zmm_t zmm30r = vtype::permutexvar(rev_index, zmm[30]); + zmm_t zmm31r = vtype::permutexvar(rev_index, zmm[31]); + zmm_t zmm_t1 = vtype::min(zmm[0], zmm31r); + zmm_t zmm_t2 = vtype::min(zmm[1], zmm30r); + zmm_t zmm_t3 = vtype::min(zmm[2], zmm29r); + zmm_t zmm_t4 = vtype::min(zmm[3], zmm28r); + zmm_t zmm_t5 = vtype::min(zmm[4], zmm27r); + zmm_t zmm_t6 = vtype::min(zmm[5], zmm26r); + zmm_t zmm_t7 = vtype::min(zmm[6], zmm25r); + zmm_t zmm_t8 = vtype::min(zmm[7], zmm24r); + zmm_t zmm_t9 = vtype::min(zmm[8], zmm23r); + zmm_t zmm_t10 = vtype::min(zmm[9], zmm22r); + zmm_t zmm_t11 = vtype::min(zmm[10], zmm21r); + zmm_t zmm_t12 = vtype::min(zmm[11], zmm20r); + zmm_t zmm_t13 = vtype::min(zmm[12], zmm19r); + zmm_t zmm_t14 = vtype::min(zmm[13], zmm18r); + zmm_t zmm_t15 = vtype::min(zmm[14], zmm17r); + zmm_t zmm_t16 = vtype::min(zmm[15], zmm16r); + zmm_t zmm_t17 = vtype::permutexvar(rev_index, vtype::max(zmm[15], zmm16r)); + zmm_t zmm_t18 = vtype::permutexvar(rev_index, vtype::max(zmm[14], zmm17r)); + zmm_t zmm_t19 = vtype::permutexvar(rev_index, vtype::max(zmm[13], zmm18r)); + zmm_t zmm_t20 = vtype::permutexvar(rev_index, vtype::max(zmm[12], zmm19r)); + zmm_t zmm_t21 = vtype::permutexvar(rev_index, vtype::max(zmm[11], zmm20r)); + zmm_t zmm_t22 = vtype::permutexvar(rev_index, vtype::max(zmm[10], zmm21r)); + zmm_t zmm_t23 = vtype::permutexvar(rev_index, vtype::max(zmm[9], zmm22r)); + zmm_t zmm_t24 = vtype::permutexvar(rev_index, vtype::max(zmm[8], zmm23r)); + zmm_t zmm_t25 = vtype::permutexvar(rev_index, vtype::max(zmm[7], zmm24r)); + zmm_t zmm_t26 = vtype::permutexvar(rev_index, vtype::max(zmm[6], zmm25r)); + zmm_t zmm_t27 = vtype::permutexvar(rev_index, vtype::max(zmm[5], zmm26r)); + zmm_t zmm_t28 = vtype::permutexvar(rev_index, vtype::max(zmm[4], zmm27r)); + zmm_t 
zmm_t29 = vtype::permutexvar(rev_index, vtype::max(zmm[3], zmm28r)); + zmm_t zmm_t30 = vtype::permutexvar(rev_index, vtype::max(zmm[2], zmm29r)); + zmm_t zmm_t31 = vtype::permutexvar(rev_index, vtype::max(zmm[1], zmm30r)); + zmm_t zmm_t32 = vtype::permutexvar(rev_index, vtype::max(zmm[0], zmm31r)); + // Recusive half clear 16 zmm regs + COEX(zmm_t1, zmm_t9); + COEX(zmm_t2, zmm_t10); + COEX(zmm_t3, zmm_t11); + COEX(zmm_t4, zmm_t12); + COEX(zmm_t5, zmm_t13); + COEX(zmm_t6, zmm_t14); + COEX(zmm_t7, zmm_t15); + COEX(zmm_t8, zmm_t16); + COEX(zmm_t17, zmm_t25); + COEX(zmm_t18, zmm_t26); + COEX(zmm_t19, zmm_t27); + COEX(zmm_t20, zmm_t28); + COEX(zmm_t21, zmm_t29); + COEX(zmm_t22, zmm_t30); + COEX(zmm_t23, zmm_t31); + COEX(zmm_t24, zmm_t32); + // + COEX(zmm_t1, zmm_t5); + COEX(zmm_t2, zmm_t6); + COEX(zmm_t3, zmm_t7); + COEX(zmm_t4, zmm_t8); + COEX(zmm_t9, zmm_t13); + COEX(zmm_t10, zmm_t14); + COEX(zmm_t11, zmm_t15); + COEX(zmm_t12, zmm_t16); + COEX(zmm_t17, zmm_t21); + COEX(zmm_t18, zmm_t22); + COEX(zmm_t19, zmm_t23); + COEX(zmm_t20, zmm_t24); + COEX(zmm_t25, zmm_t29); + COEX(zmm_t26, zmm_t30); + COEX(zmm_t27, zmm_t31); + COEX(zmm_t28, zmm_t32); + // + COEX(zmm_t1, zmm_t3); + COEX(zmm_t2, zmm_t4); + COEX(zmm_t5, zmm_t7); + COEX(zmm_t6, zmm_t8); + COEX(zmm_t9, zmm_t11); + COEX(zmm_t10, zmm_t12); + COEX(zmm_t13, zmm_t15); + COEX(zmm_t14, zmm_t16); + COEX(zmm_t17, zmm_t19); + COEX(zmm_t18, zmm_t20); + COEX(zmm_t21, zmm_t23); + COEX(zmm_t22, zmm_t24); + COEX(zmm_t25, zmm_t27); + COEX(zmm_t26, zmm_t28); + COEX(zmm_t29, zmm_t31); + COEX(zmm_t30, zmm_t32); + // + COEX(zmm_t1, zmm_t2); + COEX(zmm_t3, zmm_t4); + COEX(zmm_t5, zmm_t6); + COEX(zmm_t7, zmm_t8); + COEX(zmm_t9, zmm_t10); + COEX(zmm_t11, zmm_t12); + COEX(zmm_t13, zmm_t14); + COEX(zmm_t15, zmm_t16); + COEX(zmm_t17, zmm_t18); + COEX(zmm_t19, zmm_t20); + COEX(zmm_t21, zmm_t22); + COEX(zmm_t23, zmm_t24); + COEX(zmm_t25, zmm_t26); + COEX(zmm_t27, zmm_t28); + COEX(zmm_t29, zmm_t30); + COEX(zmm_t31, zmm_t32); + // + zmm[0] = 
bitonic_merge_zmm_64bit(zmm_t1); + zmm[1] = bitonic_merge_zmm_64bit(zmm_t2); + zmm[2] = bitonic_merge_zmm_64bit(zmm_t3); + zmm[3] = bitonic_merge_zmm_64bit(zmm_t4); + zmm[4] = bitonic_merge_zmm_64bit(zmm_t5); + zmm[5] = bitonic_merge_zmm_64bit(zmm_t6); + zmm[6] = bitonic_merge_zmm_64bit(zmm_t7); + zmm[7] = bitonic_merge_zmm_64bit(zmm_t8); + zmm[8] = bitonic_merge_zmm_64bit(zmm_t9); + zmm[9] = bitonic_merge_zmm_64bit(zmm_t10); + zmm[10] = bitonic_merge_zmm_64bit(zmm_t11); + zmm[11] = bitonic_merge_zmm_64bit(zmm_t12); + zmm[12] = bitonic_merge_zmm_64bit(zmm_t13); + zmm[13] = bitonic_merge_zmm_64bit(zmm_t14); + zmm[14] = bitonic_merge_zmm_64bit(zmm_t15); + zmm[15] = bitonic_merge_zmm_64bit(zmm_t16); + zmm[16] = bitonic_merge_zmm_64bit(zmm_t17); + zmm[17] = bitonic_merge_zmm_64bit(zmm_t18); + zmm[18] = bitonic_merge_zmm_64bit(zmm_t19); + zmm[19] = bitonic_merge_zmm_64bit(zmm_t20); + zmm[20] = bitonic_merge_zmm_64bit(zmm_t21); + zmm[21] = bitonic_merge_zmm_64bit(zmm_t22); + zmm[22] = bitonic_merge_zmm_64bit(zmm_t23); + zmm[23] = bitonic_merge_zmm_64bit(zmm_t24); + zmm[24] = bitonic_merge_zmm_64bit(zmm_t25); + zmm[25] = bitonic_merge_zmm_64bit(zmm_t26); + zmm[26] = bitonic_merge_zmm_64bit(zmm_t27); + zmm[27] = bitonic_merge_zmm_64bit(zmm_t28); + zmm[28] = bitonic_merge_zmm_64bit(zmm_t29); + zmm[29] = bitonic_merge_zmm_64bit(zmm_t30); + zmm[30] = bitonic_merge_zmm_64bit(zmm_t31); + zmm[31] = bitonic_merge_zmm_64bit(zmm_t32); +} + +template +X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N) { + typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; + typename vtype::zmm_t zmm = + vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); + vtype::mask_storeu(arr, load_mask, sort_zmm_64bit(zmm)); +} + +template +X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N) { + if (N <= 8) { + sort_8_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + zmm_t zmm1 = vtype::loadu(arr); + typename vtype::opmask_t load_mask = (0x01 << (N - 8)) - 0x01; 
+ zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 8); + zmm1 = sort_zmm_64bit(zmm1); + zmm2 = sort_zmm_64bit(zmm2); + bitonic_merge_two_zmm_64bit(zmm1, zmm2); + vtype::storeu(arr, zmm1); + vtype::mask_storeu(arr + 8, load_mask, zmm2); +} + +template +X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N) { + if (N <= 16) { + sort_16_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[4]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 16); + zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 24); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_four_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::mask_storeu(arr + 16, load_mask1, zmm[2]); + vtype::mask_storeu(arr + 24, load_mask2, zmm[3]); +} + +template +X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N) { + if (N <= 32) { + sort_32_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[8]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + // N-32 >= 1 + uint64_t combined_mask = (0x1ull << (N - 32)) - 
0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); + zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 40); + zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 48); + zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 56); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = sort_zmm_64bit(zmm[6]); + zmm[7] = sort_zmm_64bit(zmm[7]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + bitonic_merge_four_zmm_64bit(zmm); + bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_eight_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + vtype::mask_storeu(arr + 32, load_mask1, zmm[4]); + vtype::mask_storeu(arr + 40, load_mask2, zmm[5]); + vtype::mask_storeu(arr + 48, load_mask3, zmm[6]); + vtype::mask_storeu(arr + 56, load_mask4, zmm[7]); +} + +template +X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N) { + if (N <= 64) { + sort_64_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[16]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[4] = vtype::loadu(arr + 32); + zmm[5] = vtype::loadu(arr + 40); + zmm[6] = vtype::loadu(arr + 48); + zmm[7] = vtype::loadu(arr + 56); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = 
sort_zmm_64bit(zmm[6]); + zmm[7] = sort_zmm_64bit(zmm[7]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF; + opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF; + if (N != 128) { + uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + load_mask5 = (combined_mask >> 32) & 0xFF; + load_mask6 = (combined_mask >> 40) & 0xFF; + load_mask7 = (combined_mask >> 48) & 0xFF; + load_mask8 = (combined_mask >> 56) & 0xFF; + } + zmm[8] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); + zmm[9] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 72); + zmm[10] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 80); + zmm[11] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 88); + zmm[12] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 96); + zmm[13] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 104); + zmm[14] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 112); + zmm[15] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 120); + zmm[8] = sort_zmm_64bit(zmm[8]); + zmm[9] = sort_zmm_64bit(zmm[9]); + zmm[10] = sort_zmm_64bit(zmm[10]); + zmm[11] = sort_zmm_64bit(zmm[11]); + zmm[12] = sort_zmm_64bit(zmm[12]); + zmm[13] = sort_zmm_64bit(zmm[13]); + zmm[14] = sort_zmm_64bit(zmm[14]); + zmm[15] = sort_zmm_64bit(zmm[15]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + bitonic_merge_two_zmm_64bit(zmm[8], zmm[9]); + bitonic_merge_two_zmm_64bit(zmm[10], zmm[11]); + bitonic_merge_two_zmm_64bit(zmm[12], zmm[13]); + bitonic_merge_two_zmm_64bit(zmm[14], zmm[15]); + bitonic_merge_four_zmm_64bit(zmm); + 
bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_four_zmm_64bit(zmm + 8); + bitonic_merge_four_zmm_64bit(zmm + 12); + bitonic_merge_eight_zmm_64bit(zmm); + bitonic_merge_eight_zmm_64bit(zmm + 8); + bitonic_merge_sixteen_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + vtype::storeu(arr + 32, zmm[4]); + vtype::storeu(arr + 40, zmm[5]); + vtype::storeu(arr + 48, zmm[6]); + vtype::storeu(arr + 56, zmm[7]); + vtype::mask_storeu(arr + 64, load_mask1, zmm[8]); + vtype::mask_storeu(arr + 72, load_mask2, zmm[9]); + vtype::mask_storeu(arr + 80, load_mask3, zmm[10]); + vtype::mask_storeu(arr + 88, load_mask4, zmm[11]); + vtype::mask_storeu(arr + 96, load_mask5, zmm[12]); + vtype::mask_storeu(arr + 104, load_mask6, zmm[13]); + vtype::mask_storeu(arr + 112, load_mask7, zmm[14]); + vtype::mask_storeu(arr + 120, load_mask8, zmm[15]); +} + +template +X86_SIMD_SORT_INLINE void sort_256_64bit(type_t *arr, int32_t N) { + if (N <= 128) { + sort_128_64bit(arr, N); + return; + } + using zmm_t = typename vtype::zmm_t; + using opmask_t = typename vtype::opmask_t; + zmm_t zmm[32]; + zmm[0] = vtype::loadu(arr); + zmm[1] = vtype::loadu(arr + 8); + zmm[2] = vtype::loadu(arr + 16); + zmm[3] = vtype::loadu(arr + 24); + zmm[4] = vtype::loadu(arr + 32); + zmm[5] = vtype::loadu(arr + 40); + zmm[6] = vtype::loadu(arr + 48); + zmm[7] = vtype::loadu(arr + 56); + zmm[8] = vtype::loadu(arr + 64); + zmm[9] = vtype::loadu(arr + 72); + zmm[10] = vtype::loadu(arr + 80); + zmm[11] = vtype::loadu(arr + 88); + zmm[12] = vtype::loadu(arr + 96); + zmm[13] = vtype::loadu(arr + 104); + zmm[14] = vtype::loadu(arr + 112); + zmm[15] = vtype::loadu(arr + 120); + zmm[0] = sort_zmm_64bit(zmm[0]); + zmm[1] = sort_zmm_64bit(zmm[1]); + zmm[2] = sort_zmm_64bit(zmm[2]); + zmm[3] = sort_zmm_64bit(zmm[3]); + zmm[4] = sort_zmm_64bit(zmm[4]); + zmm[5] = sort_zmm_64bit(zmm[5]); + zmm[6] = sort_zmm_64bit(zmm[6]); + zmm[7] 
= sort_zmm_64bit(zmm[7]); + zmm[8] = sort_zmm_64bit(zmm[8]); + zmm[9] = sort_zmm_64bit(zmm[9]); + zmm[10] = sort_zmm_64bit(zmm[10]); + zmm[11] = sort_zmm_64bit(zmm[11]); + zmm[12] = sort_zmm_64bit(zmm[12]); + zmm[13] = sort_zmm_64bit(zmm[13]); + zmm[14] = sort_zmm_64bit(zmm[14]); + zmm[15] = sort_zmm_64bit(zmm[15]); + opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF; + opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF; + opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF; + opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF; + opmask_t load_mask9 = 0xFF, load_mask10 = 0xFF; + opmask_t load_mask11 = 0xFF, load_mask12 = 0xFF; + opmask_t load_mask13 = 0xFF, load_mask14 = 0xFF; + opmask_t load_mask15 = 0xFF, load_mask16 = 0xFF; + if (N != 256) { + uint64_t combined_mask; + if (N < 192) { + combined_mask = (0x1ull << (N - 128)) - 0x1ull; + load_mask1 = (combined_mask)&0xFF; + load_mask2 = (combined_mask >> 8) & 0xFF; + load_mask3 = (combined_mask >> 16) & 0xFF; + load_mask4 = (combined_mask >> 24) & 0xFF; + load_mask5 = (combined_mask >> 32) & 0xFF; + load_mask6 = (combined_mask >> 40) & 0xFF; + load_mask7 = (combined_mask >> 48) & 0xFF; + load_mask8 = (combined_mask >> 56) & 0xFF; + load_mask9 = 0x00; + load_mask10 = 0x0; + load_mask11 = 0x00; + load_mask12 = 0x00; + load_mask13 = 0x00; + load_mask14 = 0x00; + load_mask15 = 0x00; + load_mask16 = 0x00; + } else { + combined_mask = (0x1ull << (N - 192)) - 0x1ull; + load_mask9 = (combined_mask)&0xFF; + load_mask10 = (combined_mask >> 8) & 0xFF; + load_mask11 = (combined_mask >> 16) & 0xFF; + load_mask12 = (combined_mask >> 24) & 0xFF; + load_mask13 = (combined_mask >> 32) & 0xFF; + load_mask14 = (combined_mask >> 40) & 0xFF; + load_mask15 = (combined_mask >> 48) & 0xFF; + load_mask16 = (combined_mask >> 56) & 0xFF; + } + } + zmm[16] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 128); + zmm[17] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 136); + zmm[18] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 144); + 
zmm[19] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 152); + zmm[20] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 160); + zmm[21] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 168); + zmm[22] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 176); + zmm[23] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 184); + if (N < 192) { + zmm[24] = vtype::zmm_max(); + zmm[25] = vtype::zmm_max(); + zmm[26] = vtype::zmm_max(); + zmm[27] = vtype::zmm_max(); + zmm[28] = vtype::zmm_max(); + zmm[29] = vtype::zmm_max(); + zmm[30] = vtype::zmm_max(); + zmm[31] = vtype::zmm_max(); + } else { + zmm[24] = vtype::mask_loadu(vtype::zmm_max(), load_mask9, arr + 192); + zmm[25] = vtype::mask_loadu(vtype::zmm_max(), load_mask10, arr + 200); + zmm[26] = vtype::mask_loadu(vtype::zmm_max(), load_mask11, arr + 208); + zmm[27] = vtype::mask_loadu(vtype::zmm_max(), load_mask12, arr + 216); + zmm[28] = vtype::mask_loadu(vtype::zmm_max(), load_mask13, arr + 224); + zmm[29] = vtype::mask_loadu(vtype::zmm_max(), load_mask14, arr + 232); + zmm[30] = vtype::mask_loadu(vtype::zmm_max(), load_mask15, arr + 240); + zmm[31] = vtype::mask_loadu(vtype::zmm_max(), load_mask16, arr + 248); + } + zmm[16] = sort_zmm_64bit(zmm[16]); + zmm[17] = sort_zmm_64bit(zmm[17]); + zmm[18] = sort_zmm_64bit(zmm[18]); + zmm[19] = sort_zmm_64bit(zmm[19]); + zmm[20] = sort_zmm_64bit(zmm[20]); + zmm[21] = sort_zmm_64bit(zmm[21]); + zmm[22] = sort_zmm_64bit(zmm[22]); + zmm[23] = sort_zmm_64bit(zmm[23]); + zmm[24] = sort_zmm_64bit(zmm[24]); + zmm[25] = sort_zmm_64bit(zmm[25]); + zmm[26] = sort_zmm_64bit(zmm[26]); + zmm[27] = sort_zmm_64bit(zmm[27]); + zmm[28] = sort_zmm_64bit(zmm[28]); + zmm[29] = sort_zmm_64bit(zmm[29]); + zmm[30] = sort_zmm_64bit(zmm[30]); + zmm[31] = sort_zmm_64bit(zmm[31]); + bitonic_merge_two_zmm_64bit(zmm[0], zmm[1]); + bitonic_merge_two_zmm_64bit(zmm[2], zmm[3]); + bitonic_merge_two_zmm_64bit(zmm[4], zmm[5]); + bitonic_merge_two_zmm_64bit(zmm[6], zmm[7]); + 
bitonic_merge_two_zmm_64bit(zmm[8], zmm[9]); + bitonic_merge_two_zmm_64bit(zmm[10], zmm[11]); + bitonic_merge_two_zmm_64bit(zmm[12], zmm[13]); + bitonic_merge_two_zmm_64bit(zmm[14], zmm[15]); + bitonic_merge_two_zmm_64bit(zmm[16], zmm[17]); + bitonic_merge_two_zmm_64bit(zmm[18], zmm[19]); + bitonic_merge_two_zmm_64bit(zmm[20], zmm[21]); + bitonic_merge_two_zmm_64bit(zmm[22], zmm[23]); + bitonic_merge_two_zmm_64bit(zmm[24], zmm[25]); + bitonic_merge_two_zmm_64bit(zmm[26], zmm[27]); + bitonic_merge_two_zmm_64bit(zmm[28], zmm[29]); + bitonic_merge_two_zmm_64bit(zmm[30], zmm[31]); + bitonic_merge_four_zmm_64bit(zmm); + bitonic_merge_four_zmm_64bit(zmm + 4); + bitonic_merge_four_zmm_64bit(zmm + 8); + bitonic_merge_four_zmm_64bit(zmm + 12); + bitonic_merge_four_zmm_64bit(zmm + 16); + bitonic_merge_four_zmm_64bit(zmm + 20); + bitonic_merge_four_zmm_64bit(zmm + 24); + bitonic_merge_four_zmm_64bit(zmm + 28); + bitonic_merge_eight_zmm_64bit(zmm); + bitonic_merge_eight_zmm_64bit(zmm + 8); + bitonic_merge_eight_zmm_64bit(zmm + 16); + bitonic_merge_eight_zmm_64bit(zmm + 24); + bitonic_merge_sixteen_zmm_64bit(zmm); + bitonic_merge_sixteen_zmm_64bit(zmm + 16); + bitonic_merge_32_zmm_64bit(zmm); + vtype::storeu(arr, zmm[0]); + vtype::storeu(arr + 8, zmm[1]); + vtype::storeu(arr + 16, zmm[2]); + vtype::storeu(arr + 24, zmm[3]); + vtype::storeu(arr + 32, zmm[4]); + vtype::storeu(arr + 40, zmm[5]); + vtype::storeu(arr + 48, zmm[6]); + vtype::storeu(arr + 56, zmm[7]); + vtype::storeu(arr + 64, zmm[8]); + vtype::storeu(arr + 72, zmm[9]); + vtype::storeu(arr + 80, zmm[10]); + vtype::storeu(arr + 88, zmm[11]); + vtype::storeu(arr + 96, zmm[12]); + vtype::storeu(arr + 104, zmm[13]); + vtype::storeu(arr + 112, zmm[14]); + vtype::storeu(arr + 120, zmm[15]); + vtype::mask_storeu(arr + 128, load_mask1, zmm[16]); + vtype::mask_storeu(arr + 136, load_mask2, zmm[17]); + vtype::mask_storeu(arr + 144, load_mask3, zmm[18]); + vtype::mask_storeu(arr + 152, load_mask4, zmm[19]); + 
vtype::mask_storeu(arr + 160, load_mask5, zmm[20]); + vtype::mask_storeu(arr + 168, load_mask6, zmm[21]); + vtype::mask_storeu(arr + 176, load_mask7, zmm[22]); + vtype::mask_storeu(arr + 184, load_mask8, zmm[23]); + if (N > 192) { + vtype::mask_storeu(arr + 192, load_mask9, zmm[24]); + vtype::mask_storeu(arr + 200, load_mask10, zmm[25]); + vtype::mask_storeu(arr + 208, load_mask11, zmm[26]); + vtype::mask_storeu(arr + 216, load_mask12, zmm[27]); + vtype::mask_storeu(arr + 224, load_mask13, zmm[28]); + vtype::mask_storeu(arr + 232, load_mask14, zmm[29]); + vtype::mask_storeu(arr + 240, load_mask15, zmm[30]); + vtype::mask_storeu(arr + 248, load_mask16, zmm[31]); + } +} + +template +static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, + int64_t max_iters) { + /* + * Resort to std::sort if quicksort isn't making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 256 + */ + if (right + 1 - left <= 256) { + sort_256_64bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_64bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512_unrolled( + arr, left, right + 1, pivot, &smallest, &biggest); + if (pivot != smallest) + qsort_64bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_64bit_(arr, pivot_index, right, max_iters - 1); +} + +template +static void qselect_64bit_(type_t *arr, int64_t pos, int64_t left, + int64_t right, int64_t max_iters) { + /* + * Resort to std::sort if quicksort isn't making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_64bit(arr, left, 
right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512_unrolled( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_64bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_64bit_(arr, pos, pivot_index, right, max_iters - 1); +} + +template <> +void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize) { + if (arrsize > 1) { + qselect_64bit_, int64_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize) { + if (arrsize > 1) { + qselect_64bit_, uint64_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(double *arr, int64_t k, int64_t arrsize) { + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_64bit_, double>(arr, k, 0, arrsize - 1, + 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + +template <> +void avx512_qsort(int64_t *arr, int64_t arrsize) { + if (arrsize > 1) { + qsort_64bit_, int64_t>(arr, 0, arrsize - 1, + 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(uint64_t *arr, int64_t arrsize) { + if (arrsize > 1) { + qsort_64bit_, uint64_t>( + arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qsort(double *arr, int64_t arrsize) { + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qsort_64bit_, double>(arr, 0, arrsize - 1, + 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} +#endif // AVX512_QSORT_64BIT diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h new file mode 100644 index 0000000000000..b477f9e65c233 --- /dev/null +++ 
b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Intel x86-simd-sort source code. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AVX512_QSORT_COMMON +#define AVX512_QSORT_COMMON + +/* + * Quicksort using AVX-512. The ideas and code are based on these two research + * papers [1] and [2]. On a high level, the idea is to vectorize quicksort + * partitioning using AVX-512 compressstore instructions. If the array size is + * < 128, then use Bitonic sorting network implemented on 512-bit registers. + * The precise network definitions depend on the dtype and are defined in + * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and + * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting + * network. 
The core implementations of the vectorized qsort functions + * avx512_qsort(T*, int64_t) are modified versions of avx2 quicksort + * presented in the paper [2] and source code associated with that paper [3]. + * + * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types + * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ + * + * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel + * Skylake https://arxiv.org/pdf/1704.08579.pdf + * + * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: + * MIT + * + * [4] + * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 + * + */ + +#include + +#include +#include +#include +#include +#include + +#define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() +#define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() +#define X86_SIMD_SORT_INFINITYH 0x7c00 +#define X86_SIMD_SORT_NEGINFINITYH 0xfc00 +#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits::max() +#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits::max() +#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits::min() +#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits::max() +#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits::max() +#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits::min() +#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits::max() +#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits::max() +#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits::min() +#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY) +#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64) +#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64) +#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF) +#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32) +#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32) +#define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH) +#define YMM_MAX_HALF 
_mm256_set1_epi16(X86_SIMD_SORT_INFINITYH) +#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16) +#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) +#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d + +#ifdef _MSC_VER +#define X86_SIMD_SORT_INLINE static inline +#define X86_SIMD_SORT_FINLINE static __forceinline +#elif defined(__CYGWIN__) +/* + * Force inline in cygwin to work around a compiler bug. See + * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584 + */ +#define X86_SIMD_SORT_INLINE static __attribute__((always_inline)) +#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline)) +#elif defined(__GNUC__) +#define X86_SIMD_SORT_INLINE static inline +#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline)) +#else +#define X86_SIMD_SORT_INLINE static +#define X86_SIMD_SORT_FINLINE static +#endif + +template +struct zmm_vector; + +template +struct ymm_vector; + +// Regular quicksort routines: +template +void avx512_qsort(T *arr, int64_t arrsize); +void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); + +template +void avx512_qselect(T *arr, int64_t k, int64_t arrsize); +void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize); + +template +inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) { + avx512_qselect(arr, k - 1, arrsize); + avx512_qsort(arr, k - 1); +} +inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, + int64_t arrsize) { + avx512_qselect_fp16(arr, k - 1, arrsize); + avx512_qsort_fp16(arr, k - 1); +} + +// key-value sort routines +template +void avx512_qsort_kv(T *keys, uint64_t *indexes, int64_t arrsize); + +template +bool comparison_func(const T &a, const T &b) { + return a < b; +} + +/* + * COEX == Compare and Exchange two registers by swapping min and max values + */ +template +static void COEX(mm_t &a, mm_t &b) { + mm_t temp = a; + a = vtype::min(a, b); + b = vtype::max(temp, b); +} +template +static inline zmm_t 
cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) { + zmm_t min = vtype::min(in2, in1); + zmm_t max = vtype::max(in2, in1); + return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max +} +/* + * Partition one ZMM register based on the pivot and return the + * number of elements that are greater than or equal to the pivot. + */ +template +static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right, + const zmm_t curr_vec, const zmm_t pivot_vec, + zmm_t *smallest_vec, zmm_t *biggest_vec) { + /* which elements are larger than or equal to the pivot */ + typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); + int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask); + vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(ge_mask), + curr_vec); + vtype::mask_compressstoreu(arr + right - amount_ge_pivot, ge_mask, + curr_vec); + *smallest_vec = vtype::min(curr_vec, *smallest_vec); + *biggest_vec = vtype::max(curr_vec, *biggest_vec); + return amount_ge_pivot; +} +/* + * Partition an array based on the pivot and return the index of the + * first element that is greater than or equal to the pivot. 
+ */ +template +static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right, + type_t pivot, type_t *smallest, + type_t *biggest) { + /* make array length divisible by vtype::numlanes , shortening the array */ + for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { + ++left; + } + } + + if (left == right) + return left; /* less than vtype::numlanes elements in the array */ + + using zmm_t = typename vtype::zmm_t; + zmm_t pivot_vec = vtype::set1(pivot); + zmm_t min_vec = vtype::set1(*smallest); + zmm_t max_vec = vtype::set1(*biggest); + + if (right - left == vtype::numlanes) { + zmm_t vec = vtype::loadu(arr + left); + int32_t amount_ge_pivot = + partition_vec(arr, left, left + vtype::numlanes, vec, + pivot_vec, &min_vec, &max_vec); + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return left + (vtype::numlanes - amount_ge_pivot); + } + + // first and last vtype::numlanes values are partitioned at the end + zmm_t vec_left = vtype::loadu(arr + left); + zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); + // store points of the vectors + int64_t r_store = right - vtype::numlanes; + int64_t l_store = left; + // indices for loading the elements + left += vtype::numlanes; + right -= vtype::numlanes; + while (right - left != 0) { + zmm_t curr_vec; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((r_store + vtype::numlanes) - right < left - l_store) { + right -= vtype::numlanes; + curr_vec = vtype::loadu(arr + right); + } else { + curr_vec = vtype::loadu(arr + left); + left += vtype::numlanes; + } + // partition the current vector and save it on both sides of the 
array + int32_t amount_ge_pivot = + partition_vec(arr, l_store, r_store + vtype::numlanes, + curr_vec, pivot_vec, &min_vec, &max_vec); + ; + r_store -= amount_ge_pivot; + l_store += (vtype::numlanes - amount_ge_pivot); + } + + /* partition and save vec_left and vec_right */ + int32_t amount_ge_pivot = + partition_vec(arr, l_store, r_store + vtype::numlanes, vec_left, + pivot_vec, &min_vec, &max_vec); + l_store += (vtype::numlanes - amount_ge_pivot); + amount_ge_pivot = + partition_vec(arr, l_store, l_store + vtype::numlanes, vec_right, + pivot_vec, &min_vec, &max_vec); + l_store += (vtype::numlanes - amount_ge_pivot); + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; +} + +template +static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, + int64_t right, type_t pivot, + type_t *smallest, + type_t *biggest) { + if (right - left <= 2 * num_unroll * vtype::numlanes) { + return partition_avx512(arr, left, right, pivot, smallest, + biggest); + } + /* make array length divisible by 8*vtype::numlanes , shortening the array + */ + for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0; + --i) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { + ++left; + } + } + + if (left == right) + return left; /* less than vtype::numlanes elements in the array */ + + using zmm_t = typename vtype::zmm_t; + zmm_t pivot_vec = vtype::set1(pivot); + zmm_t min_vec = vtype::set1(*smallest); + zmm_t max_vec = vtype::set1(*biggest); + + // We will now have atleast 16 registers worth of data to process: + // left and right vtype::numlanes values are partitioned at the end + zmm_t vec_left[num_unroll], vec_right[num_unroll]; +#pragma GCC unroll 8 + for (int ii = 0; ii < num_unroll; ++ii) { + vec_left[ii] = vtype::loadu(arr + left + 
vtype::numlanes * ii); + vec_right[ii] = + vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii))); + } + // store points of the vectors + int64_t r_store = right - vtype::numlanes; + int64_t l_store = left; + // indices for loading the elements + left += num_unroll * vtype::numlanes; + right -= num_unroll * vtype::numlanes; + while (right - left != 0) { + zmm_t curr_vec[num_unroll]; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((r_store + vtype::numlanes) - right < left - l_store) { + right -= num_unroll * vtype::numlanes; +#pragma GCC unroll 8 + for (int ii = 0; ii < num_unroll; ++ii) { + curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes); + } + } else { +#pragma GCC unroll 8 + for (int ii = 0; ii < num_unroll; ++ii) { + curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes); + } + left += num_unroll * vtype::numlanes; + } +// partition the current vector and save it on both sides of the array +#pragma GCC unroll 8 + for (int ii = 0; ii < num_unroll; ++ii) { + int32_t amount_ge_pivot = partition_vec( + arr, l_store, r_store + vtype::numlanes, curr_vec[ii], + pivot_vec, &min_vec, &max_vec); + l_store += (vtype::numlanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + } + } + +/* partition and save vec_left[8] and vec_right[8] */ +#pragma GCC unroll 8 + for (int ii = 0; ii < num_unroll; ++ii) { + int32_t amount_ge_pivot = + partition_vec(arr, l_store, r_store + vtype::numlanes, + vec_left[ii], pivot_vec, &min_vec, &max_vec); + l_store += (vtype::numlanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + } +#pragma GCC unroll 8 + for (int ii = 0; ii < num_unroll; ++ii) { + int32_t amount_ge_pivot = + partition_vec(arr, l_store, r_store + vtype::numlanes, + vec_right[ii], pivot_vec, &min_vec, &max_vec); + l_store += (vtype::numlanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + } + *smallest = 
vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; +} + +// Key-value sort helper functions + +template +static void COEX(zmm_t1 &key1, zmm_t1 &key2, zmm_t2 &index1, zmm_t2 &index2) { + zmm_t1 key_t1 = vtype1::min(key1, key2); + zmm_t1 key_t2 = vtype1::max(key1, key2); + + zmm_t2 index_t1 = + vtype2::mask_mov(index2, vtype1::eq(key_t1, key1), index1); + zmm_t2 index_t2 = + vtype2::mask_mov(index1, vtype1::eq(key_t1, key1), index2); + + key1 = key_t1; + key2 = key_t2; + index1 = index_t1; + index2 = index_t2; +} +template +static inline zmm_t1 cmp_merge(zmm_t1 in1, zmm_t1 in2, zmm_t2 &indexes1, + zmm_t2 indexes2, opmask_t mask) { + zmm_t1 tmp_keys = cmp_merge(in1, in2, mask); + indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(tmp_keys, in1), indexes1); + return tmp_keys; // 0 -> min, 1 -> max +} + +/* + * Parition one ZMM register based on the pivot and returns the index of the + * last element that is less than equal to the pivot. + */ +template +static inline int32_t partition_vec(type_t1 *keys, type_t2 *indexes, + int64_t left, int64_t right, + const zmm_t1 keys_vec, + const zmm_t2 indexes_vec, + const zmm_t1 pivot_vec, + zmm_t1 *smallest_vec, zmm_t1 *biggest_vec) { + /* which elements are larger than the pivot */ + typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec); + int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); + vtype1::mask_compressstoreu(keys + left, vtype1::knot_opmask(gt_mask), + keys_vec); + vtype1::mask_compressstoreu(keys + right - amount_gt_pivot, gt_mask, + keys_vec); + vtype2::mask_compressstoreu(indexes + left, vtype2::knot_opmask(gt_mask), + indexes_vec); + vtype2::mask_compressstoreu(indexes + right - amount_gt_pivot, gt_mask, + indexes_vec); + *smallest_vec = vtype1::min(keys_vec, *smallest_vec); + *biggest_vec = vtype1::max(keys_vec, *biggest_vec); + return amount_gt_pivot; +} +/* + * Parition an array based on the pivot and returns the index of the + * last element that is less 
than equal to the pivot. + */ +template +static inline int64_t partition_avx512(type_t1 *keys, type_t2 *indexes, + int64_t left, int64_t right, + type_t1 pivot, type_t1 *smallest, + type_t1 *biggest) { + /* make array length divisible by vtype1::numlanes , shortening the array */ + for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) { + *smallest = std::min(*smallest, keys[left]); + *biggest = std::max(*biggest, keys[left]); + if (keys[left] > pivot) { + right--; + std::swap(keys[left], keys[right]); + std::swap(indexes[left], indexes[right]); + } else { + ++left; + } + } + + if (left == right) + return left; /* less than vtype1::numlanes elements in the array */ + + zmm_t1 pivot_vec = vtype1::set1(pivot); + zmm_t1 min_vec = vtype1::set1(*smallest); + zmm_t1 max_vec = vtype1::set1(*biggest); + + if (right - left == vtype1::numlanes) { + zmm_t1 keys_vec = vtype1::loadu(keys + left); + int32_t amount_gt_pivot; + + zmm_t2 indexes_vec = vtype2::loadu(indexes + left); + amount_gt_pivot = partition_vec( + keys, indexes, left, left + vtype1::numlanes, keys_vec, indexes_vec, + pivot_vec, &min_vec, &max_vec); + + *smallest = vtype1::reducemin(min_vec); + *biggest = vtype1::reducemax(max_vec); + return left + (vtype1::numlanes - amount_gt_pivot); + } + + // first and last vtype1::numlanes values are partitioned at the end + zmm_t1 keys_vec_left = vtype1::loadu(keys + left); + zmm_t1 keys_vec_right = vtype1::loadu(keys + (right - vtype1::numlanes)); + zmm_t2 indexes_vec_left; + zmm_t2 indexes_vec_right; + indexes_vec_left = vtype2::loadu(indexes + left); + indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes)); + + // store points of the vectors + int64_t r_store = right - vtype1::numlanes; + int64_t l_store = left; + // indices for loading the elements + left += vtype1::numlanes; + right -= vtype1::numlanes; + while (right - left != 0) { + zmm_t1 keys_vec; + zmm_t2 indexes_vec; + /* + * if fewer elements are stored on the right side of the array, 
+ * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((r_store + vtype1::numlanes) - right < left - l_store) { + right -= vtype1::numlanes; + keys_vec = vtype1::loadu(keys + right); + indexes_vec = vtype2::loadu(indexes + right); + } else { + keys_vec = vtype1::loadu(keys + left); + indexes_vec = vtype2::loadu(indexes + left); + left += vtype1::numlanes; + } + // partition the current vector and save it on both sides of the array + int32_t amount_gt_pivot; + + amount_gt_pivot = partition_vec( + keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec, + indexes_vec, pivot_vec, &min_vec, &max_vec); + r_store -= amount_gt_pivot; + l_store += (vtype1::numlanes - amount_gt_pivot); + } + + /* partition and save vec_left and vec_right */ + int32_t amount_gt_pivot; + amount_gt_pivot = partition_vec( + keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec_left, + indexes_vec_left, pivot_vec, &min_vec, &max_vec); + l_store += (vtype1::numlanes - amount_gt_pivot); + amount_gt_pivot = partition_vec( + keys, indexes, l_store, l_store + vtype1::numlanes, keys_vec_right, + indexes_vec_right, pivot_vec, &min_vec, &max_vec); + l_store += (vtype1::numlanes - amount_gt_pivot); + *smallest = vtype1::reducemin(min_vec); + *biggest = vtype1::reducemax(max_vec); + return l_store; +} +#endif // AVX512_QSORT_COMMON diff --git a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp new file mode 100644 index 0000000000000..ec436bb49eee6 --- /dev/null +++ b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Intel x86-simd-sort source code. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" + +#define DLL_PUBLIC __attribute__((visibility("default"))) + +extern "C" { + + DLL_PUBLIC void avx512_sort_int(int32_t *array_fromIndex, int64_t fromIndex, + int64_t toIndex) { + avx512_qsort(array_fromIndex, toIndex - fromIndex); + } + + DLL_PUBLIC void avx512_sort_long(int64_t *array_fromIndex, int64_t fromIndex, + int64_t toIndex) { + avx512_qsort(array_fromIndex, toIndex - fromIndex); + } + + DLL_PUBLIC void avx512_sort_float(float *array_fromIndex, int64_t fromIndex, + int64_t toIndex) { + avx512_qsort(array_fromIndex, toIndex - fromIndex); + } + + DLL_PUBLIC void avx512_sort_double(double *array_fromIndex, int64_t fromIndex, + int64_t toIndex) { + avx512_qsort(array_fromIndex, toIndex - fromIndex); + } + +} diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 9d1034e6fef6c..927def142476d 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ 
-78,6 +78,26 @@ public final class Arrays { // Suppresses default constructor, ensuring non-instantiability. private Arrays() {} + @IntrinsicCandidate + private static void arraySort(int[] array, int fromIndex, int toIndex) { + DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); + } + + @IntrinsicCandidate + private static void arraySort(long[] array, int fromIndex, int toIndex) { + DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); + } + + @IntrinsicCandidate + private static void arraySort(float[] array, int fromIndex, int toIndex) { + DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); + } + + @IntrinsicCandidate + private static void arraySort(double[] array, int fromIndex, int toIndex) { + DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); + } + /* * Sorting methods. Note that all public "sort" methods take the * same form: performing argument checks if necessary, and then @@ -97,7 +117,7 @@ private Arrays() {} * @param a the array to be sorted */ public static void sort(int[] a) { - DualPivotQuicksort.sort(a, 0, 0, a.length); + arraySort(a, 0, a.length); } /** @@ -121,7 +141,7 @@ public static void sort(int[] a) { */ public static void sort(int[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); + arraySort(a, fromIndex, toIndex); } /** @@ -135,7 +155,7 @@ public static void sort(int[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(long[] a) { - DualPivotQuicksort.sort(a, 0, 0, a.length); + arraySort(a, 0, a.length); } /** @@ -159,7 +179,7 @@ public static void sort(long[] a) { */ public static void sort(long[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); + arraySort(a, fromIndex, toIndex); } /** @@ -295,7 +315,7 @@ public static void sort(byte[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void 
sort(float[] a) { - DualPivotQuicksort.sort(a, 0, 0, a.length); + arraySort(a, 0, a.length); } /** @@ -327,7 +347,7 @@ public static void sort(float[] a) { */ public static void sort(float[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); + arraySort(a, fromIndex, toIndex); } /** @@ -349,7 +369,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(double[] a) { - DualPivotQuicksort.sort(a, 0, 0, a.length); + arraySort(a, 0, a.length); } /** @@ -381,7 +401,7 @@ public static void sort(double[] a) { */ public static void sort(double[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); + arraySort(a, fromIndex, toIndex); } /** diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java new file mode 100644 index 0000000000000..2780a70b66926 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.lang; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OperationsPerInvocation; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +import java.util.Arrays; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.io.UnsupportedEncodingException; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Method; + +/** + * Performance test of Arrays.sort() methods + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 3, time=60) +@Measurement(iterations = 3, time=120) +@Fork(value = 1) +public class ArraysSort { + + + @Param({"10", "100", "1000", "10000", "100000", "1000000"}) + private int size; + + private int[] ints_unsorted; + private long[] longs_unsorted; + private float[] floats_unsorted; + private double[] doubles_unsorted; + + private int[] ints_sorted; + private long[] longs_sorted; + private float[] floats_sorted; + private double[] doubles_sorted; + + + @Setup + public void setup() throws UnsupportedEncodingException, 
ClassNotFoundException, NoSuchMethodException, Throwable { + Random rnd = new Random(42); + + ints_unsorted = new int[size]; + longs_unsorted = new long[size]; + floats_unsorted = new float[size]; + doubles_unsorted = new double[size]; + + for (int i = 0; i < size; i++) { + ints_unsorted[i] = rnd.nextInt(); + longs_unsorted[i] = rnd.nextLong(); + floats_unsorted[i] = rnd.nextFloat(); + doubles_unsorted[i] = rnd.nextDouble(); + } + } + + @Benchmark + public void intSort() throws Throwable { + ints_sorted = ints_unsorted.clone(); + Arrays.sort(ints_sorted); + } + + @Benchmark + public void longSort() throws Throwable { + longs_sorted = longs_unsorted.clone(); + Arrays.sort(longs_sorted); + } + + @Benchmark + public void floatSort() throws Throwable { + floats_sorted = floats_unsorted.clone(); + Arrays.sort(floats_sorted); + } + + @Benchmark + public void doubleSort() throws Throwable { + doubles_sorted = doubles_unsorted.clone(); + Arrays.sort(doubles_sorted); + } + +} From 923a7cae3d328a76f50354202243e0a592535469 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 30 May 2023 12:54:54 -0700 Subject: [PATCH 02/40] remove libstdc++ --- make/modules/java.base/Lib.gmk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 5ec5d03d59c07..4cbd39546261c 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -244,7 +244,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \ LDFLAGS_windows := -defaultlib:msvcrt, \ LIBS := $(LIBCXX), \ - LIBS_linux := -lc -lm -ldl -lstdc++, \ + LIBS_linux := -lc -lm -ldl, \ )) TARGETS += $(BUILD_LIBAVX512_X86_64) From 30a50d998b0e7276ceefb102def660ed449fbe2d Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Wed, 31 May 2023 18:09:09 -0700 Subject: [PATCH 03/40] fix license --- .../linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp | 5 ++++- 
.../linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp | 4 +++- .../linux/native/libavx512_x86_64/avx512-common-qsort.h | 5 ++++- test/micro/org/openjdk/bench/java/util/ArraysSort.java | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp index 05efac20cbdb2..d2240b29292f9 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Intel x86-simd-sort source code. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -23,6 +23,9 @@ * questions. * */ + +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + #ifndef AVX512_QSORT_32BIT #define AVX512_QSORT_32BIT diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp index 893f2ce8363c8..1b4cb0a1936a0 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Intel x86-simd-sort source code. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
@@ -24,6 +24,8 @@ * */ +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + #ifndef AVX512_QSORT_64BIT #define AVX512_QSORT_64BIT diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h index b477f9e65c233..3c5806db607d2 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h @@ -1,5 +1,6 @@ /* - * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. * Intel x86-simd-sort source code. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -24,6 +25,8 @@ * */ +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + #ifndef AVX512_QSORT_COMMON #define AVX512_QSORT_COMMON diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index 2780a70b66926..2bc213977aac1 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -56,7 +56,7 @@ public class ArraysSort { - @Param({"10", "100", "1000", "10000", "100000", "1000000"}) + @Param({"100", "1000", "10000", "100000"}) private int size; private int[] ints_unsorted; From a7c2b6e9add098c933d4936ec35008a7cc657739 Mon Sep 17 00:00:00 2001 From: Srinivas Vamsi Parasa Date: Thu, 1 Jun 2023 08:52:04 -0700 Subject: [PATCH 04/40] Update test/micro/org/openjdk/bench/java/util/ArraysSort.java Co-authored-by: Andrew Haley --- test/micro/org/openjdk/bench/java/util/ArraysSort.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index 2bc213977aac1..2e66bf6291be7 
100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -56,7 +56,7 @@ public class ArraysSort { - @Param({"100", "1000", "10000", "100000"}) + @Param({"10","25","50","75","100", "1000", "10000", "100000"}) private int size; private int[] ints_unsorted; From 1dc9589eb084049a5fb585458f0f50c524e604d2 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 1 Jun 2023 10:16:33 -0700 Subject: [PATCH 05/40] fix license in one file --- .../linux/native/libavx512_x86_64/avx512-64bit-common.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h index 88fee99c0d79e..c435d100e7579 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Intel x86-simd-sort source code. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
@@ -24,6 +24,8 @@ * */ +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + #ifndef AVX512_64BIT_COMMON #define AVX512_64BIT_COMMON #include "avx512-common-qsort.h" From 53a5309dc643e2f476364566a0d1b8f8404817ef Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 23 Jun 2023 14:06:41 -0700 Subject: [PATCH 06/40] replace multiple intrinsics with one general intrinsic --- make/modules/java.base/Lib.gmk | 1 - src/hotspot/share/classfile/vmIntrinsics.hpp | 12 ++---- src/hotspot/share/opto/c2compiler.cpp | 5 +-- src/hotspot/share/opto/library_call.cpp | 34 +++++----------- .../share/classes/java/util/Arrays.java | 39 +++++++------------ .../openjdk/bench/java/util/ArraysSort.java | 25 ++++++++---- 6 files changed, 44 insertions(+), 72 deletions(-) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 466a79b0c4d50..a7546fee64018 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -244,7 +244,6 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) LDFLAGS := $(LDFLAGS_JDKLIB) \ $(call SET_SHARED_LIBRARY_ORIGIN), \ LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \ - LDFLAGS_windows := -defaultlib:msvcrt, \ LIBS := $(LIBCXX), \ LIBS_linux := -lc -lm -ldl, \ )) diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index de02d4bad0092..6fd273203bb23 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -341,15 +341,9 @@ class methodHandle; do_name( copyOf_name, "copyOf") \ do_signature(copyOf_signature, "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;") \ \ - do_intrinsic(_arraySortI, java_util_Arrays, arraySort_name, arraySortI_signature, F_S) \ - do_name( arraySort_name, "arraySort") \ - do_signature(arraySortI_signature, "([III)V") \ - do_intrinsic(_arraySortL, java_util_Arrays, arraySort_name, arraySortL_signature, F_S) \ - 
do_signature(arraySortL_signature, "([JII)V") \ - do_intrinsic(_arraySortF, java_util_Arrays, arraySort_name, arraySortF_signature, F_S) \ - do_signature(arraySortF_signature, "([FII)V") \ - do_intrinsic(_arraySortD, java_util_Arrays, arraySort_name, arraySortD_signature, F_S) \ - do_signature(arraySortD_signature, "([DII)V") \ + do_intrinsic(_arraySort, java_util_Arrays, arraySort_name, arraySort_signature, F_S) \ + do_name( arraySort_name, "arraySort") \ + do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;II)V") \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ do_name( copyOfRange_name, "copyOfRange") \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index c904c49d22046..8224d3efe41ee 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -575,10 +575,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) { case vmIntrinsics::_min_strict: case vmIntrinsics::_max_strict: case vmIntrinsics::_arraycopy: - case vmIntrinsics::_arraySortI: - case vmIntrinsics::_arraySortL: - case vmIntrinsics::_arraySortF: - case vmIntrinsics::_arraySortD: + case vmIntrinsics::_arraySort: case vmIntrinsics::_indexOfL: case vmIntrinsics::_indexOfU: case vmIntrinsics::_indexOfUL: diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 21eb6b4483064..3e18246ff7509 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -292,10 +292,7 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_arraycopy: return inline_arraycopy(); - case vmIntrinsics::_arraySortI: - case vmIntrinsics::_arraySortL: - case vmIntrinsics::_arraySortF: - case vmIntrinsics::_arraySortD: return inline_arraysort(intrinsic_id()); + case vmIntrinsics::_arraySort: return inline_arraysort(intrinsic_id()); case vmIntrinsics::_compareToL: return 
inline_string_compareTo(StrIntrinsicNode::LL); case vmIntrinsics::_compareToU: return inline_string_compareTo(StrIntrinsicNode::UU); @@ -5203,32 +5200,19 @@ bool LibraryCallKit::inline_arraysort(vmIntrinsics::ID id) { address stubAddr = nullptr; const char *stubName; stubName = "arraysort_stub"; - BasicType bt; - switch(id) { - case vmIntrinsics::_arraySortI: - bt = T_INT; - break; - case vmIntrinsics::_arraySortL: - bt = T_LONG; - break; - case vmIntrinsics::_arraySortF: - bt = T_FLOAT; - break; - case vmIntrinsics::_arraySortD: - bt = T_DOUBLE; - break; - default: - break; - } + Node* elementType = argument(0); + Node* array = argument(1); + Node* fromIndex = argument(2); + Node* toIndex = argument(3); + + const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); + ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); + BasicType bt = elem_type->basic_type(); stubAddr = StubRoutines::select_arraysort_function(bt); if (stubAddr == nullptr) return false; - Node* array = argument(0); - Node* fromIndex = argument(1); - Node* toIndex = argument(2); - array = must_be_not_null(array, true); const TypeAryPtr* array_type = array->Value(&_gvn)->isa_aryptr(); diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 927def142476d..2691a7180caca 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -79,23 +79,12 @@ public final class Arrays { private Arrays() {} @IntrinsicCandidate - private static void arraySort(int[] array, int fromIndex, int toIndex) { - DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); - } - - @IntrinsicCandidate - private static void arraySort(long[] array, int fromIndex, int toIndex) { - DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); - } - - @IntrinsicCandidate - private static void arraySort(float[] array, int fromIndex, int toIndex) { - DualPivotQuicksort.sort(array, 0, 
fromIndex, toIndex); - } - - @IntrinsicCandidate - private static void arraySort(double[] array, int fromIndex, int toIndex) { - DualPivotQuicksort.sort(array, 0, fromIndex, toIndex); + private static void arraySort(Class elemType, Object array, int fromIndex, int toIndex) { + if (elemType == int.class) DualPivotQuicksort.sort((int[]) array, 0, fromIndex, toIndex); + else if (elemType == long.class) DualPivotQuicksort.sort((long[]) array, 0, fromIndex, toIndex); + else if (elemType == float.class) DualPivotQuicksort.sort((float[]) array, 0, fromIndex, toIndex); + else if (elemType == double.class) DualPivotQuicksort.sort((double[]) array, 0, fromIndex, toIndex); + else throw new UnsupportedOperationException("arraySort intrinsic not supported for this type: " + elemType.toString()); } /* @@ -117,7 +106,7 @@ private static void arraySort(double[] array, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(int[] a) { - arraySort(a, 0, a.length); + arraySort(int.class, a, 0, a.length); } /** @@ -141,7 +130,7 @@ public static void sort(int[] a) { */ public static void sort(int[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(a, fromIndex, toIndex); + arraySort(int.class, a, fromIndex, toIndex); } /** @@ -155,7 +144,7 @@ public static void sort(int[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(long[] a) { - arraySort(a, 0, a.length); + arraySort(long.class, a, 0, a.length); } /** @@ -179,7 +168,7 @@ public static void sort(long[] a) { */ public static void sort(long[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(a, fromIndex, toIndex); + arraySort(long.class, a, fromIndex, toIndex); } /** @@ -315,7 +304,7 @@ public static void sort(byte[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(float[] a) { - arraySort(a, 0, a.length); + arraySort(float.class, a, 0, 
a.length); } /** @@ -347,7 +336,7 @@ public static void sort(float[] a) { */ public static void sort(float[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(a, fromIndex, toIndex); + arraySort(float.class, a, fromIndex, toIndex); } /** @@ -369,7 +358,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(double[] a) { - arraySort(a, 0, a.length); + arraySort(double.class, a, 0, a.length); } /** @@ -401,7 +390,7 @@ public static void sort(double[] a) { */ public static void sort(double[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(a, fromIndex, toIndex); + arraySort(double.class, a, fromIndex, toIndex); } /** diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index 2e66bf6291be7..48d03db2d60a1 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -33,6 +33,7 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; @@ -87,28 +88,36 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException, } } - @Benchmark - public void intSort() throws Throwable { + @Setup(Level.Invocation) + public void init() { ints_sorted = ints_unsorted.clone(); + longs_sorted = longs_unsorted.clone(); + floats_sorted = floats_unsorted.clone(); + doubles_sorted = doubles_unsorted.clone(); + } + + @Benchmark + public int[] intSort() throws Throwable { Arrays.sort(ints_sorted); + return ints_sorted; } @Benchmark - public void longSort() throws Throwable { - longs_sorted = longs_unsorted.clone(); + public long[] longSort() throws Throwable { Arrays.sort(longs_sorted); + 
return longs_sorted; } @Benchmark - public void floatSort() throws Throwable { - floats_sorted = floats_unsorted.clone(); + public float[] floatSort() throws Throwable { Arrays.sort(floats_sorted); + return floats_sorted; } @Benchmark - public void doubleSort() throws Throwable { - doubles_sorted = doubles_unsorted.clone(); + public double[] doubleSort() throws Throwable { Arrays.sort(doubles_sorted); + return doubles_sorted; } } From 2bd0419167c889048f1186a8ef72cf2761599603 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 27 Jun 2023 09:53:32 -0700 Subject: [PATCH 07/40] minor cleanups --- src/hotspot/share/opto/library_call.cpp | 4 ++-- src/hotspot/share/opto/library_call.hpp | 2 +- test/micro/org/openjdk/bench/java/util/ArraysSort.java | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 3e18246ff7509..981606ef701cb 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -292,7 +292,7 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_arraycopy: return inline_arraycopy(); - case vmIntrinsics::_arraySort: return inline_arraysort(intrinsic_id()); + case vmIntrinsics::_arraySort: return inline_arraysort(); case vmIntrinsics::_compareToL: return inline_string_compareTo(StrIntrinsicNode::LL); case vmIntrinsics::_compareToU: return inline_string_compareTo(StrIntrinsicNode::UU); @@ -5195,7 +5195,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_ } //------------------------------inline_arraysort----------------------- -bool LibraryCallKit::inline_arraysort(vmIntrinsics::ID id) { +bool LibraryCallKit::inline_arraysort() { address stubAddr = nullptr; const char *stubName; diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index 52725e87080f1..53d697f6b2078 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ 
b/src/hotspot/share/opto/library_call.hpp @@ -279,7 +279,7 @@ class LibraryCallKit : public GraphKit { JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp); void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp, uint new_idx); - bool inline_arraysort(vmIntrinsics::ID id); + bool inline_arraysort(); typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind; bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind); bool inline_unsafe_fence(vmIntrinsics::ID id); diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index 48d03db2d60a1..cfe59e34e801e 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it From e09c05015afe176c6699f9fe7f95caccf86eb952 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 25 Jul 2023 12:19:14 -0700 Subject: [PATCH 08/40] change API to enable MemorySegment --- src/hotspot/share/classfile/vmIntrinsics.hpp | 2 +- src/hotspot/share/opto/library_call.cpp | 28 ++++------- .../libavx512_x86_64/avx512-32bit-qsort.hpp | 22 +++++---- .../libavx512_x86_64/avx512-64bit-qsort.hpp | 21 ++++---- .../libavx512_x86_64/avx512-common-qsort.h | 48 +++++++++++++++---- .../share/classes/java/util/Arrays.java | 38 +++++++++++---- 6 files changed, 106 insertions(+), 53 deletions(-) diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 6fd273203bb23..aa9f9660bbba4 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -343,7 +343,7 @@ class methodHandle; \ do_intrinsic(_arraySort, java_util_Arrays, arraySort_name, arraySort_signature, F_S) \ do_name( arraySort_name, "arraySort") \ - do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;II)V") \ + do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII)V") \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ do_name( copyOfRange_name, "copyOfRange") \ diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 981606ef701cb..132835ba4aff8 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -5201,35 +5201,27 @@ bool LibraryCallKit::inline_arraysort() { const char *stubName; stubName = "arraysort_stub"; - Node* elementType = argument(0); - Node* array = argument(1); - Node* fromIndex = argument(2); - Node* toIndex = argument(3); + Node* elementType = null_check(argument(0)); + Node* obj = argument(1); + Node* offset = argument(2); + Node* fromIndex = 
argument(4); + Node* toIndex = argument(5); const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); - stubAddr = StubRoutines::select_arraysort_function(bt); if (stubAddr == nullptr) return false; - array = must_be_not_null(array, true); - - const TypeAryPtr* array_type = array->Value(&_gvn)->isa_aryptr(); - assert(array_type != nullptr && array_type->elem() != Type::BOTTOM, "args are strange"); - - // for the quick and dirty code we will skip all the checks. - // we are just trying to get the call to be generated. - Node* array_fromIndex = array; - if (fromIndex != nullptr || toIndex != nullptr) { - assert(fromIndex != nullptr && toIndex != nullptr, ""); - array_fromIndex = array_element_address(array, fromIndex, bt); + const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr(); + if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) { + return false; // failed input validation } - + Node* obj_adr = make_unsafe_address(obj, offset); // Call the stub. make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(), stubAddr, stubName, TypePtr::BOTTOM, - array_fromIndex, fromIndex, toIndex); + obj_adr, fromIndex, toIndex); return true; } diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp index d2240b29292f9..9c8f1af6a9caf 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. * Intel x86-simd-sort source code. * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
@@ -550,7 +551,8 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize, } template <> -void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize) { +void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize, + bool hasnan) { if (arrsize > 1) { qselect_32bit_, int32_t>( arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); @@ -558,7 +560,8 @@ void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize) { } template <> -void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize) { +void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize, + bool hasnan) { if (arrsize > 1) { qselect_32bit_, uint32_t>( arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); @@ -566,12 +569,15 @@ void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize) { } template <> -void avx512_qselect(float *arr, int64_t k, int64_t arrsize) { - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qselect_32bit_, float>(arr, k, 0, arrsize - 1, - 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); +void avx512_qselect(float *arr, int64_t k, int64_t arrsize, + bool hasnan) { + int64_t indx_last_elem = arrsize - 1; + if (UNLIKELY(hasnan)) { + indx_last_elem = move_nans_to_end_of_array(arr, arrsize); + } + if (indx_last_elem >= k) { + qselect_32bit_, float>( + arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); } } diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp index 1b4cb0a1936a0..2d01663923a9b 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp @@ -783,7 +783,8 @@ static void qselect_64bit_(type_t *arr, int64_t pos, int64_t left, } template <> -void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize) { +void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize, + bool hasnan) { if (arrsize > 
1) { qselect_64bit_, int64_t>( arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); @@ -791,7 +792,8 @@ void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize) { } template <> -void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize) { +void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize, + bool hasnan) { if (arrsize > 1) { qselect_64bit_, uint64_t>( arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); @@ -799,12 +801,15 @@ void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize) { } template <> -void avx512_qselect(double *arr, int64_t k, int64_t arrsize) { - if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qselect_64bit_, double>(arr, k, 0, arrsize - 1, - 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); +void avx512_qselect(double *arr, int64_t k, int64_t arrsize, + bool hasnan) { + int64_t indx_last_elem = arrsize - 1; + if (UNLIKELY(hasnan)) { + indx_last_elem = move_nans_to_end_of_array(arr, arrsize); + } + if (indx_last_elem >= k) { + qselect_64bit_, double>( + arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); } } diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h index 3c5806db607d2..9eb09689f72c2 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h @@ -56,12 +56,11 @@ * */ -#include - #include #include #include #include +#include #include #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() @@ -107,6 +106,9 @@ #define X86_SIMD_SORT_FINLINE static #endif +#define LIKELY(x) __builtin_expect((x), 1) +#define UNLIKELY(x) __builtin_expect((x), 0) + template struct zmm_vector; @@ -119,17 +121,19 @@ void avx512_qsort(T *arr, int64_t arrsize); void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); template -void avx512_qselect(T *arr, int64_t k, int64_t arrsize); 
-void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize); +void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); +void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize, + bool hasnan = false); template -inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) { - avx512_qselect(arr, k - 1, arrsize); +inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize, + bool hasnan = false) { + avx512_qselect(arr, k - 1, arrsize, hasnan); avx512_qsort(arr, k - 1); } -inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, - int64_t arrsize) { - avx512_qselect_fp16(arr, k - 1, arrsize); +inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize, + bool hasnan = false) { + avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); avx512_qsort_fp16(arr, k - 1); } @@ -137,6 +141,32 @@ inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, template void avx512_qsort_kv(T *keys, uint64_t *indexes, int64_t arrsize); +template +bool is_a_nan(T elem) { + return std::isnan(elem); +} + +/* + * Sort all the NAN's to end of the array and return the index of the last elem + * in the array which is not a nan + */ +template +int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) { + int64_t jj = arrsize - 1; + int64_t ii = 0; + int64_t count = 0; + while (ii <= jj) { + if (is_a_nan(arr[ii])) { + std::swap(arr[ii], arr[jj]); + jj -= 1; + count++; + } else { + ii += 1; + } + } + return arrsize - count - 1; +} + template bool comparison_func(const T &a, const T &b) { return a < b; diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index cd1d19690562d..91bfbd6e5e388 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -30,6 +30,7 @@ import java.io.Serializable; import java.lang.reflect.Array; +import java.util.Arrays.NaturalOrder; import 
java.util.concurrent.ForkJoinPool; import java.util.function.BinaryOperator; import java.util.function.Consumer; @@ -46,6 +47,7 @@ import java.util.stream.LongStream; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.misc.Unsafe; /** * This class contains various methods for manipulating arrays (such as @@ -77,8 +79,18 @@ public final class Arrays { // Suppresses default constructor, ensuring non-instantiability. private Arrays() {} + /** + * Sorts the specified array into ascending numerical order. + * + * + * @param elemType the class of the array to be sorted + * @param array the array to be sorted + * @param offset the array offset + * @param fromIndex from Index + * @param toIndex to Index + */ @IntrinsicCandidate - private static void arraySort(Class elemType, Object array, int fromIndex, int toIndex) { + public static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex) { if (elemType == int.class) DualPivotQuicksort.sort((int[]) array, 0, fromIndex, toIndex); else if (elemType == long.class) DualPivotQuicksort.sort((long[]) array, 0, fromIndex, toIndex); else if (elemType == float.class) DualPivotQuicksort.sort((float[]) array, 0, fromIndex, toIndex); @@ -105,7 +117,8 @@ private static void arraySort(Class elemType, Object array, int fromIndex, in * @param a the array to be sorted */ public static void sort(int[] a) { - arraySort(int.class, a, 0, a.length); + int offset = Unsafe.ARRAY_INT_BASE_OFFSET; + arraySort(int.class, a, offset, 0, a.length); } /** @@ -129,7 +142,8 @@ public static void sort(int[] a) { */ public static void sort(int[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(int.class, a, fromIndex, toIndex); + int offset = Unsafe.ARRAY_INT_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_INT_INDEX_SCALE); + arraySort(int.class, a, offset, fromIndex, toIndex); } /** @@ -143,7 +157,8 @@ public static void sort(int[] a, int 
fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(long[] a) { - arraySort(long.class, a, 0, a.length); + int offset = Unsafe.ARRAY_LONG_BASE_OFFSET; + arraySort(long.class, a, offset, 0, a.length); } /** @@ -167,7 +182,8 @@ public static void sort(long[] a) { */ public static void sort(long[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(long.class, a, fromIndex, toIndex); + int offset = Unsafe.ARRAY_LONG_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_LONG_INDEX_SCALE); + arraySort(long.class, a, offset, fromIndex, toIndex); } /** @@ -303,7 +319,8 @@ public static void sort(byte[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(float[] a) { - arraySort(float.class, a, 0, a.length); + int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET; + arraySort(float.class, a, offset, 0, a.length); } /** @@ -335,7 +352,8 @@ public static void sort(float[] a) { */ public static void sort(float[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(float.class, a, fromIndex, toIndex); + int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_FLOAT_INDEX_SCALE); + arraySort(float.class, a, offset, fromIndex, toIndex); } /** @@ -357,7 +375,8 @@ public static void sort(float[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(double[] a) { - arraySort(double.class, a, 0, a.length); + int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET; + arraySort(double.class, a, offset, 0, a.length); } /** @@ -389,7 +408,8 @@ public static void sort(double[] a) { */ public static void sort(double[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - arraySort(double.class, a, fromIndex, toIndex); + int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_DOUBLE_INDEX_SCALE); + arraySort(double.class, a, offset, fromIndex, 
toIndex); } /** From 5eac7b327a2e32c27285954457c06b1e44b84756 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 25 Jul 2023 12:43:51 -0700 Subject: [PATCH 09/40] update arraySort docstring --- src/java.base/share/classes/java/util/Arrays.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 91bfbd6e5e388..bd198960ea2b9 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -85,9 +85,11 @@ private Arrays() {} * * @param elemType the class of the array to be sorted * @param array the array to be sorted - * @param offset the array offset - * @param fromIndex from Index - * @param toIndex to Index + * @param offset the relative offset, in bytes, from the base address of + * the array to sort, otherwise if the array is {@code null},an absolute + * address pointing to the first element to sort from. 
+ * @param fromIndex the index of the first element, inclusive, to be sorted + * @param toIndex the index of the last element, exclusive, to be sorted */ @IntrinsicCandidate public static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex) { From 240fde1840e9f55500cfbf4bc8b067b69786d5fe Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 25 Jul 2023 13:24:05 -0700 Subject: [PATCH 10/40] add special cases to float and double arrays --- .../org/openjdk/bench/java/util/ArraysSort.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index cfe59e34e801e..a77e025602289 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -80,11 +80,20 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException, floats_unsorted = new float[size]; doubles_unsorted = new double[size]; + float[] floatSpecialCases = {+0.0f, -0.0f, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY, Float.NaN}; + double[] doubleSpecialCases = {+0.0, -0.0, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NaN}; + for (int i = 0; i < size; i++) { ints_unsorted[i] = rnd.nextInt(); longs_unsorted[i] = rnd.nextLong(); - floats_unsorted[i] = rnd.nextFloat(); - doubles_unsorted[i] = rnd.nextDouble(); + if (i % 10 != 0) { + floats_unsorted[i] = rnd.nextFloat(); + doubles_unsorted[i] = rnd.nextDouble(); + } else { + int rndIdx = rnd.nextInt(doubleSpecialCases.length); + floats_unsorted[i] = floatSpecialCases[rndIdx]; + doubles_unsorted[i] = doubleSpecialCases[rndIdx]; + } } } From 17b51270a84beee1e053f6ab031b5ecc5706303c Mon Sep 17 00:00:00 2001 From: Srinivas Vamsi Parasa Date: Tue, 1 Aug 2023 10:46:33 -0700 Subject: [PATCH 11/40] Update src/java.base/share/classes/java/util/Arrays.java Co-authored-by: David Schlosnagle --- 
src/java.base/share/classes/java/util/Arrays.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index bd198960ea2b9..13def8cd35261 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -93,11 +93,14 @@ private Arrays() {} */ @IntrinsicCandidate public static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex) { - if (elemType == int.class) DualPivotQuicksort.sort((int[]) array, 0, fromIndex, toIndex); - else if (elemType == long.class) DualPivotQuicksort.sort((long[]) array, 0, fromIndex, toIndex); - else if (elemType == float.class) DualPivotQuicksort.sort((float[]) array, 0, fromIndex, toIndex); - else if (elemType == double.class) DualPivotQuicksort.sort((double[]) array, 0, fromIndex, toIndex); - else throw new UnsupportedOperationException("arraySort intrinsic not supported for this type: " + elemType.toString()); + switch (array) { + case int[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); + case long[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); + case float[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); + case double[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); + default -> throw new UnsupportedOperationException( + "arraySort intrinsic not supported for this type: " + elemType); + } } /* From a2e14d45b24af5412fc52129ab5f9953e87a63d6 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 4 Aug 2023 11:18:30 -0700 Subject: [PATCH 12/40] fix arraySort API and fastdebug issue --- src/hotspot/share/opto/escape.cpp | 1 + src/java.base/share/classes/java/util/Arrays.java | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp index 138511cffc9ca..eed159ff6c8e7 100644 --- 
a/src/hotspot/share/opto/escape.cpp +++ b/src/hotspot/share/opto/escape.cpp @@ -1190,6 +1190,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "bigIntegerRightShiftWorker") == 0 || strcmp(call->as_CallLeaf()->_name, "bigIntegerLeftShiftWorker") == 0 || strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0 || + strcmp(call->as_CallLeaf()->_name, "arraysort_stub") == 0 || strcmp(call->as_CallLeaf()->_name, "get_class_id_intrinsic") == 0) ))) { call->dump(); diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 13def8cd35261..d5ce85d336074 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -83,7 +83,7 @@ private Arrays() {} * Sorts the specified array into ascending numerical order. * * - * @param elemType the class of the array to be sorted + * @param elemType the class of the elements of the array to be sorted * @param array the array to be sorted * @param offset the relative offset, in bytes, from the base address of * the array to sort, otherwise if the array is {@code null},an absolute @@ -92,7 +92,7 @@ private Arrays() {} * @param toIndex the index of the last element, exclusive, to be sorted */ @IntrinsicCandidate - public static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex) { + private static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex) { switch (array) { case int[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); case long[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); From 7065f1cf6126224c7f64f3f26fc4446df0913995 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 4 Aug 2023 11:34:30 -0700 Subject: [PATCH 13/40] moved stubroutines definitions to vmStructs_jvmci.cpp --- src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 4 ++++ 
src/hotspot/share/runtime/vmStructs.cpp | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index a4195a04f1866..86753a501d2e7 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -327,6 +327,10 @@ static_field(StubRoutines, _checkcast_arraycopy_uninit, address) \ static_field(StubRoutines, _unsafe_arraycopy, address) \ static_field(StubRoutines, _generic_arraycopy, address) \ + static_field(StubRoutines, _arraysort_int, address) \ + static_field(StubRoutines, _arraysort_long, address) \ + static_field(StubRoutines, _arraysort_float, address) \ + static_field(StubRoutines, _arraysort_double, address) \ \ static_field(StubRoutines, _aescrypt_encryptBlock, address) \ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 54f500f502a1d..537f063a2b13d 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -529,10 +529,6 @@ /***********************************/ \ \ static_field(StubRoutines, _call_stub_return_address, address) \ - static_field(StubRoutines, _arraysort_int, address) \ - static_field(StubRoutines, _arraysort_long, address) \ - static_field(StubRoutines, _arraysort_float, address) \ - static_field(StubRoutines, _arraysort_double, address) \ \ /***************************************/ \ /* PcDesc and other compiled code info */ \ From 37f3c52728e649c72ffd3c2147af9db05a4d128e Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 4 Aug 2023 15:26:01 -0700 Subject: [PATCH 14/40] Update avx512 sort, benchmarks, shenandoahSupport --- make/modules/java.base/Lib.gmk | 4 +- .../gc/shenandoah/c2/shenandoahSupport.cpp | 3 + .../libavx512_x86_64/avx512-32bit-qsort.hpp | 57 ------ .../libavx512_x86_64/avx512-64bit-common.h | 152 --------------- 
.../libavx512_x86_64/avx512-64bit-qsort.hpp | 68 ------- .../libavx512_x86_64/avx512-common-qsort.h | 184 ------------------ .../libavx512_x86_64/avxsort_linux_x86.cpp | 1 + .../openjdk/bench/java/util/ArraysSort.java | 60 +++++- 8 files changed, 56 insertions(+), 473 deletions(-) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 0e0f7b2a134c1..1e86c2541e59a 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -240,8 +240,8 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) $(eval $(call SetupJdkLibrary, BUILD_LIBAVX512_X86_64, \ NAME := avx512_x86_64, \ OPTIMIZATION := HIGH, \ - CFLAGS := $(CFLAGS_JDKLIB) -mavx512f -mavx512dq, \ - CXXFLAGS := $(CXXFLAGS_JDKLIB) -mavx512f -mavx512dq, \ + CFLAGS := $(CFLAGS_JDKLIB), \ + CXXFLAGS := $(CXXFLAGS_JDKLIB), \ LDFLAGS := $(LDFLAGS_JDKLIB) \ $(call SET_SHARED_LIBRARY_ORIGIN), \ LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \ diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp index 71068f76043c9..10b2fe13ab529 100644 --- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp +++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp @@ -387,6 +387,9 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) { verify_type t; } args[6]; } calls[] = { + "arraysort_stub", + { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { TypeFunc::Parms+2, ShenandoahLoad }, + { TypeFunc::Parms+4, ShenandoahLoad }, { TypeFunc::Parms+5, ShenandoahLoad } }, "aescrypt_encryptBlock", { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { TypeFunc::Parms+2, ShenandoahLoad }, { -1, ShenandoahNone}, { -1, ShenandoahNone}, { -1, ShenandoahNone} }, diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp index 
9c8f1af6a9caf..2b4525b22a4ce 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp @@ -496,34 +496,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, qsort_32bit_(arr, pivot_index, right, max_iters - 1); } -template -static void qselect_32bit_(type_t *arr, int64_t pos, int64_t left, - int64_t right, int64_t max_iters) { - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - type_t pivot = get_pivot_32bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( - arr, left, right + 1, pivot, &smallest, &biggest); - if ((pivot != smallest) && (pos < pivot_index)) - qselect_32bit_(arr, pos, left, pivot_index - 1, max_iters - 1); - else if ((pivot != biggest) && (pos >= pivot_index)) - qselect_32bit_(arr, pos, pivot_index, right, max_iters - 1); -} X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) { int64_t nan_count = 0; @@ -550,36 +523,6 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize, } } -template <> -void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize, - bool hasnan) { - if (arrsize > 1) { - qselect_32bit_, int32_t>( - arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize, - bool hasnan) { - if (arrsize > 1) { - qselect_32bit_, uint32_t>( - arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qselect(float *arr, int64_t k, int64_t arrsize, - bool hasnan) { - int64_t indx_last_elem = arrsize - 1; - if 
(UNLIKELY(hasnan)) { - indx_last_elem = move_nans_to_end_of_array(arr, arrsize); - } - if (indx_last_elem >= k) { - qselect_32bit_, float>( - arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); - } -} template <> void avx512_qsort(int32_t *arr, int64_t arrsize) { diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h index c435d100e7579..b8b17c68afbdf 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h @@ -142,96 +142,6 @@ struct ymm_vector { } }; template <> -struct ymm_vector { - using type_t = uint32_t; - using zmm_t = __m256i; - using zmmi_t = __m256i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { return _mm256_set1_epi32(type_max()); } - - static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, - int v8) { - return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); - } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) { - return _kxor_mask8(x, y); - } - static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(zmm_t x, zmm_t y) { - return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_LE); - } - static opmask_t ge(zmm_t x, zmm_t y) { - return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static opmask_t eq(zmm_t x, zmm_t y) { - return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ); - } - template - static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, - void const *base) { - return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); - } - template - static zmm_t i64gather(__m512i index, void const *base) { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t loadu(void const *mem) { - return _mm256_loadu_si256((__m256i 
*)mem); - } - static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { - return _mm256_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t maskz_loadu(opmask_t mask, void const *mem) { - return _mm256_maskz_loadu_epi32(mask, mem); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { - return _mm256_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { - return _mm256_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { - return _mm256_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epu32(x, y); } - static zmm_t permutexvar(__m256i idx, zmm_t zmm) { - return _mm256_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { - __m128i v128 = _mm_max_epu32(_mm256_castsi256_si128(v), - _mm256_extracti128_si256(v, 1)); - __m128i v64 = _mm_max_epu32( - v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); - __m128i v32 = - _mm_max_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); - return (type_t)_mm_cvtsi128_si32(v32); - } - static type_t reducemin(zmm_t v) { - __m128i v128 = _mm_min_epu32(_mm256_castsi256_si128(v), - _mm256_extracti128_si256(v, 1)); - __m128i v64 = _mm_min_epu32( - v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); - __m128i v32 = - _mm_min_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); - return (type_t)_mm_cvtsi128_si32(v32); - } - static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); } - template - static zmm_t shuffle(zmm_t zmm) { - /* Hack!: have to make shuffles within 128-bit lanes work for both - * 32-bit and 64-bit */ - return _mm256_shuffle_epi32(zmm, 0b10110001); - } - static void storeu(void *mem, zmm_t x) { - _mm256_storeu_si256((__m256i *)mem, x); - } -}; -template <> struct ymm_vector { using type_t = int32_t; using zmm_t = __m256i; @@ -397,68 +307,6 @@ 
struct zmm_vector { static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); } }; template <> -struct zmm_vector { - using type_t = uint64_t; - using zmm_t = __m512i; - using zmmi_t = __m512i; - using ymm_t = __m512i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() { return X86_SIMD_SORT_MAX_UINT64; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { return _mm512_set1_epi64(type_max()); } - - static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, - int v8) { - return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); - } - template - static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, - void const *base) { - return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); - } - template - static zmm_t i64gather(__m512i index, void const *base) { - return _mm512_i64gather_epi64(index, base, scale); - } - static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t ge(zmm_t x, zmm_t y) { - return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); - } - static opmask_t eq(zmm_t x, zmm_t y) { - return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu64(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { - return _mm512_mask_compressstoreu_epi64(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { - return _mm512_mask_loadu_epi64(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { - return _mm512_mask_mov_epi64(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { - return _mm512_mask_storeu_epi64(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu64(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) { - return _mm512_permutexvar_epi64(idx, zmm); - } - 
static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu64(v); } - static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu64(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); } - template - static zmm_t shuffle(zmm_t zmm) { - __m512d temp = _mm512_castsi512_pd(zmm); - return _mm512_castpd_si512( - _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); - } - static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); } -}; -template <> struct zmm_vector { using type_t = double; using zmm_t = __m512d; diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp index 2d01663923a9b..45497e268a3c5 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp @@ -753,66 +753,6 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, qsort_64bit_(arr, pivot_index, right, max_iters - 1); } -template -static void qselect_64bit_(type_t *arr, int64_t pos, int64_t left, - int64_t right, int64_t max_iters) { - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot_64bit(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( - arr, left, right + 1, pivot, &smallest, &biggest); - if ((pivot != smallest) && (pos < pivot_index)) - qselect_64bit_(arr, pos, left, pivot_index - 1, max_iters - 1); - else if ((pivot != biggest) && (pos >= pivot_index)) - qselect_64bit_(arr, pos, pivot_index, right, max_iters - 1); -} - -template <> -void avx512_qselect(int64_t *arr, int64_t 
k, int64_t arrsize, - bool hasnan) { - if (arrsize > 1) { - qselect_64bit_, int64_t>( - arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize, - bool hasnan) { - if (arrsize > 1) { - qselect_64bit_, uint64_t>( - arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - -template <> -void avx512_qselect(double *arr, int64_t k, int64_t arrsize, - bool hasnan) { - int64_t indx_last_elem = arrsize - 1; - if (UNLIKELY(hasnan)) { - indx_last_elem = move_nans_to_end_of_array(arr, arrsize); - } - if (indx_last_elem >= k) { - qselect_64bit_, double>( - arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); - } -} - template <> void avx512_qsort(int64_t *arr, int64_t arrsize) { if (arrsize > 1) { @@ -821,14 +761,6 @@ void avx512_qsort(int64_t *arr, int64_t arrsize) { } } -template <> -void avx512_qsort(uint64_t *arr, int64_t arrsize) { - if (arrsize > 1) { - qsort_64bit_, uint64_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - template <> void avx512_qsort(double *arr, int64_t arrsize) { if (arrsize > 1) { diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h index 9eb09689f72c2..f2ad8b039070b 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h @@ -118,28 +118,6 @@ struct ymm_vector; // Regular quicksort routines: template void avx512_qsort(T *arr, int64_t arrsize); -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); - -template -void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); -void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize, - bool hasnan = false); - -template -inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize, - bool hasnan = false) { - avx512_qselect(arr, k - 1, arrsize, hasnan); - avx512_qsort(arr, k - 
1); -} -inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize, - bool hasnan = false) { - avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); - avx512_qsort_fp16(arr, k - 1); -} - -// key-value sort routines -template -void avx512_qsort_kv(T *keys, uint64_t *indexes, int64_t arrsize); template bool is_a_nan(T elem) { @@ -389,166 +367,4 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, return l_store; } -// Key-value sort helper functions - -template -static void COEX(zmm_t1 &key1, zmm_t1 &key2, zmm_t2 &index1, zmm_t2 &index2) { - zmm_t1 key_t1 = vtype1::min(key1, key2); - zmm_t1 key_t2 = vtype1::max(key1, key2); - - zmm_t2 index_t1 = - vtype2::mask_mov(index2, vtype1::eq(key_t1, key1), index1); - zmm_t2 index_t2 = - vtype2::mask_mov(index1, vtype1::eq(key_t1, key1), index2); - - key1 = key_t1; - key2 = key_t2; - index1 = index_t1; - index2 = index_t2; -} -template -static inline zmm_t1 cmp_merge(zmm_t1 in1, zmm_t1 in2, zmm_t2 &indexes1, - zmm_t2 indexes2, opmask_t mask) { - zmm_t1 tmp_keys = cmp_merge(in1, in2, mask); - indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(tmp_keys, in1), indexes1); - return tmp_keys; // 0 -> min, 1 -> max -} - -/* - * Parition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. 
- */ -template -static inline int32_t partition_vec(type_t1 *keys, type_t2 *indexes, - int64_t left, int64_t right, - const zmm_t1 keys_vec, - const zmm_t2 indexes_vec, - const zmm_t1 pivot_vec, - zmm_t1 *smallest_vec, zmm_t1 *biggest_vec) { - /* which elements are larger than the pivot */ - typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec); - int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); - vtype1::mask_compressstoreu(keys + left, vtype1::knot_opmask(gt_mask), - keys_vec); - vtype1::mask_compressstoreu(keys + right - amount_gt_pivot, gt_mask, - keys_vec); - vtype2::mask_compressstoreu(indexes + left, vtype2::knot_opmask(gt_mask), - indexes_vec); - vtype2::mask_compressstoreu(indexes + right - amount_gt_pivot, gt_mask, - indexes_vec); - *smallest_vec = vtype1::min(keys_vec, *smallest_vec); - *biggest_vec = vtype1::max(keys_vec, *biggest_vec); - return amount_gt_pivot; -} -/* - * Parition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. 
- */ -template -static inline int64_t partition_avx512(type_t1 *keys, type_t2 *indexes, - int64_t left, int64_t right, - type_t1 pivot, type_t1 *smallest, - type_t1 *biggest) { - /* make array length divisible by vtype1::numlanes , shortening the array */ - for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) { - *smallest = std::min(*smallest, keys[left]); - *biggest = std::max(*biggest, keys[left]); - if (keys[left] > pivot) { - right--; - std::swap(keys[left], keys[right]); - std::swap(indexes[left], indexes[right]); - } else { - ++left; - } - } - - if (left == right) - return left; /* less than vtype1::numlanes elements in the array */ - - zmm_t1 pivot_vec = vtype1::set1(pivot); - zmm_t1 min_vec = vtype1::set1(*smallest); - zmm_t1 max_vec = vtype1::set1(*biggest); - - if (right - left == vtype1::numlanes) { - zmm_t1 keys_vec = vtype1::loadu(keys + left); - int32_t amount_gt_pivot; - - zmm_t2 indexes_vec = vtype2::loadu(indexes + left); - amount_gt_pivot = partition_vec( - keys, indexes, left, left + vtype1::numlanes, keys_vec, indexes_vec, - pivot_vec, &min_vec, &max_vec); - - *smallest = vtype1::reducemin(min_vec); - *biggest = vtype1::reducemax(max_vec); - return left + (vtype1::numlanes - amount_gt_pivot); - } - - // first and last vtype1::numlanes values are partitioned at the end - zmm_t1 keys_vec_left = vtype1::loadu(keys + left); - zmm_t1 keys_vec_right = vtype1::loadu(keys + (right - vtype1::numlanes)); - zmm_t2 indexes_vec_left; - zmm_t2 indexes_vec_right; - indexes_vec_left = vtype2::loadu(indexes + left); - indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes)); - - // store points of the vectors - int64_t r_store = right - vtype1::numlanes; - int64_t l_store = left; - // indices for loading the elements - left += vtype1::numlanes; - right -= vtype1::numlanes; - while (right - left != 0) { - zmm_t1 keys_vec; - zmm_t2 indexes_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements 
are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + vtype1::numlanes) - right < left - l_store) { - right -= vtype1::numlanes; - keys_vec = vtype1::loadu(keys + right); - indexes_vec = vtype2::loadu(indexes + right); - } else { - keys_vec = vtype1::loadu(keys + left); - indexes_vec = vtype2::loadu(indexes + left); - left += vtype1::numlanes; - } - // partition the current vector and save it on both sides of the array - int32_t amount_gt_pivot; - - amount_gt_pivot = partition_vec( - keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec, - indexes_vec, pivot_vec, &min_vec, &max_vec); - r_store -= amount_gt_pivot; - l_store += (vtype1::numlanes - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - int32_t amount_gt_pivot; - amount_gt_pivot = partition_vec( - keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec_left, - indexes_vec_left, pivot_vec, &min_vec, &max_vec); - l_store += (vtype1::numlanes - amount_gt_pivot); - amount_gt_pivot = partition_vec( - keys, indexes, l_store, l_store + vtype1::numlanes, keys_vec_right, - indexes_vec_right, pivot_vec, &min_vec, &max_vec); - l_store += (vtype1::numlanes - amount_gt_pivot); - *smallest = vtype1::reducemin(min_vec); - *biggest = vtype1::reducemax(max_vec); - return l_store; -} #endif // AVX512_QSORT_COMMON diff --git a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp index ec436bb49eee6..67d6285cea552 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp +++ b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp @@ -24,6 +24,7 @@ * */ +#pragma GCC target("avx512dq", "avx512f") #include "avx512-32bit-qsort.hpp" #include "avx512-64bit-qsort.hpp" diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index a77e025602289..d5c7953d51492 100644 --- 
a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -36,7 +36,6 @@ import org.openjdk.jmh.annotations.Level; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; - import java.util.Arrays; import java.util.Random; import java.util.concurrent.TimeUnit; @@ -51,15 +50,9 @@ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MICROSECONDS) @State(Scope.Thread) -@Warmup(iterations = 3, time=60) -@Measurement(iterations = 3, time=120) @Fork(value = 1) public class ArraysSort { - - @Param({"10","25","50","75","100", "1000", "10000", "100000"}) - private int size; - private int[] ints_unsorted; private long[] longs_unsorted; private float[] floats_unsorted; @@ -71,8 +64,7 @@ public class ArraysSort { private double[] doubles_sorted; - @Setup - public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { + public void initialize(int size) { Random rnd = new Random(42); ints_unsorted = new int[size]; @@ -98,7 +90,7 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException, } @Setup(Level.Invocation) - public void init() { + public void clear() { ints_sorted = ints_unsorted.clone(); longs_sorted = longs_unsorted.clone(); floats_sorted = floats_unsorted.clone(); @@ -129,4 +121,52 @@ public double[] doubleSort() throws Throwable { return doubles_sorted; } + @Warmup(iterations = 3, time=2) + @Measurement(iterations = 3, time=5) + public static class Small extends ArraysSort { + @Param({"10","25","50","75","100"}) + private int size; + + @Setup + public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { + initialize(size); + } + } + + @Warmup(iterations = 3, time=2) + @Measurement(iterations = 3, time=5) + public static class Medium extends ArraysSort { + @Param({"1000", "10000"}) + private int size; + + @Setup + public void setup() throws 
UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { + initialize(size); + } + } + + @Warmup(iterations = 3, time=20) + @Measurement(iterations = 3, time=30) + public static class Large extends ArraysSort { + @Param({"50000", "100000"}) + private int size; + + @Setup + public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { + initialize(size); + } + } + + @Warmup(iterations = 3, time=120) + @Measurement(iterations = 3, time=30) + public static class VeryLarge extends ArraysSort { + @Param({"1000000"}) + private int size; + + @Setup + public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { + initialize(size); + } + } + } From e0ffc81de2d8a6c61d8ad4d6591de7dc3358686a Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 4 Aug 2023 15:48:27 -0700 Subject: [PATCH 15/40] More avx512 sort cleanups --- .../libavx512_x86_64/avx512-32bit-qsort.hpp | 66 ------ .../libavx512_x86_64/avx512-64bit-common.h | 192 ------------------ 2 files changed, 258 deletions(-) diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp index 2b4525b22a4ce..663a885305c2c 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp @@ -102,64 +102,6 @@ struct zmm_vector { static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); } }; template <> -struct zmm_vector { - using type_t = uint32_t; - using zmm_t = __m512i; - using ymm_t = __m256i; - using opmask_t = __mmask16; - static const uint8_t numlanes = 16; - - static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { - return _mm512_set1_epi32(type_max()); - } // TODO: this should broadcast bits as is? 
- - template - static ymm_t i64gather(__m512i index, void const *base) { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t merge(ymm_t y1, ymm_t y2) { - zmm_t z1 = _mm512_castsi256_si512(y1); - return _mm512_inserti32x8(z1, y2, 1); - } - static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } - static opmask_t ge(zmm_t x, zmm_t y) { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu32(v); } - static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template - static zmm_t shuffle(zmm_t zmm) { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); } -}; -template <> struct zmm_vector { using type_t = float; using zmm_t = __m512; @@ -532,14 +474,6 @@ void avx512_qsort(int32_t *arr, int64_t arrsize) { } } -template <> -void avx512_qsort(uint32_t *arr, int64_t arrsize) { - if 
(arrsize > 1) { - qsort_32bit_, uint32_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); - } -} - template <> void avx512_qsort(float *arr, int64_t arrsize) { if (arrsize > 1) { diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h index b8b17c68afbdf..527b4351eb7e7 100644 --- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h +++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h @@ -41,198 +41,6 @@ #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 #define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 -template <> -struct ymm_vector { - using type_t = float; - using zmm_t = __m256; - using zmmi_t = __m256i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() { return X86_SIMD_SORT_INFINITYF; } - static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; } - static zmm_t zmm_max() { return _mm256_set1_ps(type_max()); } - - static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, - int v8) { - return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); - } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) { - return _kxor_mask8(x, y); - } - static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(zmm_t x, zmm_t y) { - return _mm256_cmp_ps_mask(x, y, _CMP_LE_OQ); - } - static opmask_t ge(zmm_t x, zmm_t y) { - return _mm256_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - static opmask_t eq(zmm_t x, zmm_t y) { - return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ); - } - template - static opmask_t fpclass(zmm_t x) { - return _mm256_fpclass_ps_mask(x, type); - } - template - static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, - void const *base) { - return _mm512_mask_i64gather_ps(src, mask, index, base, scale); - } - template - static zmm_t i64gather(__m512i index, void const *base) { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t 
loadu(void const *mem) { - return _mm256_loadu_ps((float *)mem); - } - static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_ps(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { - return _mm256_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t maskz_loadu(opmask_t mask, void const *mem) { - return _mm256_maskz_loadu_ps(mask, mem); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { - return _mm256_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { - return _mm256_mask_mov_ps(x, mask, y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { - return _mm256_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_ps(x, y); } - static zmm_t permutexvar(__m256i idx, zmm_t zmm) { - return _mm256_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) { - __m128 v128 = - _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1)); - __m128 v64 = _mm_max_ps( - v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2))); - __m128 v32 = - _mm_max_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); - return _mm_cvtss_f32(v32); - } - static type_t reducemin(zmm_t v) { - __m128 v128 = - _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1)); - __m128 v64 = _mm_min_ps( - v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2))); - __m128 v32 = - _mm_min_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); - return _mm_cvtss_f32(v32); - } - static zmm_t set1(type_t v) { return _mm256_set1_ps(v); } - template - static zmm_t shuffle(zmm_t zmm) { - /* Hack!: have to make shuffles within 128-bit lanes work for both - * 32-bit and 64-bit */ - return _mm256_shuffle_ps(zmm, zmm, 0b10110001); - // if constexpr (mask == 0b01010101) { - // } - // else { - // /* Not used, so far */ - // return _mm256_shuffle_ps(zmm, zmm, mask); - // } - } - static void storeu(void *mem, zmm_t x) { - 
_mm256_storeu_ps((float *)mem, x); - } -}; -template <> -struct ymm_vector { - using type_t = int32_t; - using zmm_t = __m256i; - using zmmi_t = __m256i; - using opmask_t = __mmask8; - static const uint8_t numlanes = 8; - - static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; } - static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; } - static zmm_t zmm_max() { - return _mm256_set1_epi32(type_max()); - } // TODO: this should broadcast bits as is? - - static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, - int v8) { - return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); - } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) { - return _kxor_mask8(x, y); - } - static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(zmm_t x, zmm_t y) { - return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_LE); - } - static opmask_t ge(zmm_t x, zmm_t y) { - return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - static opmask_t eq(zmm_t x, zmm_t y) { - return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ); - } - template - static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index, - void const *base) { - return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); - } - template - static zmm_t i64gather(__m512i index, void const *base) { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t loadu(void const *mem) { - return _mm256_loadu_si256((__m256i *)mem); - } - static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) { - return _mm256_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t maskz_loadu(opmask_t mask, void const *mem) { - return _mm256_maskz_loadu_epi32(mask, mem); - } - static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) { - return _mm256_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) { - return _mm256_mask_mov_epi32(x, mask, 
y); - } - static void mask_storeu(void *mem, opmask_t mask, zmm_t x) { - return _mm256_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epi32(x, y); } - static zmm_t permutexvar(__m256i idx, zmm_t zmm) { - return _mm256_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { - __m128i v128 = _mm_max_epi32(_mm256_castsi256_si128(v), - _mm256_extracti128_si256(v, 1)); - __m128i v64 = _mm_max_epi32( - v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); - __m128i v32 = - _mm_max_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); - return (type_t)_mm_cvtsi128_si32(v32); - } - static type_t reducemin(zmm_t v) { - __m128i v128 = _mm_min_epi32(_mm256_castsi256_si128(v), - _mm256_extracti128_si256(v, 1)); - __m128i v64 = _mm_min_epi32( - v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2))); - __m128i v32 = - _mm_min_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); - return (type_t)_mm_cvtsi128_si32(v32); - } - static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); } - template - static zmm_t shuffle(zmm_t zmm) { - /* Hack!: have to make shuffles within 128-bit lanes work for both - * 32-bit and 64-bit */ - return _mm256_shuffle_epi32(zmm, 0b10110001); - } - static void storeu(void *mem, zmm_t x) { - _mm256_storeu_si256((__m256i *)mem, x); - } -}; template <> struct zmm_vector { using type_t = int64_t; From 13f4aaf45bd3b504f4cf59ab75fa0db0f9dd1c93 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 4 Aug 2023 16:13:45 -0700 Subject: [PATCH 16/40] Change name from libavx512_x86_64 to libx86_64 --- make/modules/java.base/Lib.gmk | 6 +++--- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 18 +++++++++--------- .../gc/shenandoah/c2/shenandoahSupport.cpp | 4 ++-- .../avx512-32bit-qsort.hpp | 0 .../avx512-64bit-common.h | 0 .../avx512-64bit-qsort.hpp | 0 .../avx512-common-qsort.h | 0 .../avxsort_linux_x86.cpp | 0 8 files changed, 14 insertions(+), 14 deletions(-) rename 
src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-32bit-qsort.hpp (100%) rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-64bit-common.h (100%) rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-64bit-qsort.hpp (100%) rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-common-qsort.h (100%) rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avxsort_linux_x86.cpp (100%) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 1e86c2541e59a..85a86372dbf1f 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -237,8 +237,8 @@ endif ################################################################################ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) - $(eval $(call SetupJdkLibrary, BUILD_LIBAVX512_X86_64, \ - NAME := avx512_x86_64, \ + $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \ + NAME := x86_64, \ OPTIMIZATION := HIGH, \ CFLAGS := $(CFLAGS_JDKLIB), \ CXXFLAGS := $(CXXFLAGS_JDKLIB), \ @@ -249,7 +249,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) LIBS_linux := -lc -lm -ldl, \ )) - TARGETS += $(BUILD_LIBAVX512_X86_64) + TARGETS += $(BUILD_LIB_X86_64) endif ################################################################################ diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 0b9f91ecc97bd..2a7170224c6b5 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4131,28 +4131,28 @@ void StubGenerator::generate_compiler_stubs() { } // Get avx512 sort stub routine addresses - void *libavx512_x86_64 = nullptr; + void *lib_x86_64 = nullptr; char ebuf_avx512[1024]; char dll_name_avx512[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), 
"avx512_x86_64")) { - libavx512_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512); + if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) { + lib_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512); } - if (libavx512_x86_64 != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "avx512_x86_64" JNI_LIB_SUFFIX, p2i(libavx512_x86_64)); + if (lib_x86_64 != nullptr) { + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(lib_x86_64)); if (UseAVX > 2 && VM_Version::supports_avx512dq()) { snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_int"); - StubRoutines::_arraysort_int = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + StubRoutines::_arraysort_int = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_long"); - StubRoutines::_arraysort_long = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + StubRoutines::_arraysort_long = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_float"); - StubRoutines::_arraysort_float = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + StubRoutines::_arraysort_float = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_double"); - StubRoutines::_arraysort_double = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512); + StubRoutines::_arraysort_double = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); } } diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp index 10b2fe13ab529..27e854028d7f8 100644 --- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp +++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp @@ -388,8 +388,8 @@ void ShenandoahBarrierC2Support::verify(RootNode* 
root) { } args[6]; } calls[] = { "arraysort_stub", - { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { TypeFunc::Parms+2, ShenandoahLoad }, - { TypeFunc::Parms+4, ShenandoahLoad }, { TypeFunc::Parms+5, ShenandoahLoad } }, + { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { -1, ShenandoahNone }, + { -1, ShenandoahNone }, { -1, ShenandoahNone } }, "aescrypt_encryptBlock", { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { TypeFunc::Parms+2, ShenandoahLoad }, { -1, ShenandoahNone}, { -1, ShenandoahNone}, { -1, ShenandoahNone} }, diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp rename to src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h similarity index 100% rename from src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h rename to src/java.base/linux/native/libx86_64/avx512-64bit-common.h diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp rename to src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h similarity index 100% rename from src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h rename to src/java.base/linux/native/libx86_64/avx512-common-qsort.h diff --git a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp 
b/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp similarity index 100% rename from src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp rename to src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp From c49657ee2809205a08489dba1f2421b310d9936b Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Mon, 7 Aug 2023 13:56:09 -0700 Subject: [PATCH 17/40] change names from avx512 to x86_64 --- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 26 ++++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 2a7170224c6b5..c39d237a8f0af 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4131,28 +4131,28 @@ void StubGenerator::generate_compiler_stubs() { } // Get avx512 sort stub routine addresses - void *lib_x86_64 = nullptr; - char ebuf_avx512[1024]; + void *libx86_64 = nullptr; + char ebuf_x86_64[1024]; char dll_name_avx512[JVM_MAXPATHLEN]; if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) { - lib_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512); + libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64); } - if (lib_x86_64 != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(lib_x86_64)); + if (libx86_64 != nullptr) { + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64)); if (UseAVX > 2 && VM_Version::supports_avx512dq()) { - snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_int"); - StubRoutines::_arraysort_int = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int"); + StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64); - snprintf(ebuf_avx512, 
sizeof(ebuf_avx512), "avx512_sort_long"); - StubRoutines::_arraysort_long = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_long"); + StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64); - snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_float"); - StubRoutines::_arraysort_float = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_float"); + StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64); - snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_double"); - StubRoutines::_arraysort_double = (address)os::dll_lookup(lib_x86_64, ebuf_avx512); + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double"); + StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64); } } From 58467994ce7a136ecddfa1ea296234ecc0385753 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 11 Aug 2023 15:03:22 -0700 Subject: [PATCH 18/40] Fix signature for Shenandoah support --- src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp index 27e854028d7f8..98c27b337ac8e 100644 --- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp +++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp @@ -388,8 +388,7 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) { } args[6]; } calls[] = { "arraysort_stub", - { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { -1, ShenandoahNone }, - { -1, ShenandoahNone }, { -1, ShenandoahNone } }, + { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone } }, "aescrypt_encryptBlock", { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { TypeFunc::Parms+2, ShenandoahLoad 
}, { -1, ShenandoahNone}, { -1, ShenandoahNone}, { -1, ShenandoahNone} }, From 07349ec35d5c56777131e364bddb927117b9f46c Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 15 Aug 2023 12:09:45 -0700 Subject: [PATCH 19/40] Fix preservation of NaNs for floats and doubles --- .../native/libx86_64/avx512-32bit-qsort.hpp | 34 ++----------------- .../native/libx86_64/avx512-64bit-common.h | 24 ------------- .../native/libx86_64/avx512-64bit-qsort.hpp | 6 ++-- .../openjdk/bench/java/util/ArraysSort.java | 2 +- 4 files changed, 7 insertions(+), 59 deletions(-) diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp index 663a885305c2c..c07caf991ea8e 100644 --- a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp @@ -438,34 +438,6 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, qsort_32bit_(arr, pivot_index, right, max_iters - 1); } - - -X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) { - int64_t nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { - loadmask = (0x0001 << arrsize) - 0x0001; - } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize, - int64_t nan_count) { - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = std::nanf("1"); - nan_count -= 1; - } -} - - template <> void avx512_qsort(int32_t *arr, int64_t arrsize) { if (arrsize > 1) { @@ -476,11 +448,11 @@ void avx512_qsort(int32_t *arr, int64_t arrsize) { template <> void avx512_qsort(float *arr, int64_t arrsize) { + int64_t idx_last_elem_not_nan 
= move_nans_to_end_of_array(arr, arrsize); + arrsize = idx_last_elem_not_nan + 1; if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_32bit_, float>(arr, 0, arrsize - 1, + qsort_32bit_, float>(arr, 0, idx_last_elem_not_nan, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); } } diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h index 527b4351eb7e7..2d07cf1984859 100644 --- a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h +++ b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h @@ -182,31 +182,7 @@ struct zmm_vector { } static void storeu(void *mem, zmm_t x) { _mm512_storeu_pd(mem, x); } }; -X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, - int64_t arrsize) { - int64_t nan_count = 0; - __mmask8 loadmask = 0xFF; - while (arrsize > 0) { - if (arrsize < 8) { - loadmask = (0x01 << arrsize) - 0x01; - } - __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr); - __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((int32_t)nanmask); - _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE); - arr += 8; - arrsize -= 8; - } - return nan_count; -} -X86_SIMD_SORT_INLINE void replace_inf_with_nan(double *arr, int64_t arrsize, - int64_t nan_count) { - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = std::nan("1"); - nan_count -= 1; - } -} /* * Assumes zmm is random and performs a full sorting network defined in * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp index 45497e268a3c5..da07dc51427a4 100644 --- a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp @@ -763,11 +763,11 @@ void avx512_qsort(int64_t *arr, int64_t 
arrsize) { template <> void avx512_qsort(double *arr, int64_t arrsize) { + int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize); + arrsize = idx_last_elem_not_nan + 1; if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf(arr, arrsize); - qsort_64bit_, double>(arr, 0, arrsize - 1, + qsort_64bit_, double>(arr, 0, idx_last_elem_not_nan, 2 * (int64_t)log2(arrsize)); - replace_inf_with_nan(arr, arrsize, nan_count); } } #endif // AVX512_QSORT_64BIT diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index d5c7953d51492..fb5b2f874ee88 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -145,7 +145,7 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException, } } - @Warmup(iterations = 3, time=20) + @Warmup(iterations = 3, time=40) @Measurement(iterations = 3, time=30) public static class Large extends ArraysSort { @Param({"50000", "100000"}) From 9153059a4df51a3661aaa719201e1044a80c30fc Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 22 Aug 2023 16:02:22 -0700 Subject: [PATCH 20/40] Decomposed DPQS using AVX512 partitioning and AVX512 sort (for small arrays). Works for serial and parallel sort. 
--- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 14 +- src/hotspot/share/classfile/vmIntrinsics.hpp | 7 +- .../gc/shenandoah/c2/shenandoahSupport.cpp | 3 + src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 4 + src/hotspot/share/opto/c2compiler.cpp | 1 + src/hotspot/share/opto/escape.cpp | 1 + src/hotspot/share/opto/library_call.cpp | 44 +- src/hotspot/share/opto/library_call.hpp | 1 + src/hotspot/share/opto/runtime.cpp | 21 + src/hotspot/share/opto/runtime.hpp | 1 + src/hotspot/share/runtime/stubRoutines.cpp | 24 +- src/hotspot/share/runtime/stubRoutines.hpp | 5 + .../native/libx86_64/avx512-32bit-qsort.hpp | 20 +- .../native/libx86_64/avx512-64bit-common.h | 6 + .../native/libx86_64/avx512-64bit-qsort.hpp | 15 +- .../native/libx86_64/avx512-common-qsort.h | 135 +- .../native/libx86_64/avxsort_linux_x86.cpp | 36 +- .../share/classes/java/util/Arrays.java | 65 +- .../classes/java/util/DualPivotQuicksort.java | 1308 ++++++++++------- .../openjdk/bench/java/util/ArraysSort.java | 73 +- 20 files changed, 1150 insertions(+), 634 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index c39d237a8f0af..11936ac764126 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4130,7 +4130,7 @@ void StubGenerator::generate_compiler_stubs() { = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); } - // Get avx512 sort stub routine addresses + // Get addresses for avx512 sort and partition routines void *libx86_64 = nullptr; char ebuf_x86_64[1024]; char dll_name_avx512[JVM_MAXPATHLEN]; @@ -4153,6 +4153,18 @@ void StubGenerator::generate_compiler_stubs() { snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double"); StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_int"); + StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64, 
ebuf_x86_64); + + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_long"); + StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_float"); + StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_double"); + StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64); } } diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index aa9f9660bbba4..9fce2446aea19 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -343,7 +343,12 @@ class methodHandle; \ do_intrinsic(_arraySort, java_util_Arrays, arraySort_name, arraySort_signature, F_S) \ do_name( arraySort_name, "arraySort") \ - do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII)V") \ + do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)V") \ + \ + do_intrinsic(_arrayPartition, java_util_Arrays, arrayPartition_name, arrayPartition_signature, F_S) \ + do_name(arrayPartition_name, "arrayPartition") \ + do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V") \ + \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ do_name( copyOfRange_name, "copyOfRange") \ diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp index 98c27b337ac8e..9a98ec9cd529d 100644 --- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp +++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp @@ -387,6 +387,9 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) { verify_type t; } args[6]; } calls[] = { + "array_partition_stub", + { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, 
ShenandoahStore }, { -1, ShenandoahNone }, + { -1, ShenandoahNone }, { -1, ShenandoahNone } }, "arraysort_stub", { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone } }, "aescrypt_encryptBlock", diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index 86753a501d2e7..d28ff16b13f17 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -331,6 +331,10 @@ static_field(StubRoutines, _arraysort_long, address) \ static_field(StubRoutines, _arraysort_float, address) \ static_field(StubRoutines, _arraysort_double, address) \ + static_field(StubRoutines, _array_partition_int, address) \ + static_field(StubRoutines, _array_partition_long, address) \ + static_field(StubRoutines, _array_partition_float, address) \ + static_field(StubRoutines, _array_partition_double, address) \ \ static_field(StubRoutines, _aescrypt_encryptBlock, address) \ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index ac2c27cb248dc..7f4e8ee769625 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -569,6 +569,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) { case vmIntrinsics::_max_strict: case vmIntrinsics::_arraycopy: case vmIntrinsics::_arraySort: + case vmIntrinsics::_arrayPartition: case vmIntrinsics::_indexOfL: case vmIntrinsics::_indexOfU: case vmIntrinsics::_indexOfUL: diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp index eed159ff6c8e7..6c165b5ee813e 100644 --- a/src/hotspot/share/opto/escape.cpp +++ b/src/hotspot/share/opto/escape.cpp @@ -1191,6 +1191,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "bigIntegerLeftShiftWorker") == 0 || strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0 
|| strcmp(call->as_CallLeaf()->_name, "arraysort_stub") == 0 || + strcmp(call->as_CallLeaf()->_name, "array_partition_stub") == 0 || strcmp(call->as_CallLeaf()->_name, "get_class_id_intrinsic") == 0) ))) { call->dump(); diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 132835ba4aff8..10a8734bc1a8a 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -292,7 +292,8 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_arraycopy: return inline_arraycopy(); - case vmIntrinsics::_arraySort: return inline_arraysort(); + case vmIntrinsics::_arraySort: return inline_arraysort(); + case vmIntrinsics::_arrayPartition: return inline_array_partition(); case vmIntrinsics::_compareToL: return inline_string_compareTo(StrIntrinsicNode::LL); case vmIntrinsics::_compareToU: return inline_string_compareTo(StrIntrinsicNode::UU); @@ -5194,6 +5195,47 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_ uncommon_trap_call->set_req(0, top()); // not used anymore, kill it } +//------------------------------inline_array_partition----------------------- +bool LibraryCallKit::inline_array_partition() { + + address stubAddr = nullptr; + const char *stubName; + stubName = "array_partition_stub"; + + Node* elementType = null_check(argument(0)); + Node* obj = argument(1); + Node* offset = argument(2); + Node* fromIndex = argument(4); + Node* toIndex = argument(5); + Node* pivot_indices = argument(6); + Node* pivot_offset = argument(7); + Node* isDualPivot = argument(9); + + const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); + ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); + BasicType bt = elem_type->basic_type(); + stubAddr = StubRoutines::select_array_partition_function(bt); + if (stubAddr == nullptr) return false; + + const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr(); + if (obj_t == 
nullptr || obj_t->elem() == Type::BOTTOM ) { + return false; // failed input validation + } + + Node* obj_adr = make_unsafe_address(obj, offset); + + pivot_indices = must_be_not_null(pivot_indices, true); + Node* pivot_indices_adr = make_unsafe_address(pivot_indices, pivot_offset); //this offset is not same as array offset + + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + obj_adr, fromIndex, toIndex, pivot_indices_adr, isDualPivot); + + return true; +} + + //------------------------------inline_arraysort----------------------- bool LibraryCallKit::inline_arraysort() { diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index 53d697f6b2078..4cc9c56afca80 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -280,6 +280,7 @@ class LibraryCallKit : public GraphKit { void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp, uint new_idx); bool inline_arraysort(); + bool inline_array_partition(); typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind; bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind); bool inline_unsafe_fence(vmIntrinsics::ID id); diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 3555d0cf31af3..a0e383d95afd0 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -857,6 +857,27 @@ const TypeFunc* OptoRuntime::array_fill_Type() { return TypeFunc::make(domain, range); } +const TypeFunc* OptoRuntime::array_partition_Type() { + // create input type (domain) + int num_args = 5; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // array + fields[argp++] = 
TypeInt::INT; // low + fields[argp++] = TypeInt::INT; // end + fields[argp++] = TypePtr::NOTNULL; // pivot_indices (int array) + fields[argp++] = TypeInt::BOOL; // isDualPivot + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // no result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = nullptr; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + const TypeFunc* OptoRuntime::array_sort_Type() { // create input type (domain) int num_args = 3; diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp index e4d5f749d3efa..b85542423e848 100644 --- a/src/hotspot/share/opto/runtime.hpp +++ b/src/hotspot/share/opto/runtime.hpp @@ -269,6 +269,7 @@ class OptoRuntime : public AllStatic { static const TypeFunc* array_fill_Type(); static const TypeFunc* array_sort_Type(); + static const TypeFunc* array_partition_Type(); static const TypeFunc* aescrypt_block_Type(); static const TypeFunc* cipherBlockChaining_aescrypt_Type(); static const TypeFunc* electronicCodeBook_aescrypt_Type(); diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 5a3b6168ab122..84e43c52ec3a2 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -176,10 +176,14 @@ address StubRoutines::_hf2f = nullptr; address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; address StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; -address StubRoutines::_arraysort_int = nullptr; -address StubRoutines::_arraysort_long = nullptr; -address StubRoutines::_arraysort_float = nullptr; -address StubRoutines::_arraysort_double = nullptr; +address StubRoutines::_arraysort_int = nullptr; 
+address StubRoutines::_arraysort_long = nullptr; +address StubRoutines::_arraysort_float = nullptr; +address StubRoutines::_arraysort_double = nullptr; +address StubRoutines::_array_partition_int = nullptr; +address StubRoutines::_array_partition_long = nullptr; +address StubRoutines::_array_partition_float = nullptr; +address StubRoutines::_array_partition_double = nullptr; address StubRoutines::_cont_thaw = nullptr; address StubRoutines::_cont_returnBarrier = nullptr; @@ -665,3 +669,15 @@ address StubRoutines::select_arraysort_function(BasicType t) { return nullptr; } } + +address StubRoutines::select_array_partition_function(BasicType t) { + switch(t) { + case T_INT: return _array_partition_int; + case T_LONG: return _array_partition_long; + case T_FLOAT: return _array_partition_float; + case T_DOUBLE: return _array_partition_double; + default: + ShouldNotReachHere(); + return nullptr; + } +} diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index b58287d6b5cd3..a835169a7d60a 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -157,6 +157,10 @@ class StubRoutines: AllStatic { static address _arraysort_long; static address _arraysort_float; static address _arraysort_double; + static address _array_partition_int; + static address _array_partition_long; + static address _array_partition_float; + static address _array_partition_double; // Leaf routines which implement arraycopy and their addresses // arraycopy operands aligned on element type boundary static address _jbyte_arraycopy; @@ -378,6 +382,7 @@ class StubRoutines: AllStatic { static address generic_arraycopy() { return _generic_arraycopy; } static address select_arraysort_function(BasicType t); + static address select_array_partition_function(BasicType t); static address jbyte_fill() { return _jbyte_fill; } static address jshort_fill() { return _jshort_fill; } diff --git 
a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp index c07caf991ea8e..bc1258debd389 100644 --- a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp @@ -61,6 +61,9 @@ struct zmm_vector { static opmask_t ge(zmm_t x, zmm_t y) { return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); } + static opmask_t gt(zmm_t x, zmm_t y) { + return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_GT); + } template static ymm_t i64gather(__m512i index, void const *base) { return _mm512_i64gather_epi32(index, base, scale); @@ -117,6 +120,9 @@ struct zmm_vector { static opmask_t ge(zmm_t x, zmm_t y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); } + static opmask_t gt(zmm_t x, zmm_t y) { + return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); + } template static ymm_t i64gather(__m512i index, void const *base) { return _mm512_i64gather_ps(index, base, scale); @@ -431,7 +437,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512_unrolled( - arr, left, right + 1, pivot, &smallest, &biggest); + arr, left, right + 1, pivot, &smallest, &biggest, false); if (pivot != smallest) qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); if (pivot != biggest) @@ -439,19 +445,19 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, } template <> -void avx512_qsort(int32_t *arr, int64_t arrsize) { +inline void avx512_qsort(int32_t *arr, int64_t fromIndex, int64_t toIndex) { + int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { - qsort_32bit_, int32_t>(arr, 0, arrsize - 1, + qsort_32bit_, int32_t>(arr, fromIndex, toIndex - 1, 2 * (int64_t)log2(arrsize)); } } template <> -void avx512_qsort(float *arr, int64_t arrsize) { - int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize); - arrsize = idx_last_elem_not_nan 
+ 1; +inline void avx512_qsort(float *arr, int64_t fromIndex, int64_t toIndex) { + int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { - qsort_32bit_, float>(arr, 0, idx_last_elem_not_nan, + qsort_32bit_, float>(arr, fromIndex, toIndex - 1, 2 * (int64_t)log2(arrsize)); } } diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h index 2d07cf1984859..2c3bfd97e1960 100644 --- a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h +++ b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h @@ -70,6 +70,9 @@ struct zmm_vector { static opmask_t ge(zmm_t x, zmm_t y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); } + static opmask_t gt(zmm_t x, zmm_t y) { + return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_GT); + } static opmask_t eq(zmm_t x, zmm_t y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); } @@ -139,6 +142,9 @@ struct zmm_vector { static opmask_t ge(zmm_t x, zmm_t y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); } + static opmask_t gt(zmm_t x, zmm_t y) { + return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); + } static opmask_t eq(zmm_t x, zmm_t y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); } diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp index da07dc51427a4..61f618f657049 100644 --- a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp @@ -746,7 +746,7 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512_unrolled( - arr, left, right + 1, pivot, &smallest, &biggest); + arr, left, right + 1, pivot, &smallest, &biggest, false); if (pivot != smallest) qsort_64bit_(arr, left, pivot_index - 1, max_iters - 1); if (pivot != biggest) @@ -754,20 +754,21 @@ static void qsort_64bit_(type_t 
*arr, int64_t left, int64_t right, } template <> -void avx512_qsort(int64_t *arr, int64_t arrsize) { +inline void avx512_qsort(int64_t *arr, int64_t fromIndex, int64_t toIndex) { + int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { - qsort_64bit_, int64_t>(arr, 0, arrsize - 1, + qsort_64bit_, int64_t>(arr, fromIndex, toIndex - 1, 2 * (int64_t)log2(arrsize)); } } template <> -void avx512_qsort(double *arr, int64_t arrsize) { - int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize); - arrsize = idx_last_elem_not_nan + 1; +inline void avx512_qsort(double *arr, int64_t fromIndex, int64_t toIndex) { + int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { - qsort_64bit_, double>(arr, 0, idx_last_elem_not_nan, + qsort_64bit_, double>(arr, fromIndex, toIndex - 1, 2 * (int64_t)log2(arrsize)); } } + #endif // AVX512_QSORT_64BIT diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h index f2ad8b039070b..b1a53a054692f 100644 --- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h @@ -26,7 +26,7 @@ */ // This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) - +#include #ifndef AVX512_QSORT_COMMON #define AVX512_QSORT_COMMON @@ -116,9 +116,18 @@ template struct ymm_vector; // Regular quicksort routines: +template +void avx512_dual_pivot_partition(T *arr, int64_t low, int64_t high, int32_t *pivot_indices, bool isDualPivot); + +template +void avx512_single_pivot_partition(T *arr, int64_t low, int64_t high, int32_t *pivot_indices, bool isDualPivot); + template void avx512_qsort(T *arr, int64_t arrsize); +template +void inline avx512_qsort(T *arr, int64_t from_index, int64_t to_index); + template bool is_a_nan(T elem) { return std::isnan(elem); @@ -146,10 +155,15 @@ int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) { } template -bool comparison_func(const T &a, 
const T &b) { +bool comparison_func_ge(const T &a, const T &b) { return a < b; } +template +bool comparison_func_gt(const T &a, const T &b) { + return a <= b; +} + /* * COEX == Compare and Exchange two registers by swapping min and max values */ @@ -173,13 +187,16 @@ static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) { template static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right, const zmm_t curr_vec, const zmm_t pivot_vec, - zmm_t *smallest_vec, zmm_t *biggest_vec) { + zmm_t *smallest_vec, zmm_t *biggest_vec, bool use_gt) { /* which elements are larger than or equal to the pivot */ - typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); - int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask); - vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(ge_mask), + typename vtype::opmask_t mask; + if (use_gt) mask = vtype::gt(curr_vec, pivot_vec); + else mask = vtype::ge(curr_vec, pivot_vec); + //mask = vtype::ge(curr_vec, pivot_vec); + int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)mask); + vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(mask), curr_vec); - vtype::mask_compressstoreu(arr + right - amount_ge_pivot, ge_mask, + vtype::mask_compressstoreu(arr + right - amount_ge_pivot, mask, curr_vec); *smallest_vec = vtype::min(curr_vec, *smallest_vec); *biggest_vec = vtype::max(curr_vec, *biggest_vec); @@ -192,12 +209,13 @@ static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right, template static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right, type_t pivot, type_t *smallest, - type_t *biggest) { + type_t *biggest, bool use_gt) { + auto comparison_func = use_gt ? 
comparison_func_gt : comparison_func_ge; /* make array length divisible by vtype::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { - *smallest = std::min(*smallest, arr[left], comparison_func); - *biggest = std::max(*biggest, arr[left], comparison_func); - if (!comparison_func(arr[left], pivot)) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { std::swap(arr[left], arr[--right]); } else { ++left; @@ -216,7 +234,7 @@ static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right, zmm_t vec = vtype::loadu(arr + left); int32_t amount_ge_pivot = partition_vec(arr, left, left + vtype::numlanes, vec, - pivot_vec, &min_vec, &max_vec); + pivot_vec, &min_vec, &max_vec, use_gt); *smallest = vtype::reducemin(min_vec); *biggest = vtype::reducemax(max_vec); return left + (vtype::numlanes - amount_ge_pivot); @@ -248,7 +266,7 @@ static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right, // partition the current vector and save it on both sides of the array int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, - curr_vec, pivot_vec, &min_vec, &max_vec); + curr_vec, pivot_vec, &min_vec, &max_vec, use_gt); ; r_store -= amount_ge_pivot; l_store += (vtype::numlanes - amount_ge_pivot); @@ -257,11 +275,11 @@ static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right, /* partition and save vec_left and vec_right */ int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, vec_left, - pivot_vec, &min_vec, &max_vec); + pivot_vec, &min_vec, &max_vec, use_gt); l_store += (vtype::numlanes - amount_ge_pivot); amount_ge_pivot = partition_vec(arr, l_store, l_store + vtype::numlanes, vec_right, - pivot_vec, &min_vec, &max_vec); + pivot_vec, &min_vec, &max_vec, use_gt); l_store += (vtype::numlanes - amount_ge_pivot); 
*smallest = vtype::reducemin(min_vec); *biggest = vtype::reducemax(max_vec); @@ -273,18 +291,20 @@ template (arr, left, right, pivot, smallest, - biggest); + biggest, use_gt); } + + auto comparison_func = use_gt ? comparison_func_gt : comparison_func_ge; /* make array length divisible by 8*vtype::numlanes , shortening the array */ for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0; --i) { - *smallest = std::min(*smallest, arr[left], comparison_func); - *biggest = std::max(*biggest, arr[left], comparison_func); - if (!comparison_func(arr[left], pivot)) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { std::swap(arr[left], arr[--right]); } else { ++left; @@ -339,7 +359,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec( arr, l_store, r_store + vtype::numlanes, curr_vec[ii], - pivot_vec, &min_vec, &max_vec); + pivot_vec, &min_vec, &max_vec, use_gt); l_store += (vtype::numlanes - amount_ge_pivot); r_store -= amount_ge_pivot; } @@ -350,7 +370,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, - vec_left[ii], pivot_vec, &min_vec, &max_vec); + vec_left[ii], pivot_vec, &min_vec, &max_vec, use_gt); l_store += (vtype::numlanes - amount_ge_pivot); r_store -= amount_ge_pivot; } @@ -358,7 +378,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, - vec_right[ii], pivot_vec, &min_vec, &max_vec); + vec_right[ii], pivot_vec, &min_vec, &max_vec, use_gt); l_store += (vtype::numlanes - amount_ge_pivot); r_store -= amount_ge_pivot; } @@ 
-367,4 +387,73 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, return l_store; } +// right = to_index (exclusive) +template +static int64_t vectorized_partition(type_t *arr, int64_t left, int64_t right, type_t pivot, bool use_gt) { + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512_unrolled( + arr, left, right, pivot, &smallest, &biggest, use_gt); + return pivot_index; +} + +// partitioning functions +template +void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){ + const int64_t pidx1 = pivot_indices[0]; + const int64_t pidx2 = pivot_indices[1]; + const T pivot1 = arr[pidx1]; + const T pivot2 = arr[pidx2]; + + const int64_t low = from_index; + const int64_t high = to_index; + const int64_t start = low + 1; + const int64_t end = high - 1; + + + std::swap(arr[pidx1], arr[low]); + std::swap(arr[pidx2], arr[end]); + + + const int64_t pivot_index2 = vectorized_partition, T>(arr, start, end, pivot2, true); // use_gt = true + std::swap(arr[end], arr[pivot_index2]); + int64_t upper = pivot_index2; + + const int64_t pivot_index1 = vectorized_partition, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false) + int64_t lower = pivot_index1 - 1; + std::swap(arr[low], arr[lower]); + + pivot_indices[0] = lower; + pivot_indices[1] = upper; +} + +template +void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){ + const int64_t pidx = pivot_indices[0]; + const T pivot = arr[pidx]; + + const int64_t low = from_index; + const int64_t high = to_index; + //const int64_t start = low + 1; + const int64_t end = high - 1; + + + const int64_t pivot_index1 = vectorized_partition, T>(arr, low, high, pivot, false); // use_gt = false (use_ge) + int64_t lower = pivot_index1; + + const int64_t pivot_index2 = vectorized_partition, T>(arr, pivot_index1, high, pivot, true); // use_gt = true + 
int64_t upper = pivot_index2; + + pivot_indices[0] = lower; + pivot_indices[1] = upper; +} + +template +inline void avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) { + if(is_dual_pviot) avx512_dual_pivot_partition(arr, from_index, to_index, pivot_indices); + else avx512_single_pivot_partition(arr, from_index, to_index, pivot_indices); +} + + + #endif // AVX512_QSORT_COMMON diff --git a/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp index 67d6285cea552..aeea98006ce48 100644 --- a/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp +++ b/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp @@ -32,24 +32,36 @@ extern "C" { - DLL_PUBLIC void avx512_sort_int(int32_t *array_fromIndex, int64_t fromIndex, - int64_t toIndex) { - avx512_qsort(array_fromIndex, toIndex - fromIndex); + DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) { + avx512_qsort(array, from_index, to_index); } - DLL_PUBLIC void avx512_sort_long(int64_t *array_fromIndex, int64_t fromIndex, - int64_t toIndex) { - avx512_qsort(array_fromIndex, toIndex - fromIndex); + DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) { + avx512_qsort(array, from_index, to_index); } - DLL_PUBLIC void avx512_sort_float(float *array_fromIndex, int64_t fromIndex, - int64_t toIndex) { - avx512_qsort(array_fromIndex, toIndex - fromIndex); + DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) { + avx512_qsort(array, from_index, to_index); } - DLL_PUBLIC void avx512_sort_double(double *array_fromIndex, int64_t fromIndex, - int64_t toIndex) { - avx512_qsort(array_fromIndex, toIndex - fromIndex); + DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) { + avx512_qsort(array, from_index, to_index); + } + + DLL_PUBLIC void avx512_partition_int(int32_t 
*array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { + avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); + } + + DLL_PUBLIC void avx512_partition_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { + avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); + } + + DLL_PUBLIC void avx512_partition_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { + avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); + } + + DLL_PUBLIC void avx512_partition_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { + avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); } } diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index d5ce85d336074..27f4d38f2e4e2 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -81,7 +81,9 @@ private Arrays() {} /** * Sorts the specified array into ascending numerical order. - * + * While the intrinsic is free to choose its own sorting algorithm, the + * fallback implementation uses either mixed insertion sort or simple + * insertion sort. * * @param elemType the class of the elements of the array to be sorted * @param array the array to be sorted @@ -90,17 +92,36 @@ private Arrays() {} * address pointing to the first element to sort from. * @param fromIndex the index of the first element, inclusive, to be sorted * @param toIndex the index of the last element, exclusive, to be sorted + * @param end the index of the last element for simple insertion sort (in + * the case of mixed insertion sort). In the fallback implementation, + * if end < 0, we use insertion sort else we use mixed insertion sort. 
*/ @IntrinsicCandidate - private static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex) { - switch (array) { - case int[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); - case long[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); - case float[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); - case double[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex); - default -> throw new UnsupportedOperationException( - "arraySort intrinsic not supported for this type: " + elemType); - } + static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex, int end) { + DualPivotQuicksort.smallArraySort(array, fromIndex, toIndex, end); + } + + /** + * Partitions the specified array based on the pivot(s) provided. + * + * @param elemType the class of the elements of the array to be partitioned + * @param array the array to be partitioned + * @param offset the relative offset, in bytes, from the base address of + * the array to partition, otherwise if the array is {@code null}, an absolute + * address pointing to the first element to partition from. + * @param fromIndex the index of the first element, inclusive, to be partitioned + * @param toIndex the index of the last element, exclusive, to be partitioned + * @param pivotIndices the array containing the indices of the pivots. After + * partitioning, this array is updated with the new indices of the pivots. + * @param pivot_offset the offset in bytes pointing to the base address of + * the array used to store the indices of the pivots. 
+ * @param isDualPivot a boolean value to choose between dual pivot + * partitioning and single pivot partitioning + */ + @IntrinsicCandidate + static void arrayPartition(Class elemType, Object array, long offset, int fromIndex, int toIndex, int[] pivotIndices, long pivot_offset, boolean isDualPivot) { + if (isDualPivot) DualPivotQuicksort.partitionDualPivot(array, fromIndex, toIndex, pivotIndices); + else DualPivotQuicksort.partitionSinglePivot(array, fromIndex, toIndex, pivotIndices); } /* @@ -122,8 +143,7 @@ private static void arraySort(Class elemType, Object array, long offset, int * @param a the array to be sorted */ public static void sort(int[] a) { - int offset = Unsafe.ARRAY_INT_BASE_OFFSET; - arraySort(int.class, a, offset, 0, a.length); + DualPivotQuicksort.sort(a, 0, 0, a.length); } /** @@ -147,8 +167,7 @@ public static void sort(int[] a) { */ public static void sort(int[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - int offset = Unsafe.ARRAY_INT_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_INT_INDEX_SCALE); - arraySort(int.class, a, offset, fromIndex, toIndex); + DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); } /** @@ -162,8 +181,7 @@ public static void sort(int[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(long[] a) { - int offset = Unsafe.ARRAY_LONG_BASE_OFFSET; - arraySort(long.class, a, offset, 0, a.length); + DualPivotQuicksort.sort(a, 0, 0, a.length); } /** @@ -187,8 +205,7 @@ public static void sort(long[] a) { */ public static void sort(long[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - int offset = Unsafe.ARRAY_LONG_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_LONG_INDEX_SCALE); - arraySort(long.class, a, offset, fromIndex, toIndex); + DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); } /** @@ -324,8 +341,7 @@ public static void sort(byte[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ 
public static void sort(float[] a) { - int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET; - arraySort(float.class, a, offset, 0, a.length); + DualPivotQuicksort.sort(a, 0, 0, a.length); } /** @@ -357,8 +373,7 @@ public static void sort(float[] a) { */ public static void sort(float[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_FLOAT_INDEX_SCALE); - arraySort(float.class, a, offset, fromIndex, toIndex); + DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); } /** @@ -380,8 +395,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(double[] a) { - int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET; - arraySort(double.class, a, offset, 0, a.length); + DualPivotQuicksort.sort(a, 0, 0, a.length); } /** @@ -413,8 +427,7 @@ public static void sort(double[] a) { */ public static void sort(double[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); - int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_DOUBLE_INDEX_SCALE); - arraySort(double.class, a, offset, fromIndex, toIndex); + DualPivotQuicksort.sort(a, 0, fromIndex, toIndex); } /** diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index 3dcc7fee1f525..7a7a906176ffd 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -27,6 +27,9 @@ import java.util.concurrent.CountedCompleter; import java.util.concurrent.RecursiveTask; +import java.util.Arrays; +import jdk.internal.misc.Unsafe; + /** * This class implements powerful and fully optimized versions, both @@ -137,6 +140,100 @@ private static int getDepth(int parallelism, int size) { return depth; } + /** + * Sorts the specified range of the array using either 
insertion sort + * or mixed insertion sort depending on the value of end. if end < 0, + * we use insertion sort else we use mixed insertion sort. + * + * @param array the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param end the index of the last element for simple insertion sort (in + * the case of mixed insertion sort). If end < 0, we use insertion sort + * else we use mixed insertion sort. + */ + static void smallArraySort(Object array, int low, int high, int end) { + if (end < 0) insertionSort(array, low, high); + else mixedInsertionSort(array, low, end, high); + } + + /** + * Sorts the specified range of the array using insertion sort + * + * @param array the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * + */ + static void insertionSort(Object array, int low, int high) { + switch (array) { + case int[] arr -> insertionSort(arr, low, high); + case long[] arr -> insertionSort(arr, low, high); + case float[] arr -> insertionSort(arr, low, high); + case double[] arr -> insertionSort(arr, low, high); + default -> throw new UnsupportedOperationException(); + } + } + + /** + * Sorts the specified range of the array using mixed insertion sort. 
+ * + * @param array the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param end the index of the last element for simple insertion sort + * + */ + static void mixedInsertionSort(Object array, int low, int end, int high) { + switch (array) { + case int[] arr -> mixedInsertionSort(arr, low, end, high); + case long[] arr -> mixedInsertionSort(arr, low, end, high); + case float[] arr -> mixedInsertionSort(arr, low, end, high); + case double[] arr -> mixedInsertionSort(arr, low, end, high); + default -> throw new UnsupportedOperationException(); + } + } + + /** + * Partitions the specified range of the array using the two pivots specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the two pivots to be used. + * After partitioning, the indices of the pivots are updated as well. + * + */ + static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) { + switch(array) { + case int[] arr -> partitionDualPivot(arr, low, high, pivotIndices); + case long[] arr -> partitionDualPivot(arr, low, high, pivotIndices); + case float[] arr -> partitionDualPivot(arr, low, high, pivotIndices); + case double[] arr -> partitionDualPivot(arr, low, high, pivotIndices); + default -> throw new UnsupportedOperationException(); + } + } + + /** + * Partitions the specified range of the array using a single pivot specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the pivot to be used. 
+ * After partitioning, the indices of the pivots are updated as well. + * + */ + static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) { + switch(array) { + case int[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); + case long[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); + case float[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); + case double[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); + default -> throw new UnsupportedOperationException(); + } + } + /** * Sorts the specified range of the array using parallel merge * sort and/or Dual-Pivot Quicksort. @@ -178,12 +275,14 @@ static void sort(int[] a, int parallelism, int low, int high) { static void sort(Sorter sorter, int[] a, int bits, int low, int high) { while (true) { int end = high - 1, size = high - low; + int[] pivotIndices; + int baseOffset = Unsafe.ARRAY_INT_BASE_OFFSET; /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high); + Arrays.arraySort(int.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -191,7 +290,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(a, low, high); + Arrays.arraySort(int.class, a, baseOffset, low, high, -1); return; } @@ -271,78 +370,19 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { - + boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); + if (isDualPivot) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - int pivot1 = a[e1]; - int pivot2 = a[e5]; - - /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. - */ - a[e1] = a[lower]; - a[e5] = a[upper]; - - /* - * Skip elements, which are less or greater than the pivots. - */ - while (a[++lower] < pivot1); - while (a[--upper] > pivot2); - - /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ - for (int unused = --lower, k = ++upper; --k > lower; ) { - int ak = a[k]; + pivotIndices = new int[] {e1, e5}; + Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; - if (ak < pivot1) { // Move a[k] to the left side - while (lower < k) { - if (a[++lower] >= pivot1) { - if (a[lower] > pivot2) { - a[k] = a[--upper]; - a[upper] = a[lower]; - } else { - a[k] = a[lower]; - } - a[lower] = ak; - break; - } - } - } else if (ak > pivot2) { // Move a[k] to the right side - a[k] = a[--upper]; - a[upper] = ak; - } - } - /* - * Swap the pivots into their final positions. - */ - a[low] = a[lower]; a[lower] = pivot1; - a[end] = a[upper]; a[upper] = pivot2; /* * Sort non-left parts recursively (possibly in parallel), @@ -362,73 +402,186 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. 
* This value is inexpensive approximation of the median. */ - int pivot = a[e3]; - + pivotIndices = new int[] {e3, e3}; + Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. + * Sort the right part (possibly in parallel), excluding + * known pivot. All elements from the central part are + * equal and therefore already sorted. */ - a[e3] = a[lower]; + if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { + sorter.forkSorter(bits | 1, upper, high); + } else { + sort(sorter, a, bits | 1, upper, high); + } + } + high = lower; // Iterate along the left part + } + } - /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? | == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ - for (int k = ++upper; --k > lower; ) { - int ak = a[k]; + /** + * Partitions the specified range of the array using the two pivots specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the two pivots to be used. + * After partitioning, this array the indices of the pivots is updated as well. 
+ * + */ + private static void partitionDualPivot(int[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; - if (ak != pivot) { - a[k] = pivot; + int e1 = pivotIndices[0]; + int e5 = pivotIndices[1]; + int pivot1 = a[e1]; + int pivot2 = a[e5]; - if (ak < pivot) { // Move a[k] to the left side - while (a[++lower] < pivot); + /* + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ + a[e1] = a[lower]; + a[e5] = a[upper]; - if (a[lower] > pivot) { - a[--upper] = a[lower]; - } - a[lower] = ak; - } else { // ak > pivot - Move a[k] to the right side - a[--upper] = ak; + /* + * Skip elements, which are less or greater than the pivots. + */ + while (a[++lower] < pivot1); + while (a[--upper] > pivot2); + + /* + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ + for (int unused = --lower, k = ++upper; --k > lower; ) { + int ak = a[k]; + + if (ak < pivot1) { // Move a[k] to the left side + while (lower < k) { + if (a[++lower] >= pivot1) { + if (a[lower] > pivot2) { + a[k] = a[--upper]; + a[upper] = a[lower]; + } else { + a[k] = a[lower]; } + a[lower] = ak; + break; } } + } else if (ak > pivot2) { // Move a[k] to the right side + a[k] = a[--upper]; + a[upper] = ak; + } + } - /* - * Swap the pivot into its final position. 
- */ - a[low] = a[lower]; a[lower] = pivot; + /* + * Swap the pivots into their final positions. + */ + a[low] = a[lower]; a[lower] = pivot1; + a[end] = a[upper]; a[upper] = pivot2; - /* - * Sort the right part (possibly in parallel), excluding - * known pivot. All elements from the central part are - * equal and therefore already sorted. - */ - if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { - sorter.forkSorter(bits | 1, upper, high); - } else { - sort(sorter, a, bits | 1, upper, high); + pivotIndices[0] = lower; + pivotIndices[1] = upper; + } + + + + /** + * Partitions the specified range of the array using a single pivot specified. + * + * @param a the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the pivot to be used. + * After partitioning, the indices stored in this array are updated as well. + * + */ + private static void partitionSinglePivot(int[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; + + + int e3 = pivotIndices[0]; + int pivot = a[e3]; + + /* + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ + a[e3] = a[lower]; + + /* + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ?
| == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ + for (int k = ++upper; --k > lower; ) { + int ak = a[k]; + + if (ak != pivot) { + a[k] = pivot; + + if (ak < pivot) { // Move a[k] to the left side + while (a[++lower] < pivot); + + if (a[lower] > pivot) { + a[--upper] = a[lower]; + } + a[lower] = ak; + } else { // ak > pivot - Move a[k] to the right side + a[--upper] = ak; } } - high = lower; // Iterate along the left part } + + /* + * Swap the pivot into its final position. + */ + a[low] = a[lower]; a[lower] = pivot; + pivotIndices[0] = lower; + pivotIndices[1] = upper; } /** @@ -932,12 +1085,14 @@ static void sort(long[] a, int parallelism, int low, int high) { static void sort(Sorter sorter, long[] a, int bits, int low, int high) { while (true) { int end = high - 1, size = high - low; + int[] pivotIndices; + int baseOffset = Unsafe.ARRAY_LONG_BASE_OFFSET; /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high); + Arrays.arraySort(long.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -945,7 +1100,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(a, low, high); + Arrays.arraySort(long.class, a, baseOffset, low, high, -1); return; } @@ -1025,164 +1180,214 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. 
*/ - if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { + boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); + if(isDualPivot) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - long pivot1 = a[e1]; - long pivot2 = a[e5]; - + pivotIndices = new int[] {e1, e5}; + Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. + * Sort non-left parts recursively (possibly in parallel), + * excluding known pivots. */ - a[e1] = a[lower]; - a[e5] = a[upper]; + if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { + sorter.forkSorter(bits | 1, lower + 1, upper); + sorter.forkSorter(bits | 1, upper + 1, high); + } else { + sort(sorter, a, bits | 1, lower + 1, upper); + sort(sorter, a, bits | 1, upper + 1, high); + } + + } else { // Use single pivot in case of many equal elements /* - * Skip elements, which are less or greater than the pivots. + * Use the third of the five sorted elements as the pivot. + * This value is inexpensive approximation of the median. */ - while (a[++lower] < pivot1); - while (a[--upper] > pivot2); - + pivotIndices = new int[] {e3, e3}; + Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? 
| pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part + * Sort the right part (possibly in parallel), excluding + * known pivot. All elements from the central part are + * equal and therefore already sorted. */ - for (int unused = --lower, k = ++upper; --k > lower; ) { - long ak = a[k]; + if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { + sorter.forkSorter(bits | 1, upper, high); + } else { + sort(sorter, a, bits | 1, upper, high); + } + } + high = lower; // Iterate along the left part + } + } - if (ak < pivot1) { // Move a[k] to the left side - while (lower < k) { - if (a[++lower] >= pivot1) { - if (a[lower] > pivot2) { - a[k] = a[--upper]; - a[upper] = a[lower]; - } else { - a[k] = a[lower]; - } - a[lower] = ak; - break; - } - } - } else if (ak > pivot2) { // Move a[k] to the right side - a[k] = a[--upper]; - a[upper] = ak; - } - } + /** + * Partitions the specified range of the array using the two pivots specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the two pivots to be used. + * After partitioning, this array the indices of the pivots is updated as well. + * + */ + private static void partitionDualPivot(long[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; - /* - * Swap the pivots into their final positions. 
- */ - a[low] = a[lower]; a[lower] = pivot1; - a[end] = a[upper]; a[upper] = pivot2; + int e1 = pivotIndices[0]; + int e5 = pivotIndices[1]; + long pivot1 = a[e1]; + long pivot2 = a[e5]; - /* - * Sort non-left parts recursively (possibly in parallel), - * excluding known pivots. - */ - if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { - sorter.forkSorter(bits | 1, lower + 1, upper); - sorter.forkSorter(bits | 1, upper + 1, high); - } else { - sort(sorter, a, bits | 1, lower + 1, upper); - sort(sorter, a, bits | 1, upper + 1, high); - } + /* + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ + a[e1] = a[lower]; + a[e5] = a[upper]; - } else { // Use single pivot in case of many equal elements + /* + * Skip elements, which are less or greater than the pivots. + */ + while (a[++lower] < pivot1); + while (a[--upper] > pivot2); - /* - * Use the third of the five sorted elements as the pivot. - * This value is inexpensive approximation of the median. - */ - long pivot = a[e3]; + /* + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? 
| pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ + for (int unused = --lower, k = ++upper; --k > lower; ) { + long ak = a[k]; + + if (ak < pivot1) { // Move a[k] to the left side + while (lower < k) { + if (a[++lower] >= pivot1) { + if (a[lower] > pivot2) { + a[k] = a[--upper]; + a[upper] = a[lower]; + } else { + a[k] = a[lower]; + } + a[lower] = ak; + break; + } + } + } else if (ak > pivot2) { // Move a[k] to the right side + a[k] = a[--upper]; + a[upper] = ak; + } + } - /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. - */ - a[e3] = a[lower]; + /* + * Swap the pivots into their final positions. + */ + a[low] = a[lower]; a[lower] = pivot1; + a[end] = a[upper]; a[upper] = pivot2; - /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? | == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ - for (int k = ++upper; --k > lower; ) { - long ak = a[k]; + pivotIndices[0] = lower; + pivotIndices[1] = upper; + } - if (ak != pivot) { - a[k] = pivot; - if (ak < pivot) { // Move a[k] to the left side - while (a[++lower] < pivot); + /** + * Partitions the specified range of the array using a single pivot specified. 
+ * + * @param a the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the pivot to be used. + * After partitioning, the indices stored in this array are updated as well. + * + */ + private static void partitionSinglePivot(long[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; - if (a[lower] > pivot) { - a[--upper] = a[lower]; - } - a[lower] = ak; - } else { // ak > pivot - Move a[k] to the right side - a[--upper] = ak; - } - } - } + int e3 = pivotIndices[0]; + long pivot = a[e3]; - /* - * Swap the pivot into its final position. - */ - a[low] = a[lower]; a[lower] = pivot; + /* + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ + a[e3] = a[lower]; - /* - * Sort the right part (possibly in parallel), excluding - * known pivot. All elements from the central part are - * equal and therefore already sorted. - */ - if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { - sorter.forkSorter(bits | 1, upper, high); - } else { - sort(sorter, a, bits | 1, upper, high); + /* + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ?
| == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ + for (int k = ++upper; --k > lower; ) { + long ak = a[k]; + + if (ak != pivot) { + a[k] = pivot; + + if (ak < pivot) { // Move a[k] to the left side + while (a[++lower] < pivot); + + if (a[lower] > pivot) { + a[--upper] = a[lower]; + } + a[lower] = ak; + } else { // ak > pivot - Move a[k] to the right side + a[--upper] = ak; } } - high = lower; // Iterate along the left part } + + /* + * Swap the pivot into its final position. + */ + a[low] = a[lower]; a[lower] = pivot; + pivotIndices[0] = lower; + pivotIndices[1] = upper; } /** @@ -2473,12 +2678,14 @@ static void sort(float[] a, int parallelism, int low, int high) { static void sort(Sorter sorter, float[] a, int bits, int low, int high) { while (true) { int end = high - 1, size = high - low; + int[] pivotIndices; + int baseOffset = Unsafe.ARRAY_FLOAT_BASE_OFFSET; /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high); + Arrays.arraySort(float.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -2486,7 +2693,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(a, low, high); + Arrays.arraySort(float.class, a, baseOffset, low, high, -1); return; } @@ -2566,79 +2773,18 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. 
*/ - if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { + boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); + if(isDualPivot) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - float pivot1 = a[e1]; - float pivot2 = a[e5]; - - /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. - */ - a[e1] = a[lower]; - a[e5] = a[upper]; - - /* - * Skip elements, which are less or greater than the pivots. - */ - while (a[++lower] < pivot1); - while (a[--upper] > pivot2); - - /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ - for (int unused = --lower, k = ++upper; --k > lower; ) { - float ak = a[k]; - - if (ak < pivot1) { // Move a[k] to the left side - while (lower < k) { - if (a[++lower] >= pivot1) { - if (a[lower] > pivot2) { - a[k] = a[--upper]; - a[upper] = a[lower]; - } else { - a[k] = a[lower]; - } - a[lower] = ak; - break; - } - } - } else if (ak > pivot2) { // Move a[k] to the right side - a[k] = a[--upper]; - a[upper] = ak; - } - } - - /* - * Swap the pivots into their final positions. 
- */ - a[low] = a[lower]; a[lower] = pivot1; - a[end] = a[upper]; a[upper] = pivot2; - + pivotIndices = new int[] {e1, e5}; + Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* * Sort non-left parts recursively (possibly in parallel), * excluding known pivots. @@ -2657,73 +2803,184 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - float pivot = a[e3]; - + pivotIndices = new int[] {e3, e3}; + Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. + * Sort the right part (possibly in parallel), excluding + * known pivot. All elements from the central part are + * equal and therefore already sorted. */ - a[e3] = a[lower]; + if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { + sorter.forkSorter(bits | 1, upper, high); + } else { + sort(sorter, a, bits | 1, upper, high); + } + } + high = lower; // Iterate along the left part + } + } - /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? 
| == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ - for (int k = ++upper; --k > lower; ) { - float ak = a[k]; + /** + * Partitions the specified range of the array using the two pivots specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the two pivots to be used. + * After partitioning, this array the indices of the pivots is updated as well. + * + */ + private static void partitionDualPivot(float[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; - if (ak != pivot) { - a[k] = pivot; + int e1 = pivotIndices[0]; + int e5 = pivotIndices[1]; + float pivot1 = a[e1]; + float pivot2 = a[e5]; - if (ak < pivot) { // Move a[k] to the left side - while (a[++lower] < pivot); + /* + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ + a[e1] = a[lower]; + a[e5] = a[upper]; - if (a[lower] > pivot) { - a[--upper] = a[lower]; - } - a[lower] = ak; - } else { // ak > pivot - Move a[k] to the right side - a[--upper] = ak; + /* + * Skip elements, which are less or greater than the pivots. + */ + while (a[++lower] < pivot1); + while (a[--upper] > pivot2); + + /* + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? 
| pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ + for (int unused = --lower, k = ++upper; --k > lower; ) { + float ak = a[k]; + + if (ak < pivot1) { // Move a[k] to the left side + while (lower < k) { + if (a[++lower] >= pivot1) { + if (a[lower] > pivot2) { + a[k] = a[--upper]; + a[upper] = a[lower]; + } else { + a[k] = a[lower]; } + a[lower] = ak; + break; } } + } else if (ak > pivot2) { // Move a[k] to the right side + a[k] = a[--upper]; + a[upper] = ak; + } + } - /* - * Swap the pivot into its final position. - */ - a[low] = a[lower]; a[lower] = pivot; + /* + * Swap the pivots into their final positions. + */ + a[low] = a[lower]; a[lower] = pivot1; + a[end] = a[upper]; a[upper] = pivot2; - /* - * Sort the right part (possibly in parallel), excluding - * known pivot. All elements from the central part are - * equal and therefore already sorted. - */ - if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { - sorter.forkSorter(bits | 1, upper, high); - } else { - sort(sorter, a, bits | 1, upper, high); + pivotIndices[0] = lower; + pivotIndices[1] = upper; + } + + + /** + * Partitions the specified range of the array using a single pivot specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the pivot to be used. + * After partitioning, this array the indices of the pivots is updated as well. 
+ * + */ + private static void partitionSinglePivot(float[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; + + int e3 = pivotIndices[0]; + float pivot = a[e3]; + + /* + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ + a[e3] = a[lower]; + + /* + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ? | == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ + for (int k = ++upper; --k > lower; ) { + float ak = a[k]; + + if (ak != pivot) { + a[k] = pivot; + + if (ak < pivot) { // Move a[k] to the left side + while (a[++lower] < pivot); + + if (a[lower] > pivot) { + a[--upper] = a[lower]; + } + a[lower] = ak; + } else { // ak > pivot - Move a[k] to the right side + a[--upper] = ak; } } - high = lower; // Iterate along the left part } + + /* + * Swap the pivot into its final position. + */ + a[low] = a[lower]; a[lower] = pivot; + pivotIndices[0] = lower; + pivotIndices[1] = upper; } /** @@ -3279,12 +3536,14 @@ static void sort(double[] a, int parallelism, int low, int high) { static void sort(Sorter sorter, double[] a, int bits, int low, int high) { while (true) { int end = high - 1, size = high - low; + int[] pivotIndices; + int baseOffset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET; /* * Run mixed insertion sort on small non-leftmost parts. 
*/ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high); + Arrays.arraySort(double.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -3292,7 +3551,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(a, low, high); + Arrays.arraySort(double.class, a, baseOffset, low, high, -1); return; } @@ -3372,79 +3631,18 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { - - /* - * Use the first and fifth of the five sorted elements as - * the pivots. These values are inexpensive approximation - * of tertiles. Note, that pivot1 < pivot2. - */ - double pivot1 = a[e1]; - double pivot2 = a[e5]; - - /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. - */ - a[e1] = a[lower]; - a[e5] = a[upper]; - - /* - * Skip elements, which are less or greater than the pivots. - */ - while (a[++lower] < pivot1); - while (a[--upper] > pivot2); - - /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? 
| pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ - for (int unused = --lower, k = ++upper; --k > lower; ) { - double ak = a[k]; - - if (ak < pivot1) { // Move a[k] to the left side - while (lower < k) { - if (a[++lower] >= pivot1) { - if (a[lower] > pivot2) { - a[k] = a[--upper]; - a[upper] = a[lower]; - } else { - a[k] = a[lower]; - } - a[lower] = ak; - break; - } - } - } else if (ak > pivot2) { // Move a[k] to the right side - a[k] = a[--upper]; - a[upper] = ak; - } - } + boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); + if(isDualPivot) { /* - * Swap the pivots into their final positions. - */ - a[low] = a[lower]; a[lower] = pivot1; - a[end] = a[upper]; a[upper] = pivot2; - + * Use the first and fifth of the five sorted elements as + * the pivots. These values are inexpensive approximation + * of tertiles. Note, that pivot1 < pivot2. + */ + pivotIndices = new int[] {e1, e5}; + Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* * Sort non-left parts recursively (possibly in parallel), * excluding known pivots. @@ -3463,73 +3661,185 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - double pivot = a[e3]; + pivotIndices = new int[] {e3, e3}; + Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + lower = pivotIndices[0]; + upper = pivotIndices[1]; /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. 
After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. + * Sort the right part (possibly in parallel), excluding + * known pivot. All elements from the central part are + * equal and therefore already sorted. */ - a[e3] = a[lower]; + if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { + sorter.forkSorter(bits | 1, upper, high); + } else { + sort(sorter, a, bits | 1, upper, high); + } + } + high = lower; // Iterate along the left part + } + } - /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? | == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ - for (int k = ++upper; --k > lower; ) { - double ak = a[k]; + /** + * Partitions the specified range of the array using the two pivots specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the two pivots to be used. + * After partitioning, this array the indices of the pivots is updated as well. 
+ * + */ + private static void partitionDualPivot(double[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; - if (ak != pivot) { - a[k] = pivot; + int e1 = pivotIndices[0]; + int e5 = pivotIndices[1]; + double pivot1 = a[e1]; + double pivot2 = a[e5]; - if (ak < pivot) { // Move a[k] to the left side - while (a[++lower] < pivot); + /* + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ + a[e1] = a[lower]; + a[e5] = a[upper]; - if (a[lower] > pivot) { - a[--upper] = a[lower]; - } - a[lower] = ak; - } else { // ak > pivot - Move a[k] to the right side - a[--upper] = ak; + /* + * Skip elements, which are less or greater than the pivots. + */ + while (a[++lower] < pivot1); + while (a[--upper] > pivot2); + + /* + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ + for (int unused = --lower, k = ++upper; --k > lower; ) { + double ak = a[k]; + + if (ak < pivot1) { // Move a[k] to the left side + while (lower < k) { + if (a[++lower] >= pivot1) { + if (a[lower] > pivot2) { + a[k] = a[--upper]; + a[upper] = a[lower]; + } else { + a[k] = a[lower]; } + a[lower] = ak; + break; } } + } else if (ak > pivot2) { // Move a[k] to the right side + a[k] = a[--upper]; + a[upper] = ak; + } + } - /* - * Swap the pivot into its final position. 
- */ - a[low] = a[lower]; a[lower] = pivot; + /* + * Swap the pivots into their final positions. + */ + a[low] = a[lower]; a[lower] = pivot1; + a[end] = a[upper]; a[upper] = pivot2; - /* - * Sort the right part (possibly in parallel), excluding - * known pivot. All elements from the central part are - * equal and therefore already sorted. - */ - if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) { - sorter.forkSorter(bits | 1, upper, high); - } else { - sort(sorter, a, bits | 1, upper, high); + pivotIndices[0] = lower; + pivotIndices[1] = upper; + } + + + + /** + * Partitions the specified range of the array using a single pivot specified. + * + * @param array the array to be partitioned + * @param low the index of the first element, inclusive, for partitioning + * @param high the index of the last element, exclusive, for partitioning + * @param pivotIndices an array containing the indices of the pivot to be used. + * After partitioning, this array the indices of the pivots is updated as well. + */ + private static void partitionSinglePivot(double[] a, int low, int high, int[] pivotIndices) { + int end = high - 1; + int lower = low; + int upper = end; + + int e3 = pivotIndices[0]; + double pivot = a[e3]; + + /* + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ + a[e3] = a[lower]; + + /* + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ? 
| == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ + for (int k = ++upper; --k > lower; ) { + double ak = a[k]; + + if (ak != pivot) { + a[k] = pivot; + + if (ak < pivot) { // Move a[k] to the left side + while (a[++lower] < pivot); + + if (a[lower] > pivot) { + a[--upper] = a[lower]; + } + a[lower] = ak; + } else { // ak > pivot - Move a[k] to the right side + a[--upper] = ak; } } - high = lower; // Iterate along the left part } + + /* + * Swap the pivot into its final position. + */ + a[low] = a[lower]; a[lower] = pivot; + pivotIndices[0] = lower; + pivotIndices[1] = upper; } /** diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index fb5b2f874ee88..be1634fa1f2ca 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -47,12 +47,17 @@ /** * Performance test of Arrays.sort() methods */ +@Fork(value=1, jvmArgsAppend={"-XX:CompileThreshold=1", "-XX:-TieredCompilation"}) @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MICROSECONDS) @State(Scope.Thread) -@Fork(value = 1) +@Warmup(iterations = 3, time=5) +@Measurement(iterations = 3, time=3) public class ArraysSort { + @Param({"10","25","50","75","100", "1000", "10000", "100000", "1000000"}) + private int size; + private int[] ints_unsorted; private long[] longs_unsorted; private float[] floats_unsorted; @@ -64,7 +69,7 @@ public class ArraysSort { private double[] doubles_sorted; - public void initialize(int size) { + public void initialize() { Random rnd = new Random(42); ints_unsorted = new int[size]; @@ -72,6 +77,8 @@ public void initialize(int size) { floats_unsorted = new float[size]; doubles_unsorted = new 
double[size]; + int[] intSpecialCases = {Integer.MIN_VALUE, Integer.MAX_VALUE}; + long[] longSpecialCases = {Long.MIN_VALUE, Long.MAX_VALUE}; float[] floatSpecialCases = {+0.0f, -0.0f, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY, Float.NaN}; double[] doubleSpecialCases = {+0.0, -0.0, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NaN}; @@ -79,16 +86,24 @@ public void initialize(int size) { ints_unsorted[i] = rnd.nextInt(); longs_unsorted[i] = rnd.nextLong(); if (i % 10 != 0) { + ints_unsorted[i] = rnd.nextInt(); + longs_unsorted[i] = rnd.nextLong(); floats_unsorted[i] = rnd.nextFloat(); doubles_unsorted[i] = rnd.nextDouble(); } else { - int rndIdx = rnd.nextInt(doubleSpecialCases.length); - floats_unsorted[i] = floatSpecialCases[rndIdx]; - doubles_unsorted[i] = doubleSpecialCases[rndIdx]; + ints_unsorted[i] = intSpecialCases[rnd.nextInt(intSpecialCases.length)]; + longs_unsorted[i] = longSpecialCases[rnd.nextInt(longSpecialCases.length)]; + floats_unsorted[i] = floatSpecialCases[rnd.nextInt(floatSpecialCases.length)]; + doubles_unsorted[i] = doubleSpecialCases[rnd.nextInt(doubleSpecialCases.length)]; } } } + @Setup + public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { + initialize(); + } + @Setup(Level.Invocation) public void clear() { ints_sorted = ints_unsorted.clone(); @@ -121,52 +136,4 @@ public double[] doubleSort() throws Throwable { return doubles_sorted; } - @Warmup(iterations = 3, time=2) - @Measurement(iterations = 3, time=5) - public static class Small extends ArraysSort { - @Param({"10","25","50","75","100"}) - private int size; - - @Setup - public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { - initialize(size); - } - } - - @Warmup(iterations = 3, time=2) - @Measurement(iterations = 3, time=5) - public static class Medium extends ArraysSort { - @Param({"1000", "10000"}) - private int size; - - @Setup - public 
void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { - initialize(size); - } - } - - @Warmup(iterations = 3, time=40) - @Measurement(iterations = 3, time=30) - public static class Large extends ArraysSort { - @Param({"50000", "100000"}) - private int size; - - @Setup - public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { - initialize(size); - } - } - - @Warmup(iterations = 3, time=120) - @Measurement(iterations = 3, time=30) - public static class VeryLarge extends ArraysSort { - @Param({"1000000"}) - private int size; - - @Setup - public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable { - initialize(size); - } - } - } From 8b80b80bddabe32865596520ec519700bb95710b Mon Sep 17 00:00:00 2001 From: Srinivas Vamsi Parasa Date: Wed, 23 Aug 2023 05:51:43 -0700 Subject: [PATCH 21/40] Update avx512-common-qsort.h --- src/java.base/linux/native/libx86_64/avx512-common-qsort.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h index b1a53a054692f..2a3f608a6f4f8 100644 --- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h @@ -26,7 +26,6 @@ */ // This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) -#include #ifndef AVX512_QSORT_COMMON #define AVX512_QSORT_COMMON From 96cdd190e5cc8c7cfff98bf3f46d0180ecbeb0e2 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Wed, 23 Aug 2023 15:57:22 -0700 Subject: [PATCH 22/40] Update copyright for DPQS.java; replace avx512 pivot calculation with scalar version --- .../native/libx86_64/avx512-32bit-qsort.hpp | 24 +------------ .../native/libx86_64/avx512-64bit-common.h | 14 -------- .../native/libx86_64/avx512-64bit-qsort.hpp | 2 +- 
.../native/libx86_64/avx512-common-qsort.h | 27 +++++--------- .../classes/java/util/DualPivotQuicksort.java | 35 ++++++++++++++----- 5 files changed, 37 insertions(+), 65 deletions(-) diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp index bc1258debd389..7abc3a5454266 100644 --- a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp @@ -392,28 +392,6 @@ X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N) { vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); } -template -X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, const int64_t left, - const int64_t right) { - // median of 16 - int64_t size = (right - left) / 16; - using zmm_t = typename vtype::zmm_t; - using ymm_t = typename vtype::ymm_t; - __m512i rand_index1 = _mm512_set_epi64( - left + size, left + 2 * size, left + 3 * size, left + 4 * size, - left + 5 * size, left + 6 * size, left + 7 * size, left + 8 * size); - __m512i rand_index2 = _mm512_set_epi64( - left + 9 * size, left + 10 * size, left + 11 * size, left + 12 * size, - left + 13 * size, left + 14 * size, left + 15 * size, left + 16 * size); - ymm_t rand_vec1 = - vtype::template i64gather(rand_index1, arr); - ymm_t rand_vec2 = - vtype::template i64gather(rand_index2, arr); - zmm_t rand_vec = vtype::merge(rand_vec1, rand_vec2); - zmm_t sort = sort_zmm_32bit(rand_vec); - // pivot will never be a nan, since there are no nan's! 
- return ((type_t *)&sort)[8]; -} template static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, @@ -433,7 +411,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, return; } - type_t pivot = get_pivot_32bit(arr, left, right); + type_t pivot = get_pivot_scalar(arr, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512_unrolled( diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h index 2c3bfd97e1960..bb7553229eacb 100644 --- a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h +++ b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h @@ -210,19 +210,5 @@ X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm) { return zmm; } -template -X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, const int64_t left, - const int64_t right) { - // median of 8 - int64_t size = (right - left) / 8; - using zmm_t = typename vtype::zmm_t; - __m512i rand_index = _mm512_set_epi64( - left + size, left + 2 * size, left + 3 * size, left + 4 * size, - left + 5 * size, left + 6 * size, left + 7 * size, left + 8 * size); - zmm_t rand_vec = vtype::template i64gather(rand_index, arr); - // pivot will never be a nan, since there are no nan's! 
- zmm_t sort = sort_zmm_64bit(rand_vec); - return ((type_t *)&sort)[4]; -} #endif diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp index 61f618f657049..422f385d052e2 100644 --- a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp @@ -742,7 +742,7 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, return; } - type_t pivot = get_pivot_64bit(arr, left, right); + type_t pivot = get_pivot_scalar(arr, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512_unrolled( diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h index 2a3f608a6f4f8..ae6af54f572fa 100644 --- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h @@ -132,25 +132,16 @@ bool is_a_nan(T elem) { return std::isnan(elem); } -/* - * Sort all the NAN's to end of the array and return the index of the last elem - * in the array which is not a nan - */ template -int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) { - int64_t jj = arrsize - 1; - int64_t ii = 0; - int64_t count = 0; - while (ii <= jj) { - if (is_a_nan(arr[ii])) { - std::swap(arr[ii], arr[jj]); - jj -= 1; - count++; - } else { - ii += 1; - } - } - return arrsize - count - 1; +X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) { + // median of 8 equally spaced elements + int64_t NUM_ELEMENTS = 8; + int64_t MID = NUM_ELEMENTS / 2; + int64_t size = (right - left) / NUM_ELEMENTS; + T temp[NUM_ELEMENTS]; + for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)]; + std::sort(temp, temp + NUM_ELEMENTS); + return temp[MID]; } template diff --git 
a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index 7a7a906176ffd..f02fec26f39a7 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -123,6 +123,11 @@ private DualPivotQuicksort() {} */ private static final int MAX_RECURSION_DEPTH = 64 * DELTA; + /** + * Min array size to call fast small array sort. + */ + private static final int MIN_FAST_SMALL_ARRAY_SORT_SIZE = 16; + /** * Calculates the double depth of parallel merging. * Depth is negative, if tasks split before sorting. @@ -282,7 +287,9 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - Arrays.arraySort(int.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); + int last = high - 3 * ((size >> 5) << 3); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); + else Arrays.arraySort(int.class, a, baseOffset, low, high, last); return; } @@ -290,7 +297,8 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. 
*/ if (size < MAX_INSERTION_SORT_SIZE) { - Arrays.arraySort(int.class, a, baseOffset, low, high, -1); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); + else Arrays.arraySort(int.class, a, baseOffset, low, high, -1); return; } @@ -1092,7 +1100,9 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - Arrays.arraySort(long.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); + int last = high - 3 * ((size >> 5) << 3); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); + else Arrays.arraySort(long.class, a, baseOffset, low, high, last); return; } @@ -1100,7 +1110,8 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - Arrays.arraySort(long.class, a, baseOffset, low, high, -1); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); + else Arrays.arraySort(long.class, a, baseOffset, low, high, -1); return; } @@ -2685,7 +2696,9 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - Arrays.arraySort(float.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); + int last = high - 3 * ((size >> 5) << 3); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); + else Arrays.arraySort(float.class, a, baseOffset, low, high, last); return; } @@ -2693,7 +2706,8 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. 
*/ if (size < MAX_INSERTION_SORT_SIZE) { - Arrays.arraySort(float.class, a, baseOffset, low, high, -1); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); + else Arrays.arraySort(float.class, a, baseOffset, low, high, -1); return; } @@ -3543,7 +3557,9 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - Arrays.arraySort(double.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3)); + int last = high - 3 * ((size >> 5) << 3); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); + else Arrays.arraySort(double.class, a, baseOffset, low, high, last); return; } @@ -3551,7 +3567,8 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - Arrays.arraySort(double.class, a, baseOffset, low, high, -1); + if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); + else Arrays.arraySort(double.class, a, baseOffset, low, high, -1); return; } From 5173849175e8c5cfe51c28a2333e470bc83efaa3 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Wed, 23 Aug 2023 16:24:19 -0700 Subject: [PATCH 23/40] add parallelSort benchmarking --- .../openjdk/bench/java/util/ArraysSort.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index be1634fa1f2ca..059a3626a0d3d 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -118,22 +118,46 @@ public int[] intSort() throws Throwable { return ints_sorted; } + @Benchmark + public int[] intParallelSort() throws Throwable { + Arrays.parallelSort(ints_sorted); + return ints_sorted; + } + @Benchmark public 
long[] longSort() throws Throwable { Arrays.sort(longs_sorted); return longs_sorted; } + @Benchmark + public long[] longParallelSort() throws Throwable { + Arrays.parallelSort(longs_sorted); + return longs_sorted; + } + @Benchmark public float[] floatSort() throws Throwable { Arrays.sort(floats_sorted); return floats_sorted; } + @Benchmark + public float[] floatParallelSort() throws Throwable { + Arrays.parallelSort(floats_sorted); + return floats_sorted; + } + @Benchmark public double[] doubleSort() throws Throwable { Arrays.sort(doubles_sorted); return doubles_sorted; } + @Benchmark + public double[] doubleParallelSort() throws Throwable { + Arrays.parallelSort(doubles_sorted); + return doubles_sorted; + } + } From df17b3e24e2995eece6405eaaa79b9089e1e469e Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 24 Aug 2023 16:25:07 -0700 Subject: [PATCH 24/40] Fix unused assignment in DPQS.java and space in Arrays.java --- .../share/classes/java/util/Arrays.java | 2 +- .../classes/java/util/DualPivotQuicksort.java | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 27f4d38f2e4e2..3547c5eece569 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -395,7 +395,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) { * @param a the array to be sorted */ public static void sort(double[] a) { - DualPivotQuicksort.sort(a, 0, 0, a.length); + DualPivotQuicksort.sort(a, 0, 0, a.length); } /** diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index f02fec26f39a7..0c5f9b48e1e68 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -372,8 +372,8 @@ && tryMergeRuns(sorter, a, low, 
size)) { } // Pointers - int lower = low; // The index of the last element of the left part - int upper = end; // The index of the first element of the right part + int lower; // The index of the last element of the left part + int upper; // The index of the first element of the right part /* * Partitioning with 2 pivots in case of different elements. @@ -1185,8 +1185,8 @@ && tryMergeRuns(sorter, a, low, size)) { } // Pointers - int lower = low; // The index of the last element of the left part - int upper = end; // The index of the first element of the right part + int lower; // The index of the last element of the left part + int upper; // The index of the first element of the right part /* * Partitioning with 2 pivots in case of different elements. @@ -2781,8 +2781,8 @@ && tryMergeRuns(sorter, a, low, size)) { } // Pointers - int lower = low; // The index of the last element of the left part - int upper = end; // The index of the first element of the right part + int lower; // The index of the last element of the left part + int upper; // The index of the first element of the right part /* * Partitioning with 2 pivots in case of different elements. @@ -3642,8 +3642,8 @@ && tryMergeRuns(sorter, a, low, size)) { } // Pointers - int lower = low; // The index of the last element of the left part - int upper = end; // The index of the first element of the right part + int lower; // The index of the last element of the left part + int upper; // The index of the first element of the right part /* * Partitioning with 2 pivots in case of different elements. 
From f3b5fcf5df8c68458e59ddf7f0bbd33ed255f688 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 24 Aug 2023 18:43:04 -0700 Subject: [PATCH 25/40] Move sort and partition intrinsics from Arrays.java to DPQS.java --- src/hotspot/share/classfile/vmIntrinsics.hpp | 10 +- src/hotspot/share/classfile/vmSymbols.hpp | 1 + .../share/classes/java/util/Arrays.java | 46 --------- .../classes/java/util/DualPivotQuicksort.java | 96 ++++++++++++------- 4 files changed, 69 insertions(+), 84 deletions(-) diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 9fce2446aea19..d5936373202ad 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -341,13 +341,13 @@ class methodHandle; do_name( copyOf_name, "copyOf") \ do_signature(copyOf_signature, "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;") \ \ - do_intrinsic(_arraySort, java_util_Arrays, arraySort_name, arraySort_signature, F_S) \ + do_intrinsic(_arraySort, java_util_DualPivotQuicksort, arraySort_name, arraySort_signature, F_S) \ do_name( arraySort_name, "arraySort") \ - do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)V") \ + do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)V") \ \ - do_intrinsic(_arrayPartition, java_util_Arrays, arrayPartition_name, arrayPartition_signature, F_S) \ - do_name(arrayPartition_name, "arrayPartition") \ - do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V") \ + do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ + do_name( arrayPartition_name, "arrayPartition") \ + do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V") \ \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ diff --git a/src/hotspot/share/classfile/vmSymbols.hpp 
b/src/hotspot/share/classfile/vmSymbols.hpp index 1b406550ef78c..38bfa7c978844 100644 --- a/src/hotspot/share/classfile/vmSymbols.hpp +++ b/src/hotspot/share/classfile/vmSymbols.hpp @@ -143,6 +143,7 @@ template(java_util_Vector, "java/util/Vector") \ template(java_util_AbstractList, "java/util/AbstractList") \ template(java_util_Hashtable, "java/util/Hashtable") \ + template(java_util_DualPivotQuicksort, "java/util/DualPivotQuicksort") \ template(java_lang_Compiler, "java/lang/Compiler") \ template(jdk_internal_misc_Signal, "jdk/internal/misc/Signal") \ template(jdk_internal_util_Preconditions, "jdk/internal/util/Preconditions") \ diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 3547c5eece569..419adef0239ff 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -47,7 +47,6 @@ import java.util.stream.LongStream; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import jdk.internal.misc.Unsafe; /** * This class contains various methods for manipulating arrays (such as @@ -79,51 +78,6 @@ public final class Arrays { // Suppresses default constructor, ensuring non-instantiability. private Arrays() {} - /** - * Sorts the specified array into ascending numerical order. - * While the intrinsic is free to choose its own sorting algorithm, the - * fallback implementation uses either mixed insertion sort or simple - * insertion sort. - * - * @param elemType the class of the elements of the array to be sorted - * @param array the array to be sorted - * @param offset the relative offset, in bytes, from the base address of - * the array to sort, otherwise if the array is {@code null},an absolute - * address pointing to the first element to sort from. 
- * @param fromIndex the index of the first element, inclusive, to be sorted - * @param toIndex the index of the last element, exclusive, to be sorted - * @param end the index of the last element for simple insertion sort (in - * the case of mixed insertion sort). In the fallback implementation, - * if end < 0, we use insertion sort else we use mixed insertion sort. - */ - @IntrinsicCandidate - static void arraySort(Class elemType, Object array, long offset, int fromIndex, int toIndex, int end) { - DualPivotQuicksort.smallArraySort(array, fromIndex, toIndex, end); - } - - /** - * Partitions the specified array based on the pivot(s) provided. - * - * @param elemType the class of the array to be sorted - * @param array the array to be sorted - * @param offset the relative offset, in bytes, from the base address of - * the array to partition, otherwise if the array is {@code null},an absolute - * address pointing to the first element to partition from. - * @param fromIndex the index of the first element, inclusive, to be sorted - * @param toIndex the index of the last element, exclusive, to be sorted - * @param pivotIndices the array containing the indices of the pivots. After - * partitioning, this array is updated with the new indices of the pivots. - * @param pivot_offset the offset in bytes pointing to the base address of - * the array used to store the indices of the pivots. - * @param isDualPivot a boolean value to choose between dual pivot - * partitioning and single pivot partitioning - */ - @IntrinsicCandidate - static void arrayPartition(Class elemType, Object array, long offset, int fromIndex, int toIndex, int[] pivotIndices, long pivot_offset, boolean isDualPivot) { - if (isDualPivot) DualPivotQuicksort.partitionDualPivot(array, fromIndex, toIndex, pivotIndices); - else DualPivotQuicksort.partitionSinglePivot(array, fromIndex, toIndex, pivotIndices); - } - /* * Sorting methods. 
Note that all public "sort" methods take the * same form: performing argument checks if necessary, and then diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index 0c5f9b48e1e68..deb2850d30eae 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -29,6 +29,7 @@ import java.util.concurrent.RecursiveTask; import java.util.Arrays; import jdk.internal.misc.Unsafe; +import jdk.internal.vm.annotation.IntrinsicCandidate; /** @@ -128,6 +129,52 @@ private DualPivotQuicksort() {} */ private static final int MIN_FAST_SMALL_ARRAY_SORT_SIZE = 16; + /** + * Sorts the specified array into ascending numerical order. + * While the intrinsic is free to choose its own sorting algorithm, the + * fallback implementation uses either mixed insertion sort or simple + * insertion sort. + * + * @param elemType the class of the elements of the array to be sorted + * @param array the array to be sorted + * @param offset the relative offset, in bytes, from the base address of + * the array to sort, otherwise if the array is {@code null},an absolute + * address pointing to the first element to sort from. + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param end the index of the last element for simple insertion sort (in + * the case of mixed insertion sort). In the fallback implementation, + * if end < 0, we use insertion sort else we use mixed insertion sort. + */ + @IntrinsicCandidate + static void arraySort(Class elemType, Object array, long offset, int low, int high, int end) { + if (end < 0) insertionSort(array, low, high); + else mixedInsertionSort(array, low, end, high); + } + + /** + * Partitions the specified array based on the pivot(s) provided. 
+ * + * @param elemType the class of the array to be sorted + * @param array the array to be sorted + * @param offset the relative offset, in bytes, from the base address of + * the array to partition, otherwise if the array is {@code null},an absolute + * address pointing to the first element to partition from. + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param pivotIndices the array containing the indices of the pivots. After + * partitioning, this array is updated with the new indices of the pivots. + * @param pivot_offset the offset in bytes pointing to the base address of + * the array used to store the indices of the pivots. + * @param isDualPivot a boolean value to choose between dual pivot + * partitioning and single pivot partitioning + */ + @IntrinsicCandidate + static void arrayPartition(Class elemType, Object array, long offset, int low, int high, int[] pivotIndices, long pivot_offset, boolean isDualPivot) { + if (isDualPivot) partitionDualPivot(array, low, high, pivotIndices); + else partitionSinglePivot(array, low, high, pivotIndices); + } + /** * Calculates the double depth of parallel merging. * Depth is negative, if tasks split before sorting. @@ -145,23 +192,6 @@ private static int getDepth(int parallelism, int size) { return depth; } - /** - * Sorts the specified range of the array using either insertion sort - * or mixed insertion sort depending on the value of end. if end < 0, - * we use insertion sort else we use mixed insertion sort. - * - * @param array the array to be sorted - * @param low the index of the first element, inclusive, to be sorted - * @param high the index of the last element, exclusive, to be sorted - * @param end the index of the last element for simple insertion sort (in - * the case of mixed insertion sort). If end < 0, we use insertion sort - * else we use mixed insertion sort. 
- */ - static void smallArraySort(Object array, int low, int high, int end) { - if (end < 0) insertionSort(array, low, high); - else mixedInsertionSort(array, low, end, high); - } - /** * Sorts the specified range of the array using insertion sort * @@ -289,7 +319,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else Arrays.arraySort(int.class, a, baseOffset, low, high, last); + else arraySort(int.class, a, baseOffset, low, high, last); return; } @@ -298,7 +328,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { */ if (size < MAX_INSERTION_SORT_SIZE) { if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else Arrays.arraySort(int.class, a, baseOffset, low, high, -1); + else arraySort(int.class, a, baseOffset, low, high, -1); return; } @@ -386,7 +416,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -411,7 +441,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. 
*/ pivotIndices = new int[] {e3, e3}; - Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1102,7 +1132,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else Arrays.arraySort(long.class, a, baseOffset, low, high, last); + else arraySort(long.class, a, baseOffset, low, high, last); return; } @@ -1111,7 +1141,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { */ if (size < MAX_INSERTION_SORT_SIZE) { if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else Arrays.arraySort(long.class, a, baseOffset, low, high, -1); + else arraySort(long.class, a, baseOffset, low, high, -1); return; } @@ -1200,7 +1230,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1222,7 +1252,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. 
*/ pivotIndices = new int[] {e3, e3}; - Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2698,7 +2728,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else Arrays.arraySort(float.class, a, baseOffset, low, high, last); + else arraySort(float.class, a, baseOffset, low, high, last); return; } @@ -2707,7 +2737,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { */ if (size < MAX_INSERTION_SORT_SIZE) { if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else Arrays.arraySort(float.class, a, baseOffset, low, high, -1); + else arraySort(float.class, a, baseOffset, low, high, -1); return; } @@ -2796,7 +2826,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2818,7 +2848,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. 
*/ pivotIndices = new int[] {e3, e3}; - Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3559,7 +3589,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else Arrays.arraySort(double.class, a, baseOffset, low, high, last); + else arraySort(double.class, a, baseOffset, low, high, last); return; } @@ -3568,7 +3598,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { */ if (size < MAX_INSERTION_SORT_SIZE) { if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else Arrays.arraySort(double.class, a, baseOffset, low, high, -1); + else arraySort(double.class, a, baseOffset, low, high, -1); return; } @@ -3657,7 +3687,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3679,7 +3709,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. 
*/ pivotIndices = new int[] {e3, e3}; - Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; From e44f11a6b69133b8061bb6fda08b65dfe421bd88 Mon Sep 17 00:00:00 2001 From: Srinivas Vamsi Parasa Date: Thu, 24 Aug 2023 18:52:02 -0700 Subject: [PATCH 26/40] Remove unnecessary import in Arrays.java --- src/java.base/share/classes/java/util/Arrays.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java index 419adef0239ff..85c514da3083c 100644 --- a/src/java.base/share/classes/java/util/Arrays.java +++ b/src/java.base/share/classes/java/util/Arrays.java @@ -30,7 +30,6 @@ import java.io.Serializable; import java.lang.reflect.Array; -import java.util.Arrays.NaturalOrder; import java.util.concurrent.ForkJoinPool; import java.util.function.BinaryOperator; import java.util.function.Consumer; From 9642d852cce5a9cf8270b850c124ef38fc158c6d Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Mon, 28 Aug 2023 14:15:36 -0700 Subject: [PATCH 27/40] Clean up parameters passed to arrayPartition; update the check to load library --- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 21 ++++++------ src/hotspot/share/classfile/vmIntrinsics.hpp | 2 +- src/hotspot/share/opto/library_call.cpp | 10 +++--- src/hotspot/share/opto/runtime.cpp | 2 +- .../native/libx86_64/avx512-common-qsort.h | 6 ---- .../classes/java/util/DualPivotQuicksort.java | 33 ++++++++++--------- .../openjdk/bench/java/util/ArraysSort.java | 2 +- 7 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 11936ac764126..640d88f270fe9 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ 
b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4130,18 +4130,19 @@ void StubGenerator::generate_compiler_stubs() { = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); } - // Get addresses for avx512 sort and partition routines - void *libx86_64 = nullptr; - char ebuf_x86_64[1024]; - char dll_name_avx512[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) { - libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64); - } - if (libx86_64 != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64)); - + // Load x86_64 library on supported hardware to enable avx512 sort and partition intrinsics if (UseAVX > 2 && VM_Version::supports_avx512dq()) { + void *libx86_64 = nullptr; + char ebuf_x86_64[1024]; + char dll_name_avx512[JVM_MAXPATHLEN]; + if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) { + libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64); + } + // Get addresses for avx512 sort and partition routines + if (libx86_64 != nullptr) { + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64)); + snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int"); StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64); diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index d5936373202ad..8c5ca344ae304 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -347,7 +347,7 @@ class methodHandle; \ do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ do_name( arrayPartition_name, "arrayPartition") \ - do_signature(arrayPartition_signature, 
"(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V") \ + do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IZ)V") \ \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 10a8734bc1a8a..b9a39ac61babf 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -5208,8 +5208,7 @@ bool LibraryCallKit::inline_array_partition() { Node* fromIndex = argument(4); Node* toIndex = argument(5); Node* pivot_indices = argument(6); - Node* pivot_offset = argument(7); - Node* isDualPivot = argument(9); + Node* isDualPivot = argument(7); const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); @@ -5221,11 +5220,14 @@ bool LibraryCallKit::inline_array_partition() { if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) { return false; // failed input validation } - Node* obj_adr = make_unsafe_address(obj, offset); pivot_indices = must_be_not_null(pivot_indices, true); - Node* pivot_indices_adr = make_unsafe_address(pivot_indices, pivot_offset); //this offset is not same as array offset + const TypeAryPtr* pivot_indices_type = pivot_indices->Value(&_gvn)->isa_aryptr(); + if (pivot_indices_type == nullptr || pivot_indices_type->elem() == Type::BOTTOM ) { + return false; // failed input validation + } + Node* pivot_indices_adr = array_element_address(pivot_indices, intcon(0), T_INT); // Call the stub. 
make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(), diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index a0e383d95afd0..52dd29b8fa793 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -884,7 +884,7 @@ const TypeFunc* OptoRuntime::array_sort_Type() { int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; - fields[argp++] = TypePtr::NOTNULL; // array(fromIndex) + fields[argp++] = TypePtr::NOTNULL; // array fields[argp++] = TypeInt::INT; // fromIndex fields[argp++] = TypeInt::INT; // toIndex assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h index ae6af54f572fa..c56990f921eae 100644 --- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h +++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h @@ -115,12 +115,6 @@ template struct ymm_vector; // Regular quicksort routines: -template -void avx512_dual_pivot_partition(T *arr, int64_t low, int64_t high, int32_t *pivot_indices, bool isDualPivot); - -template -void avx512_single_pivot_partition(T *arr, int64_t low, int64_t high, int32_t *pivot_indices, bool isDualPivot); - template void avx512_qsort(T *arr, int64_t arrsize); diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index deb2850d30eae..b3959b1048e75 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -30,6 +30,7 @@ import java.util.Arrays; import jdk.internal.misc.Unsafe; import jdk.internal.vm.annotation.IntrinsicCandidate; +import jdk.internal.vm.annotation.ForceInline; /** @@ -147,7 +148,8 @@ private DualPivotQuicksort() {} * if end < 0, we use insertion sort else we use mixed 
insertion sort. */ @IntrinsicCandidate - static void arraySort(Class elemType, Object array, long offset, int low, int high, int end) { + @ForceInline + private static void arraySort(Class elemType, Object array, long offset, int low, int high, int end) { if (end < 0) insertionSort(array, low, high); else mixedInsertionSort(array, low, end, high); } @@ -164,13 +166,12 @@ static void arraySort(Class elemType, Object array, long offset, int low, int * @param high the index of the last element, exclusive, to be sorted * @param pivotIndices the array containing the indices of the pivots. After * partitioning, this array is updated with the new indices of the pivots. - * @param pivot_offset the offset in bytes pointing to the base address of - * the array used to store the indices of the pivots. * @param isDualPivot a boolean value to choose between dual pivot * partitioning and single pivot partitioning */ @IntrinsicCandidate - static void arrayPartition(Class elemType, Object array, long offset, int low, int high, int[] pivotIndices, long pivot_offset, boolean isDualPivot) { + @ForceInline + private static void arrayPartition(Class elemType, Object array, long offset, int low, int high, int[] pivotIndices, boolean isDualPivot) { if (isDualPivot) partitionDualPivot(array, low, high, pivotIndices); else partitionSinglePivot(array, low, high, pivotIndices); } @@ -200,7 +201,7 @@ private static int getDepth(int parallelism, int size) { * @param high the index of the last element, exclusive, to be sorted * */ - static void insertionSort(Object array, int low, int high) { + private static void insertionSort(Object array, int low, int high) { switch (array) { case int[] arr -> insertionSort(arr, low, high); case long[] arr -> insertionSort(arr, low, high); @@ -219,7 +220,7 @@ static void insertionSort(Object array, int low, int high) { * @param end the index of the last element for simple insertion sort * */ - static void mixedInsertionSort(Object array, int low, int end, 
int high) { + private static void mixedInsertionSort(Object array, int low, int end, int high) { switch (array) { case int[] arr -> mixedInsertionSort(arr, low, end, high); case long[] arr -> mixedInsertionSort(arr, low, end, high); @@ -239,7 +240,7 @@ static void mixedInsertionSort(Object array, int low, int end, int high) { * After partitioning, the indices of the pivots is updated as well. * */ - static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) { + private static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) { switch(array) { case int[] arr -> partitionDualPivot(arr, low, high, pivotIndices); case long[] arr -> partitionDualPivot(arr, low, high, pivotIndices); @@ -259,7 +260,7 @@ static void partitionDualPivot(Object array, int low, int high, int[] pivotIndic * After partitioning, the indices of the pivots is updated as well. * */ - static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) { + private static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) { switch(array) { case int[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); case long[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); @@ -416,7 +417,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -441,7 +442,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. 
*/ pivotIndices = new int[] {e3, e3}; - arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1230,7 +1231,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1252,7 +1253,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. */ pivotIndices = new int[] {e3, e3}; - arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2826,7 +2827,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. */ pivotIndices = new int[] {e1, e5}; - arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2848,7 +2849,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. */ pivotIndices = new int[] {e3, e3}; - arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3687,7 +3688,7 @@ && tryMergeRuns(sorter, a, low, size)) { * of tertiles. Note, that pivot1 < pivot2. 
*/ pivotIndices = new int[] {e1, e5}; - arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3709,7 +3710,7 @@ && tryMergeRuns(sorter, a, low, size)) { * This value is inexpensive approximation of the median. */ pivotIndices = new int[] {e3, e3}; - arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot); + arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java index 059a3626a0d3d..4cd45d79412c1 100644 --- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java +++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it From 1746eeddeb6b6ca6313434d1a7626cfabd7068cc Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 31 Aug 2023 11:38:08 -0700 Subject: [PATCH 28/40] update build script --- make/modules/java.base/Lib.gmk | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 85a86372dbf1f..2dca1e9f6d20d 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -237,19 +237,21 @@ endif ################################################################################ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) - $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \ - NAME := x86_64, \ - OPTIMIZATION := HIGH, \ - CFLAGS := $(CFLAGS_JDKLIB), \ - CXXFLAGS := $(CXXFLAGS_JDKLIB), \ - LDFLAGS := $(LDFLAGS_JDKLIB) \ - $(call SET_SHARED_LIBRARY_ORIGIN), \ - LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \ - LIBS := $(LIBCXX), \ - LIBS_linux := -lc -lm -ldl, \ - )) - - TARGETS += $(BUILD_LIB_X86_64) + ifeq ($(TOOLCHAIN_TYPE), gcc) + $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \ + NAME := x86_64, \ + TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ + OPTIMIZATION := HIGH, \ + CFLAGS := $(CFLAGS_JDKLIB), \ + CXXFLAGS := $(CXXFLAGS_JDKLIB), \ + LDFLAGS := $(LDFLAGS_JDKLIB) \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ + LIBS := $(LIBCXX), \ + LIBS_linux := -lc -lm -ldl, \ + )) + + TARGETS += $(BUILD_LIB_X86_64) + endif endif ################################################################################ From a0f006b63bd7de1e318cc5922b29faf1ec33fae7 Mon Sep 17 00:00:00 2001 From: Srinivas Vamsi Parasa Date: Thu, 31 Aug 2023 13:29:41 -0700 Subject: [PATCH 29/40] Update make/modules/java.base/Lib.gmk Co-authored-by: Erik Joelsson <37597443+erikj79@users.noreply.github.com> --- make/modules/java.base/Lib.gmk | 28 ++++++++++++++-------------- 1 file changed, 
14 insertions(+), 14 deletions(-) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 2dca1e9f6d20d..b339c9f549bed 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -237,20 +237,20 @@ endif ################################################################################ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) - ifeq ($(TOOLCHAIN_TYPE), gcc) - $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \ - NAME := x86_64, \ - TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ - OPTIMIZATION := HIGH, \ - CFLAGS := $(CFLAGS_JDKLIB), \ - CXXFLAGS := $(CXXFLAGS_JDKLIB), \ - LDFLAGS := $(LDFLAGS_JDKLIB) \ - $(call SET_SHARED_LIBRARY_ORIGIN), \ - LIBS := $(LIBCXX), \ - LIBS_linux := -lc -lm -ldl, \ - )) - - TARGETS += $(BUILD_LIB_X86_64) + ifeq ($(TOOLCHAIN_TYPE), gcc) + $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \ + NAME := x86_64, \ + TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ + OPTIMIZATION := HIGH, \ + CFLAGS := $(CFLAGS_JDKLIB), \ + CXXFLAGS := $(CXXFLAGS_JDKLIB), \ + LDFLAGS := $(LDFLAGS_JDKLIB) \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ + LIBS := $(LIBCXX), \ + LIBS_linux := -lc -lm -ldl, \ + )) + + TARGETS += $(BUILD_LIB_X86_64) endif endif From 0ec5f52d26dd9738fabe9203b95b886e5ccca7f7 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Thu, 31 Aug 2023 14:25:02 -0700 Subject: [PATCH 30/40] Change name of the avxsort library to libx86_64_sort --- make/modules/java.base/Lib.gmk | 6 ++-- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 30 +++++++++---------- .../avx512-32bit-qsort.hpp | 0 .../avx512-64bit-common.h | 0 .../avx512-64bit-qsort.hpp | 0 .../avx512-common-qsort.h | 0 .../avxsort_linux_x86.cpp | 0 7 files changed, 18 insertions(+), 18 deletions(-) rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-32bit-qsort.hpp (100%) rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-64bit-common.h (100%) rename 
src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-64bit-qsort.hpp (100%) rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-common-qsort.h (100%) rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avxsort_linux_x86.cpp (100%) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index b339c9f549bed..e8f7e200f0d70 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -238,8 +238,8 @@ endif ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) ifeq ($(TOOLCHAIN_TYPE), gcc) - $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \ - NAME := x86_64, \ + $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64_SORT, \ + NAME := x86_64_sort, \ TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ OPTIMIZATION := HIGH, \ CFLAGS := $(CFLAGS_JDKLIB), \ @@ -250,7 +250,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) LIBS_linux := -lc -lm -ldl, \ )) - TARGETS += $(BUILD_LIB_X86_64) + TARGETS += $(BUILD_LIB_X86_64_SORT) endif endif diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 1b3fbd1aabfc5..10dfa8673f9d5 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4179,42 +4179,42 @@ void StubGenerator::generate_compiler_stubs() { = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); } - // Load x86_64 library on supported hardware to enable avx512 sort and partition intrinsics + // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics if (UseAVX > 2 && VM_Version::supports_avx512dq()) { - void *libx86_64 = nullptr; + void *libx86_64_sort = nullptr; char ebuf_x86_64[1024]; - char dll_name_avx512[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) { - libx86_64 = os::dll_load(dll_name_avx512, 
ebuf_x86_64, sizeof ebuf_x86_64); + char dll_name_avx512_sort[JVM_MAXPATHLEN]; + if (os::dll_locate_lib(dll_name_avx512_sort, sizeof(dll_name_avx512_sort), Arguments::get_dll_dir(), "x86_64_sort")) { + libx86_64_sort = os::dll_load(dll_name_avx512_sort, ebuf_x86_64, sizeof ebuf_x86_64); } // Get addresses for avx512 sort and partition routines - if (libx86_64 != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64)); + if (libx86_64_sort != nullptr) { + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64_sort)); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int"); - StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_long"); - StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_float"); - StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double"); - StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_int"); - StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_long"); - StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64, 
ebuf_x86_64); + StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_float"); - StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_double"); - StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64); + StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); } } diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp rename to src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h similarity index 100% rename from src/java.base/linux/native/libx86_64/avx512-64bit-common.h rename to src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp rename to src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h similarity index 100% rename from src/java.base/linux/native/libx86_64/avx512-common-qsort.h rename to src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h diff --git a/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp 
similarity index 100% rename from src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp rename to src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp From c096ff62e63fd1a374f7f180e11a7578798b06c7 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 8 Sep 2023 11:04:15 -0700 Subject: [PATCH 31/40] Fix regression when intrinsics are disabled; enable insertion sort in intrinsic, change library name to libsimdsort --- make/modules/java.base/Lib.gmk | 6 +- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 58 +-- src/hotspot/share/classfile/vmIntrinsics.hpp | 20 +- .../gc/shenandoah/c2/shenandoahSupport.cpp | 7 +- src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 12 +- src/hotspot/share/opto/c2compiler.cpp | 6 +- src/hotspot/share/opto/library_call.cpp | 55 ++- src/hotspot/share/opto/library_call.hpp | 2 +- src/hotspot/share/opto/runtime.cpp | 17 +- src/hotspot/share/opto/runtime.hpp | 2 +- src/hotspot/share/runtime/stubRoutines.cpp | 22 +- src/hotspot/share/runtime/stubRoutines.hpp | 14 +- .../avx512-32bit-qsort.hpp | 4 +- .../avx512-64bit-common.h | 0 .../avx512-64bit-qsort.hpp | 4 +- .../avx512-common-qsort.h | 59 ++- .../native/libsimdsort/avxsort_linux_x86.cpp | 85 +++++ .../libx86_64_sort/avxsort_linux_x86.cpp | 67 ---- .../classes/java/util/DualPivotQuicksort.java | 353 +++++++----------- test/jdk/java/util/Arrays/Sorting.java | 7 +- 20 files changed, 419 insertions(+), 381 deletions(-) rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-32bit-qsort.hpp (99%) rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-64bit-common.h (100%) rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-64bit-qsort.hpp (99%) rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-common-qsort.h (91%) create mode 100644 src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp delete mode 100644 src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp diff --git 
a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index e8f7e200f0d70..976f5e8e75582 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -238,8 +238,8 @@ endif ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) ifeq ($(TOOLCHAIN_TYPE), gcc) - $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64_SORT, \ - NAME := x86_64_sort, \ + $(eval $(call SetupJdkLibrary, BUILD_LIB_SIMD_SORT, \ + NAME := simdsort, \ TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ OPTIMIZATION := HIGH, \ CFLAGS := $(CFLAGS_JDKLIB), \ @@ -250,7 +250,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) LIBS_linux := -lc -lm -ldl, \ )) - TARGETS += $(BUILD_LIB_X86_64_SORT) + TARGETS += $(BUILD_LIB_SIMD_SORT) endif endif diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 10dfa8673f9d5..1ac5f566434e4 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4182,39 +4182,51 @@ void StubGenerator::generate_compiler_stubs() { // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics if (UseAVX > 2 && VM_Version::supports_avx512dq()) { - void *libx86_64_sort = nullptr; - char ebuf_x86_64[1024]; - char dll_name_avx512_sort[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_avx512_sort, sizeof(dll_name_avx512_sort), Arguments::get_dll_dir(), "x86_64_sort")) { - libx86_64_sort = os::dll_load(dll_name_avx512_sort, ebuf_x86_64, sizeof ebuf_x86_64); + void *libsimdsort = nullptr; + char ebuf_[1024]; + char dll_name_simd_sort[JVM_MAXPATHLEN]; + if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) { + libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); } // Get addresses for avx512 sort and partition routines - if (libx86_64_sort != nullptr) { - log_info(library)("Loaded library 
%s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64_sort)); + if (libsimdsort != nullptr) { + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libsimdsort)); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int"); - StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_int"); + StubRoutines::_arraysort_int = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_long"); - StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_long"); + StubRoutines::_arraysort_long = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_float"); - StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_float"); + StubRoutines::_arraysort_float = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double"); - StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_double"); + StubRoutines::_arraysort_double = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_int"); - StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_int"); + StubRoutines::_array_partition_single_int = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_long"); - StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_int"); + StubRoutines::_array_partition_dual_int = 
(address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_float"); - StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_long"); + StubRoutines::_array_partition_single_long = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_double"); - StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_long"); + StubRoutines::_array_partition_dual_long = (address)os::dll_lookup(libsimdsort, ebuf_); + + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_float"); + StubRoutines::_array_partition_single_float = (address)os::dll_lookup(libsimdsort, ebuf_); + + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_float"); + StubRoutines::_array_partition_dual_float = (address)os::dll_lookup(libsimdsort, ebuf_); + + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_double"); + StubRoutines::_array_partition_single_double = (address)os::dll_lookup(libsimdsort, ebuf_); + + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_double"); + StubRoutines::_array_partition_dual_double = (address)os::dll_lookup(libsimdsort, ebuf_); } } diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 4f307cfa8a388..bba728b694f8e 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -341,13 +341,19 @@ class methodHandle; do_name( copyOf_name, "copyOf") \ do_signature(copyOf_signature, "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;") \ \ - do_intrinsic(_arraySort, java_util_DualPivotQuicksort, arraySort_name, arraySort_signature, F_S) \ - do_name( arraySort_name, "arraySort") \ - do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)V") \ - \ - 
do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ - do_name( arrayPartition_name, "arrayPartition") \ - do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IZ)V") \ + do_intrinsic(_arraySortMI, java_util_DualPivotQuicksort, arraySortMI_name, arraySortMI_signature, F_S) \ + do_name( arraySortMI_name, "mixedInsertionSort") \ + do_signature(arraySortMI_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)V") \ + do_intrinsic(_arraySortI, java_util_DualPivotQuicksort, arraySortI_name, arraySortI_signature, F_S) \ + do_name( arraySortI_name, "insertionSort") \ + do_signature(arraySortI_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII)V") \ + \ + do_intrinsic(_arrayPartitionSP, java_util_DualPivotQuicksort, arrayPartitionSP_name, arrayPartitionSP_signature, F_S) \ + do_name( arrayPartitionSP_name, "partitionSinglePivot") \ + do_signature(arrayPartitionSP_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)[I") \ + do_intrinsic(_arrayPartitionDP, java_util_DualPivotQuicksort, arrayPartitionDP_name, arrayPartitionDP_signature, F_S) \ + do_name( arrayPartitionDP_name, "partitionDualPivot") \ + do_signature(arrayPartitionDP_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIII)[I") \ \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp index 9a98ec9cd529d..0384ec1942b3d 100644 --- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp +++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp @@ -388,10 +388,11 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) { } args[6]; } calls[] = { "array_partition_stub", - { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore }, { -1, ShenandoahNone }, - { -1, ShenandoahNone }, { -1, ShenandoahNone } }, + { { TypeFunc::Parms, 
ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore }, { -1, ShenandoahNone }, + { -1, ShenandoahNone }, { -1, ShenandoahNone }, { -1, ShenandoahNone } }, "arraysort_stub", - { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone } }, + { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone }, + { -1, ShenandoahNone}, { -1, ShenandoahNone}, { -1, ShenandoahNone} }, "aescrypt_encryptBlock", { { TypeFunc::Parms, ShenandoahLoad }, { TypeFunc::Parms+1, ShenandoahStore }, { TypeFunc::Parms+2, ShenandoahLoad }, { -1, ShenandoahNone}, { -1, ShenandoahNone}, { -1, ShenandoahNone} }, diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index 9107b53fc2b84..e74afd3a7759d 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -331,10 +331,14 @@ static_field(StubRoutines, _arraysort_long, address) \ static_field(StubRoutines, _arraysort_float, address) \ static_field(StubRoutines, _arraysort_double, address) \ - static_field(StubRoutines, _array_partition_int, address) \ - static_field(StubRoutines, _array_partition_long, address) \ - static_field(StubRoutines, _array_partition_float, address) \ - static_field(StubRoutines, _array_partition_double, address) \ + static_field(StubRoutines, _array_partition_single_int, address) \ + static_field(StubRoutines, _array_partition_dual_int, address) \ + static_field(StubRoutines, _array_partition_single_long, address) \ + static_field(StubRoutines, _array_partition_dual_long, address) \ + static_field(StubRoutines, _array_partition_single_float, address) \ + static_field(StubRoutines, _array_partition_dual_float, address) \ + static_field(StubRoutines, _array_partition_single_double, address) \ + static_field(StubRoutines, _array_partition_dual_double, address) \ \ static_field(StubRoutines, _aescrypt_encryptBlock, address) \ static_field(StubRoutines, 
_aescrypt_decryptBlock, address) \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index 5efac02178865..39f56c002e41e 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -597,8 +597,10 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) { case vmIntrinsics::_min_strict: case vmIntrinsics::_max_strict: case vmIntrinsics::_arraycopy: - case vmIntrinsics::_arraySort: - case vmIntrinsics::_arrayPartition: + case vmIntrinsics::_arraySortMI: + case vmIntrinsics::_arraySortI: + case vmIntrinsics::_arrayPartitionSP: + case vmIntrinsics::_arrayPartitionDP: case vmIntrinsics::_indexOfL: case vmIntrinsics::_indexOfU: case vmIntrinsics::_indexOfUL: diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 477d1ff40558b..2ab21eb1b5355 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -293,8 +293,11 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_arraycopy: return inline_arraycopy(); - case vmIntrinsics::_arraySort: return inline_arraysort(); - case vmIntrinsics::_arrayPartition: return inline_array_partition(); + case vmIntrinsics::_arraySortMI: + case vmIntrinsics::_arraySortI: return inline_arraysort(); + + case vmIntrinsics::_arrayPartitionSP: return inline_array_partition(false /* single pivot*/); + case vmIntrinsics::_arrayPartitionDP: return inline_array_partition(true /* dual pivot*/); case vmIntrinsics::_compareToL: return inline_string_compareTo(StrIntrinsicNode::LL); case vmIntrinsics::_compareToU: return inline_string_compareTo(StrIntrinsicNode::UU); @@ -5367,7 +5370,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_ } //------------------------------inline_array_partition----------------------- -bool LibraryCallKit::inline_array_partition() { +bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { 
address stubAddr = nullptr; const char *stubName; @@ -5378,32 +5381,41 @@ bool LibraryCallKit::inline_array_partition() { Node* offset = argument(2); Node* fromIndex = argument(4); Node* toIndex = argument(5); - Node* pivot_indices = argument(6); - Node* isDualPivot = argument(7); + Node* indexPivot1 = argument(6); + Node* indexPivot2 = is_dual_pivot? argument(7) : nullptr; const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); - stubAddr = StubRoutines::select_array_partition_function(bt); - if (stubAddr == nullptr) return false; - + stubAddr = StubRoutines::select_array_partition_function(bt, is_dual_pivot); + // stub not loaded + if (stubAddr == nullptr) { + return false; + } + // get the address of the array const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr(); if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) { return false; // failed input validation } Node* obj_adr = make_unsafe_address(obj, offset); - pivot_indices = must_be_not_null(pivot_indices, true); - const TypeAryPtr* pivot_indices_type = pivot_indices->Value(&_gvn)->isa_aryptr(); - if (pivot_indices_type == nullptr || pivot_indices_type->elem() == Type::BOTTOM ) { - return false; // failed input validation - } - Node* pivot_indices_adr = array_element_address(pivot_indices, intcon(0), T_INT); - - // Call the stub. 
- make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(), + // create the pivotIndices array of type int and size = 2 + Node* pivotIndices = nullptr; + Node* size = intcon(2); + Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT))); + pivotIndices = new_array(klass_node, size, 0); // no arguments to push + AllocateArrayNode* alloc = tightly_coupled_allocation(pivotIndices); + guarantee(alloc != nullptr, "created above"); + Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT)); + + // Call the stub + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(is_dual_pivot), stubAddr, stubName, TypePtr::BOTTOM, - obj_adr, fromIndex, toIndex, pivot_indices_adr, isDualPivot); + obj_adr, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2); + + if (!stopped()) { + set_result(pivotIndices); + } return true; } @@ -5426,13 +5438,18 @@ bool LibraryCallKit::inline_arraysort() { ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); stubAddr = StubRoutines::select_arraysort_function(bt); - if (stubAddr == nullptr) return false; + //stub not loaded + if (stubAddr == nullptr) { + return false; + } + // get address of the array const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr(); if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) { return false; // failed input validation } Node* obj_adr = make_unsafe_address(obj, offset); + // Call the stub. 
make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(), stubAddr, stubName, TypePtr::BOTTOM, diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index 79258f3575d31..d33c1c8ee0538 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -278,7 +278,7 @@ class LibraryCallKit : public GraphKit { void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp, uint new_idx); bool inline_arraysort(); - bool inline_array_partition(); + bool inline_array_partition(bool is_dual_pivot); typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind; bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind); bool inline_unsafe_fence(vmIntrinsics::ID id); diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 52dd29b8fa793..e6d8c956a5e63 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -857,17 +857,20 @@ const TypeFunc* OptoRuntime::array_fill_Type() { return TypeFunc::make(domain, range); } -const TypeFunc* OptoRuntime::array_partition_Type() { +const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) { // create input type (domain) - int num_args = 5; + int num_args = is_dual_pivot ? 
6 : 5; int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; - fields[argp++] = TypePtr::NOTNULL; // array - fields[argp++] = TypeInt::INT; // low - fields[argp++] = TypeInt::INT; // end - fields[argp++] = TypePtr::NOTNULL; // pivot_indices (int array) - fields[argp++] = TypeInt::BOOL; // isDualPivot + fields[argp++] = TypePtr::NOTNULL; // array + fields[argp++] = TypeInt::INT; // low + fields[argp++] = TypeInt::INT; // end + fields[argp++] = TypePtr::NOTNULL; // pivot_indices (int array) + fields[argp++] = TypeInt::INT; // indexPivot1 + if (is_dual_pivot) { + fields[argp++] = TypeInt::INT; // indexPivot2 + } assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp index b85542423e848..4017f70d36296 100644 --- a/src/hotspot/share/opto/runtime.hpp +++ b/src/hotspot/share/opto/runtime.hpp @@ -269,7 +269,7 @@ class OptoRuntime : public AllStatic { static const TypeFunc* array_fill_Type(); static const TypeFunc* array_sort_Type(); - static const TypeFunc* array_partition_Type(); + static const TypeFunc* array_partition_Type(bool is_dual_pivot); static const TypeFunc* aescrypt_block_Type(); static const TypeFunc* cipherBlockChaining_aescrypt_Type(); static const TypeFunc* electronicCodeBook_aescrypt_Type(); diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 40bf177994e19..dce5740cecfda 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -180,10 +180,14 @@ address StubRoutines::_arraysort_int = nullptr; address StubRoutines::_arraysort_long = nullptr; address StubRoutines::_arraysort_float = nullptr; address StubRoutines::_arraysort_double = nullptr; -address StubRoutines::_array_partition_int = nullptr; -address StubRoutines::_array_partition_long = 
nullptr; -address StubRoutines::_array_partition_float = nullptr; -address StubRoutines::_array_partition_double = nullptr; +address StubRoutines::_array_partition_single_int = nullptr; +address StubRoutines::_array_partition_dual_int = nullptr; +address StubRoutines::_array_partition_single_long = nullptr; +address StubRoutines::_array_partition_dual_long = nullptr; +address StubRoutines::_array_partition_single_float = nullptr; +address StubRoutines::_array_partition_dual_float = nullptr; +address StubRoutines::_array_partition_single_double = nullptr; +address StubRoutines::_array_partition_dual_double = nullptr; address StubRoutines::_cont_thaw = nullptr; address StubRoutines::_cont_returnBarrier = nullptr; @@ -553,12 +557,12 @@ address StubRoutines::select_arraysort_function(BasicType t) { } } -address StubRoutines::select_array_partition_function(BasicType t) { +address StubRoutines::select_array_partition_function(BasicType t, bool is_dual_pivot) { switch(t) { - case T_INT: return _array_partition_int; - case T_LONG: return _array_partition_long; - case T_FLOAT: return _array_partition_float; - case T_DOUBLE: return _array_partition_double; + case T_INT: return is_dual_pivot ? _array_partition_dual_int : _array_partition_single_int; + case T_LONG: return is_dual_pivot ? _array_partition_dual_long : _array_partition_single_long; + case T_FLOAT: return is_dual_pivot ? _array_partition_dual_float : _array_partition_single_float; + case T_DOUBLE: return is_dual_pivot ? 
_array_partition_dual_double : _array_partition_single_double; default: ShouldNotReachHere(); return nullptr; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index cc582bf24cc64..ee87450285c2e 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -157,10 +157,14 @@ class StubRoutines: AllStatic { static address _arraysort_long; static address _arraysort_float; static address _arraysort_double; - static address _array_partition_int; - static address _array_partition_long; - static address _array_partition_float; - static address _array_partition_double; + static address _array_partition_single_int; + static address _array_partition_dual_int; + static address _array_partition_single_long; + static address _array_partition_dual_long; + static address _array_partition_single_float; + static address _array_partition_dual_float; + static address _array_partition_single_double; + static address _array_partition_dual_double; // Leaf routines which implement arraycopy and their addresses // arraycopy operands aligned on element type boundary static address _jbyte_arraycopy; @@ -384,7 +388,7 @@ class StubRoutines: AllStatic { static address generic_arraycopy() { return _generic_arraycopy; } static address select_arraysort_function(BasicType t); - static address select_array_partition_function(BasicType t); + static address select_array_partition_function(BasicType t, bool is_dual_pivot); static address jbyte_fill() { return _jbyte_fill; } static address jshort_fill() { return _jshort_fill; } diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp similarity index 99% rename from src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp index 7abc3a5454266..15e406a822900 100644 --- 
a/src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp @@ -423,7 +423,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, } template <> -inline void avx512_qsort(int32_t *arr, int64_t fromIndex, int64_t toIndex) { +void inline avx512_qsort(int32_t *arr, int64_t fromIndex, int64_t toIndex) { int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { qsort_32bit_, int32_t>(arr, fromIndex, toIndex - 1, @@ -432,7 +432,7 @@ inline void avx512_qsort(int32_t *arr, int64_t fromIndex, int64_t toInd } template <> -inline void avx512_qsort(float *arr, int64_t fromIndex, int64_t toIndex) { +void inline avx512_qsort(float *arr, int64_t fromIndex, int64_t toIndex) { int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { qsort_32bit_, float>(arr, fromIndex, toIndex - 1, diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h b/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h similarity index 100% rename from src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h rename to src/java.base/linux/native/libsimdsort/avx512-64bit-common.h diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp similarity index 99% rename from src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp index 422f385d052e2..3028f45a79407 100644 --- a/src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp @@ -754,7 +754,7 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right, } template <> -inline void avx512_qsort(int64_t *arr, int64_t fromIndex, int64_t toIndex) { +void inline avx512_qsort(int64_t *arr, int64_t fromIndex, int64_t toIndex) { int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { qsort_64bit_, 
int64_t>(arr, fromIndex, toIndex - 1, @@ -763,7 +763,7 @@ inline void avx512_qsort(int64_t *arr, int64_t fromIndex, int64_t toInd } template <> -inline void avx512_qsort(double *arr, int64_t fromIndex, int64_t toIndex) { +void inline avx512_qsort(double *arr, int64_t fromIndex, int64_t toIndex) { int64_t arrsize = toIndex - fromIndex; if (arrsize > 1) { qsort_64bit_, double>(arr, fromIndex, toIndex - 1, diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h similarity index 91% rename from src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h rename to src/java.base/linux/native/libsimdsort/avx512-common-qsort.h index c56990f921eae..8f255a38e47d6 100644 --- a/src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h +++ b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h @@ -371,23 +371,21 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left, return l_store; } -// right = to_index (exclusive) +// to_index (exclusive) template -static int64_t vectorized_partition(type_t *arr, int64_t left, int64_t right, type_t pivot, bool use_gt) { +static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) { type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512_unrolled( - arr, left, right, pivot, &smallest, &biggest, use_gt); + arr, from_index, to_index, pivot, &smallest, &biggest, use_gt); return pivot_index; } // partitioning functions template -void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){ - const int64_t pidx1 = pivot_indices[0]; - const int64_t pidx2 = pivot_indices[1]; - const T pivot1 = arr[pidx1]; - const T pivot2 = arr[pidx2]; +void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t 
index_pivot2){ + const T pivot1 = arr[index_pivot1]; + const T pivot2 = arr[index_pivot2]; const int64_t low = from_index; const int64_t high = to_index; @@ -395,14 +393,21 @@ void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, i const int64_t end = high - 1; - std::swap(arr[pidx1], arr[low]); - std::swap(arr[pidx2], arr[end]); + std::swap(arr[index_pivot1], arr[low]); + std::swap(arr[index_pivot2], arr[end]); const int64_t pivot_index2 = vectorized_partition, T>(arr, start, end, pivot2, true); // use_gt = true std::swap(arr[end], arr[pivot_index2]); int64_t upper = pivot_index2; + // if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning + if (upper == start) { + pivot_indices[0] = low; + pivot_indices[1] = upper; + return; + } + const int64_t pivot_index1 = vectorized_partition, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false) int64_t lower = pivot_index1 - 1; std::swap(arr[low], arr[lower]); @@ -412,13 +417,11 @@ void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, i } template -void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){ - const int64_t pidx = pivot_indices[0]; - const T pivot = arr[pidx]; +void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot){ + const T pivot = arr[index_pivot]; const int64_t low = from_index; const int64_t high = to_index; - //const int64_t start = low + 1; const int64_t end = high - 1; @@ -433,11 +436,37 @@ void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, } template -inline void avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) { +void inline avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) { if(is_dual_pviot) avx512_dual_pivot_partition(arr, from_index, 
to_index, pivot_indices); else avx512_single_pivot_partition(arr, from_index, to_index, pivot_indices); } +template +void inline insertion_sort(T *arr, int32_t from_index, int32_t to_index) { + for (int i, k = from_index; ++k < to_index; ) { + T ai = arr[i = k]; + + if (ai < arr[i - 1]) { + while (--i >= from_index && ai < arr[i]) { + arr[i + 1] = arr[i]; + } + arr[i + 1] = ai; + } + } +} + +template +void inline avx512_fastsort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) { + int32_t size = to_index - from_index; + + if (size <= INS_SORT_THRESHOLD) { + insertion_sort(arr, from_index, to_index); + } + else { + avx512_qsort(arr, from_index, to_index); + } +} + #endif // AVX512_QSORT_COMMON diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp new file mode 100644 index 0000000000000..a18acda571ce1 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Intel x86-simd-sort source code. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#pragma GCC target("avx512dq", "avx512f") +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" + +#define DLL_PUBLIC __attribute__((visibility("default"))) +#define INSERTION_SORT_THRESHOLD_32BIT 16 +#define INSERTION_SORT_THRESHOLD_64BIT 20 + +extern "C" { + + DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) { + avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + } + + DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) { + avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + } + + DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) { + avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + } + + DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) { + avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + } + + DLL_PUBLIC void avx512_partition_single_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { + avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); + } + + DLL_PUBLIC void avx512_partition_dual_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + + DLL_PUBLIC void avx512_partition_single_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { + avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); + } + + DLL_PUBLIC void avx512_partition_dual_long(int64_t *array, int64_t 
from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + + DLL_PUBLIC void avx512_partition_single_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { + avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); + } + + DLL_PUBLIC void avx512_partition_dual_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + + DLL_PUBLIC void avx512_partition_single_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { + avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); + } + + DLL_PUBLIC void avx512_partition_dual_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + +} diff --git a/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp deleted file mode 100644 index aeea98006ce48..0000000000000 --- a/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2023 Intel Corporation. All rights reserved. - * Intel x86-simd-sort source code. - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. 
- * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#pragma GCC target("avx512dq", "avx512f") -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - -#define DLL_PUBLIC __attribute__((visibility("default"))) - -extern "C" { - - DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) { - avx512_qsort(array, from_index, to_index); - } - - DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) { - avx512_qsort(array, from_index, to_index); - } - - DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) { - avx512_qsort(array, from_index, to_index); - } - - DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) { - avx512_qsort(array, from_index, to_index); - } - - DLL_PUBLIC void avx512_partition_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { - avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); - } - - DLL_PUBLIC void avx512_partition_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { - avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); - } - - DLL_PUBLIC void avx512_partition_float(float *array, 
int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { - avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); - } - - DLL_PUBLIC void avx512_partition_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) { - avx512_partition(array, from_index, to_index, pivot_indices, is_dual_pivot); - } - -} diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index b3959b1048e75..f2cf2deab6638 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -126,15 +126,9 @@ private DualPivotQuicksort() {} private static final int MAX_RECURSION_DEPTH = 64 * DELTA; /** - * Min array size to call fast small array sort. - */ - private static final int MIN_FAST_SMALL_ARRAY_SORT_SIZE = 16; - - /** - * Sorts the specified array into ascending numerical order. - * While the intrinsic is free to choose its own sorting algorithm, the - * fallback implementation uses either mixed insertion sort or simple - * insertion sort. + * Sorts the specified array into ascending numerical order using + * mixed insertion sort.The intrinsic is free to choose its own + * sorting algorithm. * * @param elemType the class of the elements of the array to be sorted * @param array the array to be sorted @@ -143,66 +137,35 @@ private DualPivotQuicksort() {} * address pointing to the first element to sort from. * @param low the index of the first element, inclusive, to be sorted * @param high the index of the last element, exclusive, to be sorted - * @param end the index of the last element for simple insertion sort (in - * the case of mixed insertion sort). In the fallback implementation, - * if end < 0, we use insertion sort else we use mixed insertion sort. 
+ * @param end the index of the last element for simple insertion sort */ @IntrinsicCandidate - @ForceInline - private static void arraySort(Class elemType, Object array, long offset, int low, int high, int end) { - if (end < 0) insertionSort(array, low, high); - else mixedInsertionSort(array, low, end, high); + private static void mixedInsertionSort(Class elemType, Object array, long offset, int low, int high, int end) { + switch (array) { + case int[] arr -> mixedInsertionSort(arr, low, end, high); + case long[] arr -> mixedInsertionSort(arr, low, end, high); + case float[] arr -> mixedInsertionSort(arr, low, end, high); + case double[] arr -> mixedInsertionSort(arr, low, end, high); + default -> throw new UnsupportedOperationException(); + } } /** - * Partitions the specified array based on the pivot(s) provided. + * Sorts the specified array into ascending numerical order using + * insertion sort.The intrinsic is free to choose its own + * sorting algorithm. * - * @param elemType the class of the array to be sorted + * @param elemType the class of the elements of the array to be sorted * @param array the array to be sorted * @param offset the relative offset, in bytes, from the base address of - * the array to partition, otherwise if the array is {@code null},an absolute - * address pointing to the first element to partition from. + * the array to sort, otherwise if the array is {@code null},an absolute + * address pointing to the first element to sort from. * @param low the index of the first element, inclusive, to be sorted * @param high the index of the last element, exclusive, to be sorted - * @param pivotIndices the array containing the indices of the pivots. After - * partitioning, this array is updated with the new indices of the pivots. 
- * @param isDualPivot a boolean value to choose between dual pivot - * partitioning and single pivot partitioning */ @IntrinsicCandidate - @ForceInline - private static void arrayPartition(Class elemType, Object array, long offset, int low, int high, int[] pivotIndices, boolean isDualPivot) { - if (isDualPivot) partitionDualPivot(array, low, high, pivotIndices); - else partitionSinglePivot(array, low, high, pivotIndices); - } - - /** - * Calculates the double depth of parallel merging. - * Depth is negative, if tasks split before sorting. - * - * @param parallelism the parallelism level - * @param size the target size - * @return the depth of parallel merging - */ - private static int getDepth(int parallelism, int size) { - int depth = 0; - - while ((parallelism >>= 3) > 0 && (size >>= 2) > 0) { - depth -= 2; - } - return depth; - } - - /** - * Sorts the specified range of the array using insertion sort - * - * @param array the array to be sorted - * @param low the index of the first element, inclusive, to be sorted - * @param high the index of the last element, exclusive, to be sorted - * - */ - private static void insertionSort(Object array, int low, int high) { - switch (array) { + private static void insertionSort(Class elemType, Object array, long offset, int low, int high) { + switch (array) { case int[] arr -> insertionSort(arr, low, high); case long[] arr -> insertionSort(arr, low, high); case float[] arr -> insertionSort(arr, low, high); @@ -212,62 +175,69 @@ private static void insertionSort(Object array, int low, int high) { } /** - * Sorts the specified range of the array using mixed insertion sort. + * Partitions the specified array based on the two pivots provided. 
* + * @param elemType the class of the array to be sorted * @param array the array to be sorted + * @param offset the relative offset, in bytes, from the base address of + * the array to partition, otherwise if the array is {@code null},an absolute + * address pointing to the first element to partition from. * @param low the index of the first element, inclusive, to be sorted * @param high the index of the last element, exclusive, to be sorted - * @param end the index of the last element for simple insertion sort - * + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot */ - private static void mixedInsertionSort(Object array, int low, int end, int high) { - switch (array) { - case int[] arr -> mixedInsertionSort(arr, low, end, high); - case long[] arr -> mixedInsertionSort(arr, low, end, high); - case float[] arr -> mixedInsertionSort(arr, low, end, high); - case double[] arr -> mixedInsertionSort(arr, low, end, high); + @IntrinsicCandidate + @ForceInline + private static int[] partitionDualPivot(Class elemType, Object array, long offset, int low, int high, int indexPivot1, int indexPivot2) { + return switch(array) { + case int[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); + case long[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); + case float[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); + case double[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); default -> throw new UnsupportedOperationException(); - } + }; } /** - * Partitions the specified range of the array using the two pivots specified. - * - * @param array the array to be partitioned - * @param low the index of the first element, inclusive, for partitioning - * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the two pivots to be used. 
- * After partitioning, the indices of the pivots is updated as well. + * Partitions the specified array based on the single pivot provided. * + * @param elemType the class of the array to be sorted + * @param array the array to be sorted + * @param offset the relative offset, in bytes, from the base address of + * the array to partition, otherwise if the array is {@code null},an absolute + * address pointing to the first element to partition from. + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param indexPivot the index of the pivot */ - private static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) { - switch(array) { - case int[] arr -> partitionDualPivot(arr, low, high, pivotIndices); - case long[] arr -> partitionDualPivot(arr, low, high, pivotIndices); - case float[] arr -> partitionDualPivot(arr, low, high, pivotIndices); - case double[] arr -> partitionDualPivot(arr, low, high, pivotIndices); + @IntrinsicCandidate + @ForceInline + private static int[] partitionSinglePivot(Class elemType, Object array, long offset, int low, int high, int indexPivot) { + return switch(array) { + case int[] arr -> partitionSinglePivot(arr, low, high, indexPivot); + case long[] arr -> partitionSinglePivot(arr, low, high, indexPivot); + case float[] arr -> partitionSinglePivot(arr, low, high, indexPivot); + case double[] arr -> partitionSinglePivot(arr, low, high, indexPivot); default -> throw new UnsupportedOperationException(); - } + }; } /** - * Partitions the specified range of the array using a single pivot specified. - * - * @param array the array to be partitioned - * @param low the index of the first element, inclusive, for partitioning - * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the pivot to be used. 
- * After partitioning, the indices of the pivots is updated as well. + * Calculates the double depth of parallel merging. + * Depth is negative, if tasks split before sorting. * + * @param parallelism the parallelism level + * @param size the target size + * @return the depth of parallel merging */ - private static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) { - switch(array) { - case int[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); - case long[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); - case float[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); - case double[] arr -> partitionSinglePivot(arr, low, high, pivotIndices); - default -> throw new UnsupportedOperationException(); + private static int getDepth(int parallelism, int size) { + int depth = 0; + + while ((parallelism >>= 3) > 0 && (size >>= 2) > 0) { + depth -= 2; } + return depth; } /** @@ -309,18 +279,14 @@ static void sort(int[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { + int[] pivotIndices; while (true) { int end = high - 1, size = high - low; - int[] pivotIndices; - int baseOffset = Unsafe.ARRAY_INT_BASE_OFFSET; - /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - int last = high - 3 * ((size >> 5) << 3); - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else arraySort(int.class, a, baseOffset, low, high, last); + mixedInsertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -328,8 +294,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. 
*/ if (size < MAX_INSERTION_SORT_SIZE) { - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else arraySort(int.class, a, baseOffset, low, high, -1); + insertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high); return; } @@ -409,15 +374,13 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); - if (isDualPivot) { + if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - pivotIndices = new int[] {e1, e5}; - arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionDualPivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -441,8 +404,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = new int[] {e3, e3}; - arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionSinglePivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -461,22 +423,23 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots specified. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the two pivots to be used. 
- * After partitioning, this array the indices of the pivots is updated as well. + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ - private static void partitionDualPivot(int[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionDualPivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) { int end = high - 1; int lower = low; int upper = end; - int e1 = pivotIndices[0]; - int e5 = pivotIndices[1]; + int e1 = indexPivot1; + int e5 = indexPivot2; int pivot1 = a[e1]; int pivot2 = a[e5]; @@ -543,29 +506,26 @@ private static void partitionDualPivot(int[] a, int low, int high, int[] pivotIn a[low] = a[lower]; a[lower] = pivot1; a[end] = a[upper]; a[upper] = pivot2; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** - * Partitions the specified range of the array using a single pivot specified. + * Partitions the specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the pivot to be used. - * After partitioning, this array the indices of the pivots is updated as well. + * @param indexPivot the index of the pivot * */ - private static void partitionSinglePivot(int[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot) { int end = high - 1; int lower = low; int upper = end; - - - int e3 = pivotIndices[0]; + int e3 = indexPivot; int pivot = a[e3]; /* @@ -619,8 +579,7 @@ private static void partitionSinglePivot(int[] a, int low, int high, int[] pivot * Swap the pivot into its final position. 
*/ a[low] = a[lower]; a[lower] = pivot; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** @@ -1122,18 +1081,16 @@ static void sort(long[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { + int[] pivotIndices; while (true) { int end = high - 1, size = high - low; - int[] pivotIndices; - int baseOffset = Unsafe.ARRAY_LONG_BASE_OFFSET; /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else arraySort(long.class, a, baseOffset, low, high, last); + mixedInsertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -1141,8 +1098,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else arraySort(long.class, a, baseOffset, low, high, -1); + insertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high); return; } @@ -1222,16 +1178,14 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); - if(isDualPivot) { + if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - pivotIndices = new int[] {e1, e5}; - arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionDualPivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1252,8 +1206,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = new int[] {e3, e3}; - arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionSinglePivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1272,22 +1225,23 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots specified. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the two pivots to be used. - * After partitioning, this array the indices of the pivots is updated as well. 
+ * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ - private static void partitionDualPivot(long[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionDualPivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) { int end = high - 1; int lower = low; int upper = end; - int e1 = pivotIndices[0]; - int e5 = pivotIndices[1]; + int e1 = indexPivot1; + int e5 = indexPivot2; long pivot1 = a[e1]; long pivot2 = a[e5]; @@ -1354,27 +1308,26 @@ private static void partitionDualPivot(long[] a, int low, int high, int[] pivotI a[low] = a[lower]; a[lower] = pivot1; a[end] = a[upper]; a[upper] = pivot2; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** - * Partitions the specified range of the array using a single pivot specified. + * Partitions the specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the pivot to be used. - * After partitioning, this array the indices of the pivots is updated as well. + * @param indexPivot the index of the pivot * */ - private static void partitionSinglePivot(long[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot) { int end = high - 1; int lower = low; int upper = end; - int e3 = pivotIndices[0]; + int e3 = indexPivot; long pivot = a[e3]; /* @@ -1428,8 +1381,7 @@ private static void partitionSinglePivot(long[] a, int low, int high, int[] pivo * Swap the pivot into its final position. 
*/ a[low] = a[lower]; a[lower] = pivot; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** @@ -2718,18 +2670,16 @@ static void sort(float[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { + int[] pivotIndices; while (true) { int end = high - 1, size = high - low; - int[] pivotIndices; - int baseOffset = Unsafe.ARRAY_FLOAT_BASE_OFFSET; /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else arraySort(float.class, a, baseOffset, low, high, last); + mixedInsertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -2737,8 +2687,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else arraySort(float.class, a, baseOffset, low, high, -1); + insertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high); return; } @@ -2818,16 +2767,14 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); - if(isDualPivot) { + if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - pivotIndices = new int[] {e1, e5}; - arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionDualPivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2848,8 +2795,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = new int[] {e3, e3}; - arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionSinglePivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2868,22 +2814,23 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots specified. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the two pivots to be used. - * After partitioning, this array the indices of the pivots is updated as well. 
+ * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ - private static void partitionDualPivot(float[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionDualPivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) { int end = high - 1; int lower = low; int upper = end; - int e1 = pivotIndices[0]; - int e5 = pivotIndices[1]; + int e1 = indexPivot1; + int e5 = indexPivot2; float pivot1 = a[e1]; float pivot2 = a[e5]; @@ -2950,27 +2897,26 @@ private static void partitionDualPivot(float[] a, int low, int high, int[] pivot a[low] = a[lower]; a[lower] = pivot1; a[end] = a[upper]; a[upper] = pivot2; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** - * Partitions the specified range of the array using a single pivot specified. + * Partitions the specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the pivot to be used. - * After partitioning, this array the indices of the pivots is updated as well. + * @param indexPivot the index of the pivot * */ - private static void partitionSinglePivot(float[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot) { int end = high - 1; int lower = low; int upper = end; - int e3 = pivotIndices[0]; + int e3 = indexPivot; float pivot = a[e3]; /* @@ -3024,8 +2970,7 @@ private static void partitionSinglePivot(float[] a, int low, int high, int[] piv * Swap the pivot into its final position. 
*/ a[low] = a[lower]; a[lower] = pivot; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** @@ -3579,18 +3524,15 @@ static void sort(double[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { + int[] pivotIndices; while (true) { int end = high - 1, size = high - low; - int[] pivotIndices; - int baseOffset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET; - /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { int last = high - 3 * ((size >> 5) << 3); - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high); - else arraySort(double.class, a, baseOffset, low, high, last); + mixedInsertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); return; } @@ -3598,8 +3540,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high); - else arraySort(double.class, a, baseOffset, low, high, -1); + insertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high); return; } @@ -3679,16 +3620,14 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]); - if(isDualPivot) { + if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - pivotIndices = new int[] {e1, e5}; - arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionDualPivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3709,8 +3648,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = new int[] {e3, e3}; - arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot); + pivotIndices = partitionSinglePivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -3730,22 +3668,23 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots specified. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the two pivots to be used. - * After partitioning, this array the indices of the pivots is updated as well. 
+ * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ - private static void partitionDualPivot(double[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionDualPivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) { int end = high - 1; int lower = low; int upper = end; - int e1 = pivotIndices[0]; - int e5 = pivotIndices[1]; + int e1 = indexPivot1; + int e5 = indexPivot2; double pivot1 = a[e1]; double pivot2 = a[e5]; @@ -3812,27 +3751,26 @@ private static void partitionDualPivot(double[] a, int low, int high, int[] pivo a[low] = a[lower]; a[lower] = pivot1; a[end] = a[upper]; a[upper] = pivot2; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** - * Partitions the specified range of the array using a single pivot specified. + * Partitions the specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param pivotIndices an array containing the indices of the pivot to be used. - * After partitioning, this array the indices of the pivots is updated as well. + * @param indexPivot the index of the pivot */ - private static void partitionSinglePivot(double[] a, int low, int high, int[] pivotIndices) { + @ForceInline + private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot) { int end = high - 1; int lower = low; int upper = end; - int e3 = pivotIndices[0]; + int e3 = indexPivot; double pivot = a[e3]; /* @@ -3886,8 +3824,7 @@ private static void partitionSinglePivot(double[] a, int low, int high, int[] pi * Swap the pivot into its final position. 
*/ a[low] = a[lower]; a[lower] = pivot; - pivotIndices[0] = lower; - pivotIndices[1] = upper; + return new int[] {lower, upper}; } /** diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java index e89496bb2e532..ce5b2ff87e07d 100644 --- a/test/jdk/java/util/Arrays/Sorting.java +++ b/test/jdk/java/util/Arrays/Sorting.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -26,7 +26,8 @@ * @compile/module=java.base java/util/SortingHelper.java * @bug 6880672 6896573 6899694 6976036 7013585 7018258 8003981 8226297 * @build Sorting - * @run main Sorting -shortrun + * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySortI,_arraySortMI,_arrayPartitionSP,_arrayPartitionDP Sorting -shortrun + * @run main/othervm -XX:CompileThreshold=1 -XX:-TieredCompilation Sorting -shortrun * @summary Exercise Arrays.sort, Arrays.parallelSort * * @author Vladimir Yaroslavskiy @@ -46,7 +47,7 @@ public class Sorting { // Array lengths used in a long run (default) private static final int[] LONG_RUN_LENGTHS = { - 1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000 }; + 1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000}; // Array lengths used in a short run private static final int[] SHORT_RUN_LENGTHS = { From ed8b95c98379fb08e9aea71aca3f7261896304cd Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Tue, 12 Sep 2023 15:52:54 -0700 Subject: [PATCH 32/40] Refactor stub handling to use a generic function for all types --- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 38 ++------ .../gc/shenandoah/c2/shenandoahSupport.cpp | 2 +- src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 15 +--- src/hotspot/share/opto/library_call.cpp | 14 ++- src/hotspot/share/opto/runtime.cpp | 6 +- 
src/hotspot/share/runtime/stubRoutines.cpp | 39 +------- src/hotspot/share/runtime/stubRoutines.hpp | 19 ++-- .../native/libsimdsort/avxsort_linux_x86.cpp | 88 ++++++++++--------- 8 files changed, 78 insertions(+), 143 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 1ac5f566434e4..ff688ef1913c5 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4192,41 +4192,15 @@ void StubGenerator::generate_compiler_stubs() { if (libsimdsort != nullptr) { log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libsimdsort)); - snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_int"); - StubRoutines::_arraysort_int = (address)os::dll_lookup(libsimdsort, ebuf_); + snprintf(ebuf_, sizeof(ebuf_), "avx512_sort"); + StubRoutines::_arraysort = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_long"); - StubRoutines::_arraysort_long = (address)os::dll_lookup(libsimdsort, ebuf_); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single"); + StubRoutines::_array_partition_single = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_float"); - StubRoutines::_arraysort_float = (address)os::dll_lookup(libsimdsort, ebuf_); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual"); + StubRoutines::_array_partition_dual = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_double"); - StubRoutines::_arraysort_double = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_int"); - StubRoutines::_array_partition_single_int = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_int"); - StubRoutines::_array_partition_dual_int = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, 
sizeof(ebuf_), "avx512_partition_single_long"); - StubRoutines::_array_partition_single_long = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_long"); - StubRoutines::_array_partition_dual_long = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_float"); - StubRoutines::_array_partition_single_float = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_float"); - StubRoutines::_array_partition_dual_float = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_double"); - StubRoutines::_array_partition_single_double = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_double"); - StubRoutines::_array_partition_dual_double = (address)os::dll_lookup(libsimdsort, ebuf_); } } diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp index 0384ec1942b3d..8f1e041b5a9b4 100644 --- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp +++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp @@ -388,7 +388,7 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) { } args[6]; } calls[] = { "array_partition_stub", - { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore }, { -1, ShenandoahNone }, + { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+4, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone }, { -1, ShenandoahNone }, { -1, ShenandoahNone } }, "arraysort_stub", { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone }, diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index e74afd3a7759d..f687f879b863f 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ 
b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -327,18 +327,9 @@ static_field(StubRoutines, _checkcast_arraycopy_uninit, address) \ static_field(StubRoutines, _unsafe_arraycopy, address) \ static_field(StubRoutines, _generic_arraycopy, address) \ - static_field(StubRoutines, _arraysort_int, address) \ - static_field(StubRoutines, _arraysort_long, address) \ - static_field(StubRoutines, _arraysort_float, address) \ - static_field(StubRoutines, _arraysort_double, address) \ - static_field(StubRoutines, _array_partition_single_int, address) \ - static_field(StubRoutines, _array_partition_dual_int, address) \ - static_field(StubRoutines, _array_partition_single_long, address) \ - static_field(StubRoutines, _array_partition_dual_long, address) \ - static_field(StubRoutines, _array_partition_single_float, address) \ - static_field(StubRoutines, _array_partition_dual_float, address) \ - static_field(StubRoutines, _array_partition_single_double, address) \ - static_field(StubRoutines, _array_partition_dual_double, address) \ + static_field(StubRoutines, _arraysort, address) \ + static_field(StubRoutines, _array_partition_single, address) \ + static_field(StubRoutines, _array_partition_dual, address) \ \ static_field(StubRoutines, _aescrypt_encryptBlock, address) \ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 2ab21eb1b5355..34c2f003651da 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -5387,7 +5387,7 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); - stubAddr = StubRoutines::select_array_partition_function(bt, is_dual_pivot); + stubAddr = 
StubRoutines::select_array_partition_function(is_dual_pivot); // stub not loaded if (stubAddr == nullptr) { return false; @@ -5408,10 +5408,13 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { guarantee(alloc != nullptr, "created above"); Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT)); + // pass the basic type enum to the stub + Node* elemType = intcon(bt); + // Call the stub make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(is_dual_pivot), stubAddr, stubName, TypePtr::BOTTOM, - obj_adr, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2); + obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2); if (!stopped()) { set_result(pivotIndices); @@ -5437,7 +5440,7 @@ bool LibraryCallKit::inline_arraysort() { const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); - stubAddr = StubRoutines::select_arraysort_function(bt); + stubAddr = StubRoutines::select_arraysort_function(); //stub not loaded if (stubAddr == nullptr) { return false; @@ -5450,10 +5453,13 @@ bool LibraryCallKit::inline_arraysort() { } Node* obj_adr = make_unsafe_address(obj, offset); + // pass the basic type enum to the stub + Node* elemType = intcon(bt); + // Call the stub.
make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(), stubAddr, stubName, TypePtr::BOTTOM, - obj_adr, fromIndex, toIndex); + obj_adr, elemType, fromIndex, toIndex); return true; } diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index e6d8c956a5e63..7a57b755555f1 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -859,11 +859,12 @@ const TypeFunc* OptoRuntime::array_fill_Type() { const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) { // create input type (domain) - int num_args = is_dual_pivot ? 6 : 5; + int num_args = is_dual_pivot ? 7 : 6; int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; fields[argp++] = TypePtr::NOTNULL; // array + fields[argp++] = TypeInt::INT; // element type fields[argp++] = TypeInt::INT; // low fields[argp++] = TypeInt::INT; // end fields[argp++] = TypePtr::NOTNULL; // pivot_indices (int array) @@ -883,11 +884,12 @@ const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) { const TypeFunc* OptoRuntime::array_sort_Type() { // create input type (domain) - int num_args = 3; + int num_args = 4; int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; fields[argp++] = TypePtr::NOTNULL; // array + fields[argp++] = TypeInt::INT; // element type fields[argp++] = TypeInt::INT; // fromIndex fields[argp++] = TypeInt::INT; // toIndex assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index dce5740cecfda..ed26d119f49a6 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -176,18 +176,9 @@ address StubRoutines::_hf2f = nullptr; address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; address 
StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; -address StubRoutines::_arraysort_int = nullptr; -address StubRoutines::_arraysort_long = nullptr; -address StubRoutines::_arraysort_float = nullptr; -address StubRoutines::_arraysort_double = nullptr; -address StubRoutines::_array_partition_single_int = nullptr; -address StubRoutines::_array_partition_dual_int = nullptr; -address StubRoutines::_array_partition_single_long = nullptr; -address StubRoutines::_array_partition_dual_long = nullptr; -address StubRoutines::_array_partition_single_float = nullptr; -address StubRoutines::_array_partition_dual_float = nullptr; -address StubRoutines::_array_partition_single_double = nullptr; -address StubRoutines::_array_partition_dual_double = nullptr; +address StubRoutines::_arraysort = nullptr; +address StubRoutines::_array_partition_single = nullptr; +address StubRoutines::_array_partition_dual = nullptr; address StubRoutines::_cont_thaw = nullptr; address StubRoutines::_cont_returnBarrier = nullptr; @@ -544,27 +535,3 @@ UnsafeCopyMemoryMark::~UnsafeCopyMemoryMark() { } } } - -address StubRoutines::select_arraysort_function(BasicType t) { - switch(t) { - case T_INT: return _arraysort_int; - case T_LONG: return _arraysort_long; - case T_FLOAT: return _arraysort_float; - case T_DOUBLE: return _arraysort_double; - default: - ShouldNotReachHere(); - return nullptr; - } -} - -address StubRoutines::select_array_partition_function(BasicType t, bool is_dual_pivot) { - switch(t) { - case T_INT: return is_dual_pivot ? _array_partition_dual_int : _array_partition_single_int; - case T_LONG: return is_dual_pivot ? _array_partition_dual_long : _array_partition_single_long; - case T_FLOAT: return is_dual_pivot ? _array_partition_dual_float : _array_partition_single_float; - case T_DOUBLE: return is_dual_pivot ? 
_array_partition_dual_double : _array_partition_single_double; - default: - ShouldNotReachHere(); - return nullptr; - } -} diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index ee87450285c2e..4afd471e97de7 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -153,18 +153,9 @@ class StubRoutines: AllStatic { static BufferBlob* _compiler_stubs_code; // code buffer for C2 intrinsics static BufferBlob* _final_stubs_code; // code buffer for all other routines - static address _arraysort_int; - static address _arraysort_long; - static address _arraysort_float; - static address _arraysort_double; - static address _array_partition_single_int; - static address _array_partition_dual_int; - static address _array_partition_single_long; - static address _array_partition_dual_long; - static address _array_partition_single_float; - static address _array_partition_dual_float; - static address _array_partition_single_double; - static address _array_partition_dual_double; + static address _arraysort; + static address _array_partition_single; + static address _array_partition_dual; // Leaf routines which implement arraycopy and their addresses // arraycopy operands aligned on element type boundary static address _jbyte_arraycopy; @@ -387,8 +378,8 @@ class StubRoutines: AllStatic { static UnsafeArrayCopyStub UnsafeArrayCopy_stub() { return CAST_TO_FN_PTR(UnsafeArrayCopyStub, _unsafe_arraycopy); } static address generic_arraycopy() { return _generic_arraycopy; } - static address select_arraysort_function(BasicType t); - static address select_array_partition_function(BasicType t, bool is_dual_pivot); + static address select_arraysort_function() { return _arraysort; } + static address select_array_partition_function(bool is_dual_pivot) { return is_dual_pivot ? 
_array_partition_dual : _array_partition_single; } static address jbyte_fill() { return _jbyte_fill; } static address jshort_fill() { return _jshort_fill; } diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp index a18acda571ce1..555be741a4f2b 100644 --- a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp +++ b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp @@ -27,6 +27,7 @@ #pragma GCC target("avx512dq", "avx512f") #include "avx512-32bit-qsort.hpp" #include "avx512-64bit-qsort.hpp" +#include "classfile_constants.h" #define DLL_PUBLIC __attribute__((visibility("default"))) #define INSERTION_SORT_THRESHOLD_32BIT 16 @@ -34,52 +35,55 @@ extern "C" { - DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) { - avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + DLL_PUBLIC void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) { + switch(elem_type) { + case JVM_T_INT: + avx512_fastsort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + break; + case JVM_T_LONG: + avx512_fastsort((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + break; + case JVM_T_FLOAT: + avx512_fastsort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + break; + case JVM_T_DOUBLE: + avx512_fastsort((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + break; + } } - DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) { - avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + DLL_PUBLIC void avx512_partition_single(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot) { + switch(elem_type) { + case JVM_T_INT: + avx512_single_pivot_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot); + 
break; + case JVM_T_LONG: + avx512_single_pivot_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot); + break; + case JVM_T_FLOAT: + avx512_single_pivot_partition((float*)array, from_index, to_index, pivot_indices, index_pivot); + break; + case JVM_T_DOUBLE: + avx512_single_pivot_partition((double*)array, from_index, to_index, pivot_indices, index_pivot); + break; + } } - DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) { - avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); - } - - DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) { - avx512_fastsort(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); - } - - DLL_PUBLIC void avx512_partition_single_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { - avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); - } - - DLL_PUBLIC void avx512_partition_dual_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { - avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - } - - DLL_PUBLIC void avx512_partition_single_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { - avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); - } - - DLL_PUBLIC void avx512_partition_dual_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { - avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - } - - DLL_PUBLIC void avx512_partition_single_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { - avx512_single_pivot_partition(array, from_index, 
to_index, pivot_indices, index_pivot); - } - - DLL_PUBLIC void avx512_partition_dual_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { - avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - } - - DLL_PUBLIC void avx512_partition_single_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) { - avx512_single_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot); - } - - DLL_PUBLIC void avx512_partition_dual_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { - avx512_dual_pivot_partition(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + DLL_PUBLIC void avx512_partition_dual(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + switch(elem_type) { + case JVM_T_INT: + avx512_dual_pivot_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + case JVM_T_LONG: + avx512_dual_pivot_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + case JVM_T_FLOAT: + avx512_dual_pivot_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + case JVM_T_DOUBLE: + avx512_dual_pivot_partition((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + } } } From 172b2d3e91b3689cddc4bc92597d610d72645a17 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Wed, 13 Sep 2023 15:54:18 -0700 Subject: [PATCH 33/40] Refactor the sort and partition intrinsics to accept method references for fallback functions --- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 11 +- src/hotspot/share/classfile/vmIntrinsics.hpp | 19 +- 
src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 5 +- src/hotspot/share/opto/c2compiler.cpp | 6 +- src/hotspot/share/opto/library_call.cpp | 19 +- src/hotspot/share/opto/library_call.hpp | 4 +- src/hotspot/share/opto/runtime.cpp | 8 +- src/hotspot/share/opto/runtime.hpp | 2 +- src/hotspot/share/runtime/stubRoutines.cpp | 5 +- src/hotspot/share/runtime/stubRoutines.hpp | 9 +- .../native/libsimdsort/avx512-common-qsort.h | 12 +- .../native/libsimdsort/avxsort_linux_x86.cpp | 35 +--- .../classes/java/util/DualPivotQuicksort.java | 196 +++++++++--------- test/jdk/java/util/Arrays/Sorting.java | 2 +- 14 files changed, 145 insertions(+), 188 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index ff688ef1913c5..bcbb8764982cb 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4190,16 +4190,13 @@ void StubGenerator::generate_compiler_stubs() { } // Get addresses for avx512 sort and partition routines if (libsimdsort != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libsimdsort)); + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort)); snprintf(ebuf_, sizeof(ebuf_), "avx512_sort"); - StubRoutines::_arraysort = (address)os::dll_lookup(libsimdsort, ebuf_); + StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_); - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single"); - StubRoutines::_array_partition_single = (address)os::dll_lookup(libsimdsort, ebuf_); - - snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual"); - StubRoutines::_array_partition_dual = (address)os::dll_lookup(libsimdsort, ebuf_); + snprintf(ebuf_, sizeof(ebuf_), "avx512_partition"); + StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_); } } diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp 
b/src/hotspot/share/classfile/vmIntrinsics.hpp index bba728b694f8e..fcd7e6a89b7a0 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -341,20 +341,13 @@ class methodHandle; do_name( copyOf_name, "copyOf") \ do_signature(copyOf_signature, "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;") \ \ - do_intrinsic(_arraySortMI, java_util_DualPivotQuicksort, arraySortMI_name, arraySortMI_signature, F_S) \ - do_name( arraySortMI_name, "mixedInsertionSort") \ - do_signature(arraySortMI_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)V") \ - do_intrinsic(_arraySortI, java_util_DualPivotQuicksort, arraySortI_name, arraySortI_signature, F_S) \ - do_name( arraySortI_name, "insertionSort") \ - do_signature(arraySortI_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII)V") \ - \ - do_intrinsic(_arrayPartitionSP, java_util_DualPivotQuicksort, arrayPartitionSP_name, arrayPartitionSP_signature, F_S) \ - do_name( arrayPartitionSP_name, "partitionSinglePivot") \ - do_signature(arrayPartitionSP_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIII)[I") \ - do_intrinsic(_arrayPartitionDP, java_util_DualPivotQuicksort, arrayPartitionDP_name, arrayPartitionDP_signature, F_S) \ - do_name( arrayPartitionDP_name, "partitionDualPivot") \ - do_signature(arrayPartitionDP_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIII)[I") \ + do_intrinsic(_arraySort, java_util_DualPivotQuicksort, arraySort_name, arraySort_signature, F_S) \ + do_name( arraySort_name, "arraySort") \ + do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/SortOperation;)V") \ \ + do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ + do_name( arrayPartition_name, "arrayPartition") \ + do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/PartitionOperation;)[I") \ \ do_intrinsic(_copyOfRange, java_util_Arrays, 
copyOfRange_name, copyOfRange_signature, F_S) \ do_name( copyOfRange_name, "copyOfRange") \ diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index f687f879b863f..794895ec8fbdb 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -327,9 +327,8 @@ static_field(StubRoutines, _checkcast_arraycopy_uninit, address) \ static_field(StubRoutines, _unsafe_arraycopy, address) \ static_field(StubRoutines, _generic_arraycopy, address) \ - static_field(StubRoutines, _arraysort, address) \ - static_field(StubRoutines, _array_partition_single, address) \ - static_field(StubRoutines, _array_partition_dual, address) \ + static_field(StubRoutines, _array_sort, address) \ + static_field(StubRoutines, _array_partition, address) \ \ static_field(StubRoutines, _aescrypt_encryptBlock, address) \ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index 39f56c002e41e..5efac02178865 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -597,10 +597,8 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) { case vmIntrinsics::_min_strict: case vmIntrinsics::_max_strict: case vmIntrinsics::_arraycopy: - case vmIntrinsics::_arraySortMI: - case vmIntrinsics::_arraySortI: - case vmIntrinsics::_arrayPartitionSP: - case vmIntrinsics::_arrayPartitionDP: + case vmIntrinsics::_arraySort: + case vmIntrinsics::_arrayPartition: case vmIntrinsics::_indexOfL: case vmIntrinsics::_indexOfU: case vmIntrinsics::_indexOfUL: diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 34c2f003651da..9d119a19ea313 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -293,11 +293,8 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_arraycopy: 
return inline_arraycopy(); - case vmIntrinsics::_arraySortMI: - case vmIntrinsics::_arraySortI: return inline_arraysort(); - - case vmIntrinsics::_arrayPartitionSP: return inline_array_partition(false /* single pivot*/); - case vmIntrinsics::_arrayPartitionDP: return inline_array_partition(true /* dual pivot*/); + case vmIntrinsics::_arraySort: return inline_array_sort(); + case vmIntrinsics::_arrayPartition: return inline_array_partition(); case vmIntrinsics::_compareToL: return inline_string_compareTo(StrIntrinsicNode::LL); case vmIntrinsics::_compareToU: return inline_string_compareTo(StrIntrinsicNode::UU); @@ -5370,7 +5367,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_ } //------------------------------inline_array_partition----------------------- -bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { +bool LibraryCallKit::inline_array_partition() { address stubAddr = nullptr; const char *stubName; @@ -5382,12 +5379,12 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { Node* fromIndex = argument(4); Node* toIndex = argument(5); Node* indexPivot1 = argument(6); - Node* indexPivot2 = is_dual_pivot? 
argument(7) : nullptr; + Node* indexPivot2 = argument(7); const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); - stubAddr = StubRoutines::select_array_partition_function(is_dual_pivot); + stubAddr = StubRoutines::select_array_partition_function(); // stub not loaded if (stubAddr == nullptr) { return false; @@ -5412,7 +5409,7 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { Node* elemType = intcon(bt); // Call the stub - make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(is_dual_pivot), + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(), stubAddr, stubName, TypePtr::BOTTOM, obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2); @@ -5424,8 +5421,8 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) { } -//------------------------------inline_arraysort----------------------- -bool LibraryCallKit::inline_arraysort() { +//------------------------------inline_array_sort----------------------- +bool LibraryCallKit::inline_array_sort() { address stubAddr = nullptr; const char *stubName; diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index d33c1c8ee0538..55d1dc78f1fd5 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -277,8 +277,8 @@ class LibraryCallKit : public GraphKit { JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp); void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp, uint new_idx); - bool inline_arraysort(); - bool inline_array_partition(bool is_dual_pivot); + bool inline_array_sort(); + bool inline_array_partition(); typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } 
LoadStoreKind; bool inline_unsafe_load_store(BasicType type, LoadStoreKind kind, AccessKind access_kind); bool inline_unsafe_fence(vmIntrinsics::ID id); diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 7a57b755555f1..473062abfca09 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -857,9 +857,9 @@ const TypeFunc* OptoRuntime::array_fill_Type() { return TypeFunc::make(domain, range); } -const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) { +const TypeFunc* OptoRuntime::array_partition_Type() { // create input type (domain) - int num_args = is_dual_pivot ? 7 : 6; + int num_args = 7; int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; @@ -869,9 +869,7 @@ const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) { fields[argp++] = TypeInt::INT; // end fields[argp++] = TypePtr::NOTNULL; // pivot_indices (int array) fields[argp++] = TypeInt::INT; // indexPivot1 - if (is_dual_pivot) { - fields[argp++] = TypeInt::INT; // indexPivot2 - } + fields[argp++] = TypeInt::INT; // indexPivot2 assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp index 4017f70d36296..b85542423e848 100644 --- a/src/hotspot/share/opto/runtime.hpp +++ b/src/hotspot/share/opto/runtime.hpp @@ -269,7 +269,7 @@ class OptoRuntime : public AllStatic { static const TypeFunc* array_fill_Type(); static const TypeFunc* array_sort_Type(); - static const TypeFunc* array_partition_Type(bool is_dual_pivot); + static const TypeFunc* array_partition_Type(); static const TypeFunc* aescrypt_block_Type(); static const TypeFunc* cipherBlockChaining_aescrypt_Type(); static const TypeFunc* electronicCodeBook_aescrypt_Type(); diff --git a/src/hotspot/share/runtime/stubRoutines.cpp 
b/src/hotspot/share/runtime/stubRoutines.cpp index ed26d119f49a6..bea2a934bc603 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -176,9 +176,8 @@ address StubRoutines::_hf2f = nullptr; address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; address StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}}; -address StubRoutines::_arraysort = nullptr; -address StubRoutines::_array_partition_single = nullptr; -address StubRoutines::_array_partition_dual = nullptr; +address StubRoutines::_array_sort = nullptr; +address StubRoutines::_array_partition = nullptr; address StubRoutines::_cont_thaw = nullptr; address StubRoutines::_cont_returnBarrier = nullptr; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 4afd471e97de7..eb29238b46308 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -153,9 +153,8 @@ class StubRoutines: AllStatic { static BufferBlob* _compiler_stubs_code; // code buffer for C2 intrinsics static BufferBlob* _final_stubs_code; // code buffer for all other routines - static address _arraysort; - static address _array_partition_single; - static address _array_partition_dual; + static address _array_sort; + static address _array_partition; // Leaf routines which implement arraycopy and their addresses // arraycopy operands aligned on element type boundary static address _jbyte_arraycopy; @@ -378,8 +377,8 @@ class StubRoutines: AllStatic { static UnsafeArrayCopyStub UnsafeArrayCopy_stub() { return CAST_TO_FN_PTR(UnsafeArrayCopyStub, _unsafe_arraycopy); } static address generic_arraycopy() { return _generic_arraycopy; } - static address select_arraysort_function() { return _arraysort; } - static address select_array_partition_function(bool is_dual_pivot) { return 
is_dual_pivot ? _array_partition_dual : _array_partition_single; } + static address select_arraysort_function() { return _array_sort; } + static address select_array_partition_function() { return _array_partition; } static address jbyte_fill() { return _jbyte_fill; } static address jshort_fill() { return _jshort_fill; } diff --git a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h index 8f255a38e47d6..16aeb0d50a30f 100644 --- a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h +++ b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h @@ -436,9 +436,13 @@ void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, } template -void inline avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) { - if(is_dual_pviot) avx512_dual_pivot_partition(arr, from_index, to_index, pivot_indices); - else avx512_single_pivot_partition(arr, from_index, to_index, pivot_indices); +void inline avx512_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) { + if (index_pivot1 != index_pivot2) { + avx512_dual_pivot_partition(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + else { + avx512_single_pivot_partition(arr, from_index, to_index, pivot_indices, index_pivot1); + } } template @@ -456,7 +460,7 @@ void inline insertion_sort(T *arr, int32_t from_index, int32_t to_index) { } template -void inline avx512_fastsort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) { +void inline avx512_fast_sort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) { int32_t size = to_index - from_index; if (size <= INS_SORT_THRESHOLD) { diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp index 
555be741a4f2b..a4ac2a8e4955f 100644 --- a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp +++ b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp @@ -38,50 +38,33 @@ extern "C" { DLL_PUBLIC void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) { switch(elem_type) { case JVM_T_INT: - avx512_fastsort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + avx512_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); break; case JVM_T_LONG: - avx512_fastsort((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + avx512_fast_sort((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); break; case JVM_T_FLOAT: - avx512_fastsort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + avx512_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); break; case JVM_T_DOUBLE: - avx512_fastsort((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); + avx512_fast_sort((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT); break; } } - DLL_PUBLIC void avx512_partition_single(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot) { + DLL_PUBLIC void avx512_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { switch(elem_type) { case JVM_T_INT: - avx512_single_pivot_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot); + avx512_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); break; case JVM_T_LONG: - avx512_single_pivot_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot); + avx512_fast_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); break; case JVM_T_FLOAT: - 
avx512_single_pivot_partition((float*)array, from_index, to_index, pivot_indices, index_pivot); + avx512_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); break; case JVM_T_DOUBLE: - avx512_single_pivot_partition((double*)array, from_index, to_index, pivot_indices, index_pivot); - break; - } - } - - DLL_PUBLIC void avx512_partition_dual(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { - switch(elem_type) { - case JVM_T_INT: - avx512_dual_pivot_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - break; - case JVM_T_LONG: - avx512_dual_pivot_partition((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - break; - case JVM_T_FLOAT: - avx512_dual_pivot_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - break; - case JVM_T_DOUBLE: - avx512_dual_pivot_partition((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + avx512_fast_partition((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); break; } } diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index f2cf2deab6638..d6fe8ddcffcf4 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -53,6 +53,42 @@ * * @since 1.7 * 14 */ + +/** + * Represents a function that accepts an array and sorts a specified range + * of the array into ascending order. + */ +@FunctionalInterface +interface SortOperation { + /** + * Sorts the specified range of the array. 
+ * + * @param a the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + */ + void sort(A a, int low, int high); +} + +/** + * Represents a function that accepts an array and partitions a specified range + * of the array based on the pivots provided. + */ +@FunctionalInterface +interface PartitionOperation { + /** + * Partitions the specified range of the array. + * + * @param a the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot + */ + int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2); +} + + final class DualPivotQuicksort { /** @@ -137,41 +173,11 @@ private DualPivotQuicksort() {} * address pointing to the first element to sort from. * @param low the index of the first element, inclusive, to be sorted * @param high the index of the last element, exclusive, to be sorted - * @param end the index of the last element for simple insertion sort - */ - @IntrinsicCandidate - private static void mixedInsertionSort(Class elemType, Object array, long offset, int low, int high, int end) { - switch (array) { - case int[] arr -> mixedInsertionSort(arr, low, end, high); - case long[] arr -> mixedInsertionSort(arr, low, end, high); - case float[] arr -> mixedInsertionSort(arr, low, end, high); - case double[] arr -> mixedInsertionSort(arr, low, end, high); - default -> throw new UnsupportedOperationException(); - } - } - - /** - * Sorts the specified array into ascending numerical order using - * insertion sort.The intrinsic is free to choose its own - * sorting algorithm. 
- * - * @param elemType the class of the elements of the array to be sorted - * @param array the array to be sorted - * @param offset the relative offset, in bytes, from the base address of - * the array to sort, otherwise if the array is {@code null},an absolute - * address pointing to the first element to sort from. - * @param low the index of the first element, inclusive, to be sorted - * @param high the index of the last element, exclusive, to be sorted + * @param so the method reference for the fallback implementation */ @IntrinsicCandidate - private static void insertionSort(Class elemType, Object array, long offset, int low, int high) { - switch (array) { - case int[] arr -> insertionSort(arr, low, high); - case long[] arr -> insertionSort(arr, low, high); - case float[] arr -> insertionSort(arr, low, high); - case double[] arr -> insertionSort(arr, low, high); - default -> throw new UnsupportedOperationException(); - } + private static void arraySort(Class elemType, A array, long offset, int low, int high, SortOperation so) { + so.sort(array, low, high); } /** @@ -186,42 +192,14 @@ private static void insertionSort(Class elemType, Object array, long offset, * @param high the index of the last element, exclusive, to be sorted * @param indexPivot1 the index of pivot1, the first pivot * @param indexPivot2 the index of pivot2, the second pivot + * @param po the method reference for the fallback implementation */ @IntrinsicCandidate @ForceInline - private static int[] partitionDualPivot(Class elemType, Object array, long offset, int low, int high, int indexPivot1, int indexPivot2) { - return switch(array) { - case int[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); - case long[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); - case float[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); - case double[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2); - default -> throw new 
UnsupportedOperationException(); - }; + private static int[] arrayPartition(Class elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation po) { + return po.partition(array, low, high, indexPivot1, indexPivot2); } - /** - * Partitions the specified array based on the single pivot provided. - * - * @param elemType the class of the array to be sorted - * @param array the array to be sorted - * @param offset the relative offset, in bytes, from the base address of - * the array to partition, otherwise if the array is {@code null},an absolute - * address pointing to the first element to partition from. - * @param low the index of the first element, inclusive, to be sorted - * @param high the index of the last element, exclusive, to be sorted - * @param indexPivot the index of the pivot - */ - @IntrinsicCandidate - @ForceInline - private static int[] partitionSinglePivot(Class elemType, Object array, long offset, int low, int high, int indexPivot) { - return switch(array) { - case int[] arr -> partitionSinglePivot(arr, low, high, indexPivot); - case long[] arr -> partitionSinglePivot(arr, low, high, indexPivot); - case float[] arr -> partitionSinglePivot(arr, low, high, indexPivot); - case double[] arr -> partitionSinglePivot(arr, low, high, indexPivot); - default -> throw new UnsupportedOperationException(); - }; - } /** * Calculates the double depth of parallel merging. @@ -286,7 +264,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. 
*/ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - mixedInsertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); + arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -294,7 +272,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high); + arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -380,7 +358,7 @@ && tryMergeRuns(sorter, a, low, size)) { * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - pivotIndices = partitionDualPivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5); + pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -404,7 +382,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. 
*/ - pivotIndices = partitionSinglePivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3); + pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -517,15 +495,18 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot the index of the pivot + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot) { + private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) { + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); + int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot; + int e3 = indexPivot1; int pivot = a[e3]; /* @@ -596,10 +577,11 @@ private static int[] partitionSinglePivot(int[] a, int low, int high, int indexP * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted - * @param end the index of the last element for simple insertion sort * @param high the index of the last element, exclusive, to be sorted */ - private static void mixedInsertionSort(int[] a, int low, int end, int high) { + private static void mixedInsertionSort(int[] a, int low, int high) { + int size = high - low; + int end = high - 3 * ((size >> 5) << 3); if (end == high) { /* @@ -1089,8 +1071,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. 
*/ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - int last = high - 3 * ((size >> 5) << 3); - mixedInsertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); + arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -1098,7 +1079,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high); + arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -1185,7 +1166,7 @@ && tryMergeRuns(sorter, a, low, size)) { * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - pivotIndices = partitionDualPivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5); + pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1206,7 +1187,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. 
*/ - pivotIndices = partitionSinglePivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3); + pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1318,16 +1299,19 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot the index of the pivot + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot) { + private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) { + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); + int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot; + int e3 = indexPivot1; long pivot = a[e3]; /* @@ -1398,10 +1382,11 @@ private static int[] partitionSinglePivot(long[] a, int low, int high, int index * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted - * @param end the index of the last element for simple insertion sort * @param high the index of the last element, exclusive, to be sorted */ - private static void mixedInsertionSort(long[] a, int low, int end, int high) { + private static void mixedInsertionSort(long[] a, int low, int high) { + int size = high - low; + int end = high - 3 * ((size >> 5) << 3); if (end == high) { /* @@ -2678,8 +2663,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. 
*/ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - int last = high - 3 * ((size >> 5) << 3); - mixedInsertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); + arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -2687,7 +2671,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high); + arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -2774,7 +2758,7 @@ && tryMergeRuns(sorter, a, low, size)) { * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - pivotIndices = partitionDualPivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5); + pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2795,7 +2779,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. 
*/ - pivotIndices = partitionSinglePivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3); + pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2907,16 +2891,18 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot the index of the pivot + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot) { + private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) { + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot; + int e3 = indexPivot1; float pivot = a[e3]; /* @@ -2987,10 +2973,11 @@ private static int[] partitionSinglePivot(float[] a, int low, int high, int inde * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted - * @param end the index of the last element for simple insertion sort * @param high the index of the last element, exclusive, to be sorted */ - private static void mixedInsertionSort(float[] a, int low, int end, int high) { + private static void mixedInsertionSort(float[] a, int low, int high) { + int size = high - low; + int end = high - 3 * ((size >> 5) << 3); if (end == high) { /* @@ -3531,8 +3518,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. 
*/ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - int last = high - 3 * ((size >> 5) << 3); - mixedInsertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3)); + arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -3540,7 +3526,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - insertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high); + arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -3627,7 +3613,7 @@ && tryMergeRuns(sorter, a, low, size)) { * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - pivotIndices = partitionDualPivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5); + pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3648,7 +3634,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. 
*/ - pivotIndices = partitionSinglePivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3); + pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -3762,15 +3748,18 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot the index of the pivot + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot */ @ForceInline - private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot) { + private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) { + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); + int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot; + int e3 = indexPivot1; double pivot = a[e3]; /* @@ -3841,10 +3830,11 @@ private static int[] partitionSinglePivot(double[] a, int low, int high, int ind * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted - * @param end the index of the last element for simple insertion sort * @param high the index of the last element, exclusive, to be sorted */ - private static void mixedInsertionSort(double[] a, int low, int end, int high) { + private static void mixedInsertionSort(double[] a, int low, int high) { + int size = high - low; + int end = high - 3 * ((size >> 5) << 3); if (end == high) { /* diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java index ce5b2ff87e07d..d368885abe082 100644 --- a/test/jdk/java/util/Arrays/Sorting.java 
+++ b/test/jdk/java/util/Arrays/Sorting.java @@ -26,7 +26,7 @@ * @compile/module=java.base java/util/SortingHelper.java * @bug 6880672 6896573 6899694 6976036 7013585 7018258 8003981 8226297 * @build Sorting - * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySortI,_arraySortMI,_arrayPartitionSP,_arrayPartitionDP Sorting -shortrun + * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySort,_arrayPartition, Sorting -shortrun * @run main/othervm -XX:CompileThreshold=1 -XX:-TieredCompilation Sorting -shortrun * @summary Exercise Arrays.sort, Arrays.parallelSort * From e63a2aa081275c3f1ed2ccc4315a60f304d18b34 Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 15 Sep 2023 15:09:03 -0700 Subject: [PATCH 34/40] Move functional interfaces close to the associated methods --- src/hotspot/share/classfile/vmIntrinsics.hpp | 6 +- .../classes/java/util/DualPivotQuicksort.java | 78 +++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index fcd7e6a89b7a0..d6c22e6eaed6e 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -343,11 +343,11 @@ class methodHandle; \ do_intrinsic(_arraySort, java_util_DualPivotQuicksort, arraySort_name, arraySort_signature, F_S) \ do_name( arraySort_name, "arraySort") \ - do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/SortOperation;)V") \ + do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/DualPivotQuicksort$SortOperation;)V") \ \ - do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ + do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ do_name( arrayPartition_name, "arrayPartition") \ - do_signature(arrayPartition_signature, 
"(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/PartitionOperation;)[I") \ + do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/DualPivotQuicksort$PartitionOperation;)[I") \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ do_name( copyOfRange_name, "copyOfRange") \ diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index d6fe8ddcffcf4..85a750c25066e 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -54,40 +54,6 @@ * @since 1.7 * 14 */ -/** - * Represents a function that accepts an array and sorts a specified range - * of the array into ascending order. - */ -@FunctionalInterface -interface SortOperation { - /** - * Sorts the specified range of the array. - * - * @param a the array to be sorted - * @param low the index of the first element, inclusive, to be sorted - * @param high the index of the last element, exclusive, to be sorted - */ - void sort(A a, int low, int high); -} - -/** - * Represents a function that accepts an array and partitions a specified range - * of the array based on the pivots provided. - */ -@FunctionalInterface -interface PartitionOperation { - /** - * Partitions the specified range of the array. 
- * - * @param a the array to be sorted - * @param low the index of the first element, inclusive, to be sorted - * @param high the index of the last element, exclusive, to be sorted - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot - */ - int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2); -} - final class DualPivotQuicksort { @@ -161,6 +127,22 @@ private DualPivotQuicksort() {} */ private static final int MAX_RECURSION_DEPTH = 64 * DELTA; + /** + * Represents a function that accepts an array and sorts a specified range + * of the array into ascending order. + */ + @FunctionalInterface + private static interface SortOperation { + /** + * Sorts the specified range of the array. + * + * @param a the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + */ + void sort(A a, int low, int high); + } + /** * Sorts the specified array into ascending numerical order using * mixed insertion sort.The intrinsic is free to choose its own @@ -176,10 +158,29 @@ private DualPivotQuicksort() {} * @param so the method reference for the fallback implementation */ @IntrinsicCandidate + @ForceInline private static void arraySort(Class elemType, A array, long offset, int low, int high, SortOperation so) { so.sort(array, low, high); } + /** + * Represents a function that accepts an array and partitions a specified range + * of the array based on the pivots provided. + */ + @FunctionalInterface + interface PartitionOperation { + /** + * Partitions the specified range of the array. 
+ * + * @param a the array to be sorted + * @param low the index of the first element, inclusive, to be sorted + * @param high the index of the last element, exclusive, to be sorted + * @param indexPivot1 the index of pivot1, the first pivot + * @param indexPivot2 the index of pivot2, the second pivot + */ + int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2); + } + /** * Partitions the specified array based on the two pivots provided. * @@ -200,7 +201,6 @@ private static int[] arrayPartition(Class elemType, A array, long offset, return po.partition(array, low, high, indexPivot1, indexPivot2); } - /** * Calculates the double depth of parallel merging. * Depth is negative, if tasks split before sorting. @@ -501,7 +501,7 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv */ @ForceInline private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; @@ -1305,7 +1305,7 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi */ @ForceInline private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; @@ -2897,7 +2897,7 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP */ @ForceInline private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the 
pivot indices must be same"); + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; int upper = end; @@ -3753,7 +3753,7 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index */ @ForceInline private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same"); + if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; From 7fc1afac4ca287908f2ddaeb2bd554044791452e Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Mon, 18 Sep 2023 11:44:59 -0700 Subject: [PATCH 35/40] Remove the unnecessary exception in single pivot partitioning fallback method --- src/hotspot/share/opto/library_call.cpp | 4 +- .../classes/java/util/DualPivotQuicksort.java | 38 ++++++++----------- test/jdk/java/util/Arrays/Sorting.java | 2 +- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 9d119a19ea313..eb8c76dfc56aa 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -5405,7 +5405,7 @@ bool LibraryCallKit::inline_array_partition() { guarantee(alloc != nullptr, "created above"); Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT)); - // pass the bastic type enum to the stub + // pass the basic type enum to the stub Node* elemType = intcon(bt); // Call the stub @@ -5450,7 +5450,7 @@ bool LibraryCallKit::inline_array_sort() { } Node* obj_adr = make_unsafe_address(obj, offset); - // pass the bastic type enum to the stub + // pass the basic type enum to the stub Node* elemType = intcon(bt); // Call the stub. 
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index 85a750c25066e..c0f95f390cfde 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -128,13 +128,13 @@ private DualPivotQuicksort() {} private static final int MAX_RECURSION_DEPTH = 64 * DELTA; /** - * Represents a function that accepts an array and sorts a specified range - * of the array into ascending order. + * Represents a function that accepts an array and sorts the specified range + * of an array into ascending order. */ @FunctionalInterface private static interface SortOperation { /** - * Sorts the specified range of the array. + * Sorts the specified range of an array. * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted @@ -144,9 +144,7 @@ private static interface SortOperation { } /** - * Sorts the specified array into ascending numerical order using - * mixed insertion sort.The intrinsic is free to choose its own - * sorting algorithm. + * Sorts the specified range of an array into ascending numerical order. * * @param elemType the class of the elements of the array to be sorted * @param array the array to be sorted @@ -164,13 +162,13 @@ private static void arraySort(Class elemType, A array, long offset, int l } /** - * Represents a function that accepts an array and partitions a specified range - * of the array based on the pivots provided. + * Represents a function that accepts an array and partitions the specified range + * of an array using the pivots provided. */ @FunctionalInterface interface PartitionOperation { /** - * Partitions the specified range of the array. + * Partitions the specified range of an array using the given pivots. 
* * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted @@ -182,7 +180,7 @@ interface PartitionOperation { } /** - * Partitions the specified array based on the two pivots provided. + * Partitions the specified range of an array using the two pivots provided. * * @param elemType the class of the array to be sorted * @param array the array to be sorted @@ -401,7 +399,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots provided. + * Partitions the specified range of an array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -490,7 +488,7 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv /** - * Partitions the specified range of the array using a single pivot provided. + * Partitions the specified range of an array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -501,7 +499,6 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv */ @ForceInline private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; @@ -1206,7 +1203,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots provided. + * Partitions the specified range of an array using the two pivots provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -1294,7 +1291,7 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi /** - * Partitions the specified range of the array using a single pivot provided. + * Partitions the specified range of an array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -1305,7 +1302,6 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi */ @ForceInline private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; @@ -2798,7 +2794,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots provided. + * Partitions the specified range of an array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -2886,7 +2882,7 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP /** - * Partitions the specified range of the array using a single pivot provided. + * Partitions the specified range of an array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -2897,7 +2893,6 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP */ @ForceInline private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; int upper = end; @@ -3654,7 +3649,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of the array using the two pivots provided. + * Partitions the specified range of an array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -3743,7 +3738,7 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index /** - * Partitions the specified range of the array using a single pivot provided. + * Partitions the specified range of an array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -3753,7 +3748,6 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index */ @ForceInline private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) { - if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same"); int end = high - 1; int lower = low; diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java index d368885abe082..113c8a688620c 100644 --- a/test/jdk/java/util/Arrays/Sorting.java +++ b/test/jdk/java/util/Arrays/Sorting.java @@ -47,7 +47,7 @@ public class Sorting { // Array lengths used in a long run (default) private static final int[] LONG_RUN_LENGTHS = { - 1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000}; + 1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000 }; // Array lengths used in a short run private static final int[] SHORT_RUN_LENGTHS = { From bf41d2ab73e55bdcaaae35bceab456d07136748f Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Mon, 18 Sep 2023 18:48:30 -0700 Subject: [PATCH 36/40] Rename arraySort and arrayPartition Java methods to sort and partition. 
Cleanup some comments --- src/hotspot/share/classfile/vmIntrinsics.hpp | 4 +- .../classes/java/util/DualPivotQuicksort.java | 79 +++++++++---------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index d6c22e6eaed6e..66b8a43640728 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -342,11 +342,11 @@ class methodHandle; do_signature(copyOf_signature, "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;") \ \ do_intrinsic(_arraySort, java_util_DualPivotQuicksort, arraySort_name, arraySort_signature, F_S) \ - do_name( arraySort_name, "arraySort") \ + do_name( arraySort_name, "sort") \ do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/DualPivotQuicksort$SortOperation;)V") \ \ do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \ - do_name( arrayPartition_name, "arrayPartition") \ + do_name( arrayPartition_name, "partition") \ do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/DualPivotQuicksort$PartitionOperation;)[I") \ \ do_intrinsic(_copyOfRange, java_util_Arrays, copyOfRange_name, copyOfRange_signature, F_S) \ diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index c0f95f390cfde..3fa87815596db 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -27,7 +27,6 @@ import java.util.concurrent.CountedCompleter; import java.util.concurrent.RecursiveTask; -import java.util.Arrays; import jdk.internal.misc.Unsafe; import jdk.internal.vm.annotation.IntrinsicCandidate; import jdk.internal.vm.annotation.ForceInline; @@ -128,13 +127,13 @@ private DualPivotQuicksort() {} private static final 
int MAX_RECURSION_DEPTH = 64 * DELTA; /** - * Represents a function that accepts an array and sorts the specified range - * of an array into ascending order. + * Represents a function that accepts the array and sorts the specified range + * of the array into ascending order. */ @FunctionalInterface private static interface SortOperation { /** - * Sorts the specified range of an array. + * Sorts the specified range of the array. * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted @@ -144,7 +143,7 @@ private static interface SortOperation { } /** - * Sorts the specified range of an array into ascending numerical order. + * Sorts the specified range of the array into ascending numerical order. * * @param elemType the class of the elements of the array to be sorted * @param array the array to be sorted @@ -157,18 +156,18 @@ private static interface SortOperation { */ @IntrinsicCandidate @ForceInline - private static void arraySort(Class elemType, A array, long offset, int low, int high, SortOperation so) { + private static void sort(Class elemType, A array, long offset, int low, int high, SortOperation so) { so.sort(array, low, high); } /** - * Represents a function that accepts an array and partitions the specified range - * of an array using the pivots provided. + * Represents a function that accepts the array and partitions the specified range + * of the array using the pivots provided. */ @FunctionalInterface interface PartitionOperation { /** - * Partitions the specified range of an array using the given pivots. + * Partitions the specified range of the array using the given pivots. * * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted @@ -180,7 +179,7 @@ interface PartitionOperation { } /** - * Partitions the specified range of an array using the two pivots provided. + * Partitions the specified range of the array using the two pivots provided. 
* * @param elemType the class of the array to be sorted * @param array the array to be sorted @@ -195,7 +194,7 @@ interface PartitionOperation { */ @IntrinsicCandidate @ForceInline - private static int[] arrayPartition(Class elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation po) { + private static int[] partition(Class elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation po) { return po.partition(array, low, high, indexPivot1, indexPivot2); } @@ -255,14 +254,13 @@ static void sort(int[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { - int[] pivotIndices; while (true) { int end = high - 1, size = high - low; /* * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); + sort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -270,7 +268,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); + sort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -356,7 +354,7 @@ && tryMergeRuns(sorter, a, low, size)) { * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); + int[] pivotIndices = partition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -380,7 +378,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); + int[] pivotIndices = partition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -399,7 +397,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of an array using the two pivots provided. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -488,7 +486,7 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv /** - * Partitions the specified range of an array using a single pivot provided. + * Partitions the specified range of the array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -1060,7 +1058,6 @@ static void sort(long[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { - int[] pivotIndices; while (true) { int end = high - 1, size = high - low; @@ -1068,7 +1065,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); + sort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -1076,7 +1073,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); + sort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -1156,14 +1153,14 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { + if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); + int[] pivotIndices = partition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1184,7 +1181,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); + int[] pivotIndices = partition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -1203,7 +1200,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of an array using the two pivots provided. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -1291,7 +1288,7 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi /** - * Partitions the specified range of an array using a single pivot provided. + * Partitions the specified range of the array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -2651,7 +2648,6 @@ static void sort(float[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { - int[] pivotIndices; while (true) { int end = high - 1, size = high - low; @@ -2659,7 +2655,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Run mixed insertion sort on small non-leftmost parts. */ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); + sort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -2667,7 +2663,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); + sort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -2747,14 +2743,14 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { + if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. 
*/ - pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); + int[] pivotIndices = partition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2775,7 +2771,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. */ - pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); + int[] pivotIndices = partition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -2794,7 +2790,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of an array using the two pivots provided. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -2882,7 +2878,7 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP /** - * Partitions the specified range of an array using a single pivot provided. + * Partitions the specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -3506,14 +3502,13 @@ static void sort(double[] a, int parallelism, int low, int high) { * @param high the index of the last element, exclusive, to be sorted */ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { - int[] pivotIndices; while (true) { int end = high - 1, size = high - low; /* * Run mixed insertion sort on small non-leftmost parts. 
*/ if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) { - arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); + sort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort); return; } @@ -3521,7 +3516,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) { * Invoke insertion sort on small leftmost part. */ if (size < MAX_INSERTION_SORT_SIZE) { - arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); + sort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort); return; } @@ -3601,14 +3596,14 @@ && tryMergeRuns(sorter, a, low, size)) { /* * Partitioning with 2 pivots in case of different elements. */ - if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { + if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) { /* * Use the first and fifth of the five sorted elements as * the pivots. These values are inexpensive approximation * of tertiles. Note, that pivot1 < pivot2. */ - pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); + int[] pivotIndices = partition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot); lower = pivotIndices[0]; upper = pivotIndices[1]; /* @@ -3629,7 +3624,7 @@ && tryMergeRuns(sorter, a, low, size)) { * Use the third of the five sorted elements as the pivot. * This value is inexpensive approximation of the median. 
*/ - pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); + int[] pivotIndices = partition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot); lower = pivotIndices[0]; upper = pivotIndices[1]; @@ -3649,7 +3644,7 @@ && tryMergeRuns(sorter, a, low, size)) { } /** - * Partitions the specified range of an array using the two pivots provided. + * Partitions the specified range of the array using the two pivots provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning @@ -3738,7 +3733,7 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index /** - * Partitions the specified range of an array using a single pivot provided. + * Partitions the specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning From 3e0b8cfcc380d6ff9b0511eb763d7f7a49c541f9 Mon Sep 17 00:00:00 2001 From: Srinivas Vamsi Parasa Date: Mon, 18 Sep 2023 18:52:14 -0700 Subject: [PATCH 37/40] Update DualPivotQuicksort.java --- src/java.base/share/classes/java/util/DualPivotQuicksort.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index 3fa87815596db..f93507ea8709f 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -31,7 +31,6 @@ import jdk.internal.vm.annotation.IntrinsicCandidate; import jdk.internal.vm.annotation.ForceInline; - /** * This class implements powerful and fully optimized versions, both * sequential and parallel, of the Dual-Pivot Quicksort algorithm by @@ -52,8 +51,6 @@ * * @since 1.7 * 14 */ - - final class 
DualPivotQuicksort { /** From b04cb6c3c41c7327f9dc67653e24b08693329e3e Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Wed, 20 Sep 2023 10:11:28 -0700 Subject: [PATCH 38/40] change variable names of indexPivot* to pivotIndex* --- .../classes/java/util/DualPivotQuicksort.java | 92 +++++++++---------- 1 file changed, 43 insertions(+), 49 deletions(-) diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index f93507ea8709f..4675b8f8d9ff3 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -169,10 +169,10 @@ interface PartitionOperation { * @param a the array to be sorted * @param low the index of the first element, inclusive, to be sorted * @param high the index of the last element, exclusive, to be sorted - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot */ - int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2); + int[] partition(A a, int low, int high, int pivotIndex1, int pivotIndex2); } /** @@ -185,14 +185,14 @@ interface PartitionOperation { * address pointing to the first element to partition from. 
* @param low the index of the first element, inclusive, to be sorted * @param high the index of the last element, exclusive, to be sorted - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * @param po the method reference for the fallback implementation */ @IntrinsicCandidate @ForceInline - private static int[] partition(Class elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation po) { - return po.partition(array, low, high, indexPivot1, indexPivot2); + private static int[] partition(Class elemType, A array, long offset, int low, int high, int pivotIndex1, int pivotIndex2, PartitionOperation po) { + return po.partition(array, low, high, pivotIndex1, pivotIndex2); } /** @@ -399,18 +399,18 @@ && tryMergeRuns(sorter, a, low, size)) { * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionDualPivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionDualPivot(int[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e1 = indexPivot1; - int e5 = indexPivot2; + int e1 = pivotIndex1; + int e5 = pivotIndex2; int pivot1 = a[e1]; int pivot2 = a[e5]; @@ -480,25 +480,23 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv return new int[] {lower, upper}; } - - /** * Partitions the 
specified range of the array using a single pivot provided. * * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionSinglePivot(int[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot1; + int e3 = pivotIndex1; int pivot = a[e3]; /* @@ -1202,18 +1200,18 @@ && tryMergeRuns(sorter, a, low, size)) { * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionDualPivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionDualPivot(long[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e1 = indexPivot1; - int e5 = indexPivot2; + int e1 = pivotIndex1; + int e5 = pivotIndex2; long pivot1 = a[e1]; long pivot2 = a[e5]; @@ -1283,25 +1281,24 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi return new int[] {lower, upper}; } - /** * Partitions the specified range of the array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionSinglePivot(long[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot1; + int e3 = pivotIndex1; long pivot = a[e3]; /* @@ -2792,18 +2789,18 @@ && tryMergeRuns(sorter, a, low, size)) { * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionDualPivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionDualPivot(float[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e1 = indexPivot1; - int e5 = indexPivot2; + int e1 = pivotIndex1; + int e5 = pivotIndex2; float pivot1 = a[e1]; float pivot2 = a[e5]; @@ -2873,24 +2870,23 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP return new int[] {lower, upper}; } - /** * Partitions the specified range of the array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionSinglePivot(float[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot1; + int e3 = pivotIndex1; float pivot = a[e3]; /* @@ -3646,18 +3642,18 @@ && tryMergeRuns(sorter, a, low, size)) { * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot * */ @ForceInline - private static int[] partitionDualPivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionDualPivot(double[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e1 = indexPivot1; - int e5 = indexPivot2; + int e1 = pivotIndex1; + int e5 = pivotIndex2; double pivot1 = a[e1]; double pivot2 = a[e5]; @@ -3727,25 +3723,23 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index return new int[] {lower, upper}; } - - /** * Partitions the specified range of the array using a single pivot provided. 
* * @param array the array to be partitioned * @param low the index of the first element, inclusive, for partitioning * @param high the index of the last element, exclusive, for partitioning - * @param indexPivot1 the index of pivot1, the first pivot - * @param indexPivot2 the index of pivot2, the second pivot + * @param pivotIndex1 the index of pivot1, the first pivot + * @param pivotIndex2 the index of pivot2, the second pivot */ @ForceInline - private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) { + private static int[] partitionSinglePivot(double[] a, int low, int high, int pivotIndex1, int pivotIndex2) { int end = high - 1; int lower = low; int upper = end; - int e3 = indexPivot1; + int e3 = pivotIndex1; double pivot = a[e3]; /* From dbf433215121bcfa64e713951d9373607add922e Mon Sep 17 00:00:00 2001 From: vamsi-parasa Date: Fri, 22 Sep 2023 09:39:18 -0700 Subject: [PATCH 39/40] Update CompileThresholdScaling only for the sort and partition intrinsics; update build script to remove nested if --- make/modules/java.base/Lib.gmk | 30 ++++++++++++-------------- test/jdk/java/util/Arrays/Sorting.java | 4 ++-- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 976f5e8e75582..47a41d62f9f34 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -226,7 +226,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true) NAME := fallbackLinker, \ CFLAGS := $(CFLAGS_JDKLIB) $(LIBFFI_CFLAGS), \ LDFLAGS := $(LDFLAGS_JDKLIB) \ - $(call SET_SHARED_LIBRARY_ORIGIN), \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ LIBS := $(LIBFFI_LIBS), \ LIBS_windows := $(LIBFFI_LIBS) ws2_32.lib, \ )) @@ -236,22 +236,20 @@ endif ################################################################################ -ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true) - ifeq ($(TOOLCHAIN_TYPE), gcc) - $(eval $(call 
SetupJdkLibrary, BUILD_LIB_SIMD_SORT, \ - NAME := simdsort, \ - TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ - OPTIMIZATION := HIGH, \ - CFLAGS := $(CFLAGS_JDKLIB), \ - CXXFLAGS := $(CXXFLAGS_JDKLIB), \ - LDFLAGS := $(LDFLAGS_JDKLIB) \ - $(call SET_SHARED_LIBRARY_ORIGIN), \ - LIBS := $(LIBCXX), \ - LIBS_linux := -lc -lm -ldl, \ - )) +ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc) + $(eval $(call SetupJdkLibrary, BUILD_LIB_SIMD_SORT, \ + NAME := simdsort, \ + TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ + OPTIMIZATION := HIGH, \ + CFLAGS := $(CFLAGS_JDKLIB), \ + CXXFLAGS := $(CXXFLAGS_JDKLIB), \ + LDFLAGS := $(LDFLAGS_JDKLIB) \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ + LIBS := $(LIBCXX), \ + LIBS_linux := -lc -lm -ldl, \ + )) - TARGETS += $(BUILD_LIB_SIMD_SORT) - endif + TARGETS += $(BUILD_LIB_SIMD_SORT) endif ################################################################################ diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java index 113c8a688620c..f285b0c65b72c 100644 --- a/test/jdk/java/util/Arrays/Sorting.java +++ b/test/jdk/java/util/Arrays/Sorting.java @@ -26,8 +26,8 @@ * @compile/module=java.base java/util/SortingHelper.java * @bug 6880672 6896573 6899694 6976036 7013585 7018258 8003981 8226297 * @build Sorting - * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySort,_arrayPartition, Sorting -shortrun - * @run main/othervm -XX:CompileThreshold=1 -XX:-TieredCompilation Sorting -shortrun + * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySort,_arrayPartition Sorting -shortrun + * @run main/othervm -XX:-TieredCompilation -XX:CompileCommand=CompileThresholdScaling,java.util.DualPivotQuicksort::sort,0.0001 Sorting -shortrun * @summary Exercise Arrays.sort, Arrays.parallelSort * * @author Vladimir Yaroslavskiy From a5262d8673c3388638f45204057c7127eda87c7d Mon Sep 17 00:00:00 2001 From: 
vamsi-parasa Date: Thu, 5 Oct 2023 16:29:35 -0700 Subject: [PATCH 40/40] fix code style and formatting --- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 16 +- src/hotspot/share/opto/library_call.cpp | 13 +- .../native/libsimdsort/avx512-32bit-qsort.hpp | 2 - .../native/libsimdsort/avx512-64bit-common.h | 2 - .../native/libsimdsort/avx512-64bit-qsort.hpp | 2 - .../native/libsimdsort/avx512-common-qsort.h | 2 - ...t_linux_x86.cpp => avx512-linux-qsort.cpp} | 2 - .../classes/java/util/DualPivotQuicksort.java | 396 +++++++++--------- 8 files changed, 211 insertions(+), 224 deletions(-) rename src/java.base/linux/native/libsimdsort/{avxsort_linux_x86.cpp => avx512-linux-qsort.cpp} (98%) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index c76d5ce064914..79ebef8b58113 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4173,14 +4173,13 @@ void StubGenerator::generate_compiler_stubs() { } // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics - if (UseAVX > 2 && VM_Version::supports_avx512dq()) { - - void *libsimdsort = nullptr; - char ebuf_[1024]; - char dll_name_simd_sort[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) { - libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); - } + if (UseAVX > 2 && VM_Version::supports_avx512dq()) { + void *libsimdsort = nullptr; + char ebuf_[1024]; + char dll_name_simd_sort[JVM_MAXPATHLEN]; + if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) { + libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); + } // Get addresses for avx512 sort and partition routines if (libsimdsort != nullptr) { log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort)); 
@@ -4190,7 +4189,6 @@ void StubGenerator::generate_compiler_stubs() { snprintf(ebuf_, sizeof(ebuf_), "avx512_partition"); StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_); - } } diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index d24fe5dc4495d..4a9d7fb161667 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -5367,9 +5367,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_ //------------------------------inline_array_partition----------------------- bool LibraryCallKit::inline_array_partition() { - address stubAddr = nullptr; - const char *stubName; - stubName = "array_partition_stub"; + const char *stubName = "array_partition_stub"; Node* elementType = null_check(argument(0)); Node* obj = argument(1); @@ -5382,6 +5380,7 @@ bool LibraryCallKit::inline_array_partition() { const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); + address stubAddr = nullptr; stubAddr = StubRoutines::select_array_partition_function(); // stub not loaded if (stubAddr == nullptr) { @@ -5395,10 +5394,9 @@ bool LibraryCallKit::inline_array_partition() { Node* obj_adr = make_unsafe_address(obj, offset); // create the pivotIndices array of type int and size = 2 - Node* pivotIndices = nullptr; Node* size = intcon(2); Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT))); - pivotIndices = new_array(klass_node, size, 0); // no arguments to push + Node* pivotIndices = new_array(klass_node, size, 0); // no arguments to push AllocateArrayNode* alloc = tightly_coupled_allocation(pivotIndices); guarantee(alloc != nullptr, "created above"); Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT)); @@ -5409,7 +5407,8 @@ bool 
LibraryCallKit::inline_array_partition() { // Call the stub make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(), stubAddr, stubName, TypePtr::BOTTOM, - obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2); + obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, + indexPivot1, indexPivot2); if (!stopped()) { set_result(pivotIndices); @@ -5422,7 +5421,6 @@ bool LibraryCallKit::inline_array_partition() { //------------------------------inline_array_sort----------------------- bool LibraryCallKit::inline_array_sort() { - address stubAddr = nullptr; const char *stubName; stubName = "arraysort_stub"; @@ -5435,6 +5433,7 @@ bool LibraryCallKit::inline_array_sort() { const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr(); ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); BasicType bt = elem_type->basic_type(); + address stubAddr = nullptr; stubAddr = StubRoutines::select_arraysort_function(); //stub not loaded if (stubAddr == nullptr) { diff --git a/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp index 15e406a822900..4fbe9b97450c6 100644 --- a/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp +++ b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp @@ -1,8 +1,6 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Copyright (c) 2021 Serge Sans Paille. All rights reserved. - * Intel x86-simd-sort source code. - * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it diff --git a/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h b/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h index bb7553229eacb..9993cd22e6377 100644 --- a/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h +++ b/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h @@ -1,7 +1,5 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. - * Intel x86-simd-sort source code. - * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp index 3028f45a79407..e28ebe19695de 100644 --- a/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp +++ b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp @@ -1,7 +1,5 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. - * Intel x86-simd-sort source code. - * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h index 16aeb0d50a30f..b008bcd54b80c 100644 --- a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h +++ b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h @@ -1,8 +1,6 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Copyright (c) 2021 Serge Sans Paille. All rights reserved. - * Intel x86-simd-sort source code. - * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp similarity index 98% rename from src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp rename to src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp index a4ac2a8e4955f..6bd0c5871d6cb 100644 --- a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp +++ b/src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp @@ -1,7 +1,5 @@ /* * Copyright (c) 2023 Intel Corporation. All rights reserved. - * Intel x86-simd-sort source code. - * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java index 4675b8f8d9ff3..0dd4b6e354aed 100644 --- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java +++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java @@ -415,40 +415,40 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int pivotInd int pivot2 = a[e5]; /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. - */ + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ a[e1] = a[lower]; a[e5] = a[upper]; /* - * Skip elements, which are less or greater than the pivots. - */ + * Skip elements, which are less or greater than the pivots. 
+ */ while (a[++lower] < pivot1); while (a[--upper] > pivot2); /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ for (int unused = --lower, k = ++upper; --k > lower; ) { int ak = a[k]; @@ -500,33 +500,33 @@ private static int[] partitionSinglePivot(int[] a, int low, int high, int pivotI int pivot = a[e3]; /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. - */ + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ a[e3] = a[lower]; /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? 
| == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ? | == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ for (int k = ++upper; --k > lower; ) { int ak = a[k]; @@ -547,8 +547,8 @@ private static int[] partitionSinglePivot(int[] a, int low, int high, int pivotI } /* - * Swap the pivot into its final position. - */ + * Swap the pivot into its final position. + */ a[low] = a[lower]; a[lower] = pivot; return new int[] {lower, upper}; } @@ -1216,40 +1216,40 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int pivotIn long pivot2 = a[e5]; /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. - */ + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ a[e1] = a[lower]; a[e5] = a[upper]; /* - * Skip elements, which are less or greater than the pivots. - */ + * Skip elements, which are less or greater than the pivots. 
+ */ while (a[++lower] < pivot1); while (a[--upper] > pivot2); /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ for (int unused = --lower, k = ++upper; --k > lower; ) { long ak = a[k]; @@ -1302,33 +1302,33 @@ private static int[] partitionSinglePivot(long[] a, int low, int high, int pivot long pivot = a[e3]; /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. - */ + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ a[e3] = a[lower]; /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? 
| == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ? | == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ for (int k = ++upper; --k > lower; ) { long ak = a[k]; @@ -1349,8 +1349,8 @@ private static int[] partitionSinglePivot(long[] a, int low, int high, int pivot } /* - * Swap the pivot into its final position. - */ + * Swap the pivot into its final position. + */ a[low] = a[lower]; a[lower] = pivot; return new int[] {lower, upper}; } @@ -2805,40 +2805,40 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int pivotI float pivot2 = a[e5]; /* - * The first and the last elements to be sorted are moved - * to the locations formerly occupied by the pivots. When - * partitioning is completed, the pivots are swapped back - * into their final positions, and excluded from the next - * subsequent sorting. - */ + * The first and the last elements to be sorted are moved + * to the locations formerly occupied by the pivots. When + * partitioning is completed, the pivots are swapped back + * into their final positions, and excluded from the next + * subsequent sorting. + */ a[e1] = a[lower]; a[e5] = a[upper]; /* - * Skip elements, which are less or greater than the pivots. - */ - while (a[++lower] < pivot1); - while (a[--upper] > pivot2); + * Skip elements, which are less or greater than the pivots. 
+ */ + while (a[++lower] < pivot1); + while (a[--upper] > pivot2); /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ for (int unused = --lower, k = ++upper; --k > lower; ) { float ak = a[k]; @@ -2890,33 +2890,33 @@ private static int[] partitionSinglePivot(float[] a, int low, int high, int pivo float pivot = a[e3]; /* - * The first element to be sorted is moved to the - * location formerly occupied by the pivot. After - * completion of partitioning the pivot is swapped - * back into its final position, and excluded from - * the next subsequent sorting. - */ + * The first element to be sorted is moved to the + * location formerly occupied by the pivot. After + * completion of partitioning the pivot is swapped + * back into its final position, and excluded from + * the next subsequent sorting. + */ a[e3] = a[lower]; /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? 
| == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ? | == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ for (int k = ++upper; --k > lower; ) { float ak = a[k]; @@ -2937,8 +2937,8 @@ private static int[] partitionSinglePivot(float[] a, int low, int high, int pivo } /* - * Swap the pivot into its final position. - */ + * Swap the pivot into its final position. + */ a[low] = a[lower]; a[lower] = pivot; return new int[] {lower, upper}; } @@ -3670,28 +3670,28 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int pivot /* * Skip elements, which are less or greater than the pivots. */ - while (a[++lower] < pivot1); - while (a[--upper] > pivot2); + while (a[++lower] < pivot1); + while (a[--upper] > pivot2); /* - * Backward 3-interval partitioning - * - * left part central part right part - * +------------------------------------------------------------+ - * | < pivot1 | ? 
| pivot1 <= && <= pivot2 | > pivot2 | - * +------------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot1 - * pivot1 <= all in (k, upper) <= pivot2 - * all in [upper, end) > pivot2 - * - * Pointer k is the last index of ?-part - */ + * Backward 3-interval partitioning + * + * left part central part right part + * +------------------------------------------------------------+ + * | < pivot1 | ? | pivot1 <= && <= pivot2 | > pivot2 | + * +------------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot1 + * pivot1 <= all in (k, upper) <= pivot2 + * all in [upper, end) > pivot2 + * + * Pointer k is the last index of ?-part + */ for (int unused = --lower, k = ++upper; --k > lower; ) { double ak = a[k]; @@ -3752,24 +3752,24 @@ private static int[] partitionSinglePivot(double[] a, int low, int high, int piv a[e3] = a[lower]; /* - * Traditional 3-way (Dutch National Flag) partitioning - * - * left part central part right part - * +------------------------------------------------------+ - * | < pivot | ? | == pivot | > pivot | - * +------------------------------------------------------+ - * ^ ^ ^ - * | | | - * lower k upper - * - * Invariants: - * - * all in (low, lower] < pivot - * all in (k, upper) == pivot - * all in [upper, end] > pivot - * - * Pointer k is the last index of ?-part - */ + * Traditional 3-way (Dutch National Flag) partitioning + * + * left part central part right part + * +------------------------------------------------------+ + * | < pivot | ? 
| == pivot | > pivot | + * +------------------------------------------------------+ + * ^ ^ ^ + * | | | + * lower k upper + * + * Invariants: + * + * all in (low, lower] < pivot + * all in (k, upper) == pivot + * all in [upper, end] > pivot + * + * Pointer k is the last index of ?-part + */ for (int k = ++upper; --k > lower; ) { double ak = a[k]; @@ -3790,8 +3790,8 @@ private static int[] partitionSinglePivot(double[] a, int low, int high, int piv } /* - * Swap the pivot into its final position. - */ + * Swap the pivot into its final position. + */ a[low] = a[lower]; a[lower] = pivot; return new int[] {lower, upper}; }